In [23]:
from zipfile import ZipFile
# Archive uploaded to the Colab session.
file_name = '/content/Text.zip'

# Unpack every member into the current working directory; the context
# manager closes the archive before the confirmation is printed.
with ZipFile(file_name, mode='r') as archive:
    archive.extractall()
print('Done')

Done


In [24]:
!pip -q install sentence-transformers faiss-cpu tqdm

In [25]:
# Imported here because this is the first cell that needs Path; without it a
# fresh "Restart & Run All" fails with NameError on the next line.
from pathlib import Path

# Root folder of the extracted corpus.
DATA_DIR = Path("/content/Dataset/Sherlock Holmes")

# Recursively collect every .txt file, sorted for a stable order.
txt_files = sorted(DATA_DIR.rglob("*.txt"))
print(len(txt_files), "text files found")
print(txt_files[:5])

60 text files found
[PosixPath('/content/Dataset/Sherlock Holmes/Books/A Study in Scarlet.txt'), PosixPath('/content/Dataset/Sherlock Holmes/Books/The Hound of Baskervilles.txt'), PosixPath('/content/Dataset/Sherlock Holmes/Books/The Sign of Four.txt'), PosixPath('/content/Dataset/Sherlock Holmes/Books/The Valley of Fear.txt'), PosixPath('/content/Dataset/Sherlock Holmes/Short Stories/His Last Bow/01 The Adventure of Wisteria Lodge.txt')]


In [26]:
from pathlib import Path
import nltk
from nltk.tokenize import sent_tokenize

# Tokenizer data required by sent_tokenize.
nltk.download('punkt_tab', quiet=True)

data_dir = Path("/content/Dataset/Sherlock Holmes")
# Titles (file stems) that are tagged 'novel'; everything else is a 'story'.
novels = ['A Study in Scarlet', 'The Hound of Baskervilles', 'The Sign of Four', 'The Valley of Fear']

# One record per sentence, carrying the metadata of the file it came from.
docs = []
for txt_file in sorted(data_dir.rglob("*.txt")):
    try:
        text = txt_file.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        # Files that are not valid UTF-8 are reported and skipped.
        print(f"Can't read {txt_file}")
        continue

    title = txt_file.stem
    doc_type = 'novel' if title in novels else 'story'
    folder = txt_file.parent.name

    # Keep only stripped sentences longer than 15 characters.
    stripped_sentences = (raw.strip() for raw in sent_tokenize(text))
    docs.extend(
        {'text': sentence, 'title': title, 'type': doc_type, 'folder': folder}
        for sentence in stripped_sentences
        if len(sentence) > 15
    )

print(f"Got {len(docs)} sentences")

Got 35072 sentences


In [5]:
!pip install sentence-transformers pinecone nltk

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting packaging>=20.9 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Downloading packaging-24.2-py

**Generate embeddings**

In [27]:
from sentence_transformers import SentenceTransformer
# Sentence encoder used for both the corpus and, later, the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Tag every record with an id built from its title and its position in docs.
for position, entry in enumerate(docs):
    entry['id'] = f"{entry['title']}_{position}"

# Encode all sentence texts in one call; rows line up with docs by position.
sentence_texts = [entry['text'] for entry in docs]
embeddings = model.encode(sentence_texts, show_progress_bar=True, batch_size=32)
print(f"Generated {len(embeddings)} embeddings.")


Batches:   0%|          | 0/1096 [00:00<?, ?it/s]

Generated 35072 embeddings.


**Index in Pinecone**

In [None]:
import pinecone
from pinecone import ServerlessSpec
from google.colab import userdata
import re

# Read the key from Colab's secret store so it never appears in the notebook.
API_KEY = userdata.get('PINECONE_API_KEY')

pc = pinecone.Pinecone(api_key=API_KEY)
index_name = 'sherlock-semantic-search'
dimension = 384  # must equal the width of the vectors in `embeddings`

# zip() below silently drops trailing items when its inputs differ in length
# (e.g. `docs` rebuilt without re-encoding), so fail loudly before touching
# the remote index.
if len(embeddings) != len(docs):
    raise ValueError(
        f"docs/embeddings length mismatch: {len(docs)} docs vs {len(embeddings)} embeddings"
    )
if len(embeddings) and len(embeddings[0]) != dimension:
    raise ValueError(
        f"embedding width {len(embeddings[0])} does not match index dimension {dimension}"
    )

# Start from an empty index so a re-run never mixes old and new vectors.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

pc.create_index(
    name=index_name,
    dimension=dimension,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)

index = pc.Index(index_name)
# 100 vectors per request instead of 10: same stored data, a tenth of the
# network round trips.
batch_size = 100

for i in range(0, len(docs), batch_size):
    batch_docs = docs[i:i + batch_size]
    batch_embeds = embeddings[i:i + batch_size]
    upserts = []

    # `position` is the record's global position in docs, which keeps ids unique.
    for position, (doc, emb) in enumerate(zip(batch_docs, batch_embeds), start=i):
        unique_id = f"{doc['title']}_{position}"
        # Replace each run of non-ASCII characters (e.g. curly apostrophes in
        # titles) with '_' — presumably because Pinecone ids must be ASCII.
        ascii_id = re.sub(r'[^\x00-\x7F]+', '_', unique_id)

        upserts.append({
            'id': ascii_id,
            'values': emb.tolist(),
            'metadata': {
                'title': doc['title'],
                'type': doc['type'],
                'folder': doc['folder']
            }
        })
    index.upsert(vectors=upserts)

print("Indexing complete!")

**Semantic search function**

In [22]:
def search_docs(query, k=5, oversample=2):
    """Return the best-scoring titles for a free-text query.

    The query is embedded with the notebook-level ``model`` and looked up in
    the notebook-level ``index``. Hits that share a title are collapsed to a
    single entry that keeps the highest score seen for that title, which is
    why more than ``k`` hits are requested up front.

    Args:
        query: Text to search for.
        k: Maximum number of titles to return.
        oversample: Multiplier applied to ``k`` to get the number of raw hits
            requested from the index. The default of 2 matches the previous
            hard-coded behaviour.

    Returns:
        A list of ``(title, (folder, score))`` tuples sorted by descending
        score and at most ``k`` long. It can be shorter than ``k`` when the
        fetched hits share titles.
    """
    vec = model.encode(query).tolist()
    results = index.query(vector=vec, top_k=k * oversample, include_metadata=True)

    # title -> (folder, best score seen so far)
    best_by_title = {}
    for match in results['matches']:
        metadata = match['metadata']
        title = metadata['title']
        score = match['score']
        if title not in best_by_title or score > best_by_title[title][1]:
            best_by_title[title] = (metadata['folder'], score)

    ranked = sorted(best_by_title.items(), key=lambda item: item[1][1], reverse=True)
    return ranked[:k]

# Smoke test: run one query through search_docs and print the ranked titles.
search_query = "mystery novel with detective"
matches = search_docs(search_query)

if matches:
    print(f"Found {len(matches)} relevant documents:")
    # search_docs yields (title, (folder, score)) pairs, hence the nested unpack.
    for doc_title, (location, similarity) in matches:
        print(f"  • {doc_title}")
        if location:
            print(f"    Location: {location}")
        print(f"    Similarity: {similarity:.3f}\n")
else:
    print("No relevant documents found.")



Found 5 relevant documents:
  • The Valley of Fear
    Location: Books
    Similarity: 0.594

  • A Study in Scarlet
    Location: Books
    Similarity: 0.576

  • 08 The Adventure of the Creeping Man
    Location: The Case-book of Sherlock Holmes
    Similarity: 0.525

  • 07 The Reigate Squires
    Location: The Memoirs of Sherlock Holmes
    Similarity: 0.520

  • 04 The Stockbroker’s Clerk
    Location: The Memoirs of Sherlock Holmes
    Similarity: 0.498

