In [1]:
pip install faiss-cpu sentence-transformers rank-bm25 tqdm nltk sqlite3 pickle5


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp39-cp39-win_amd64.whl (13.7 MB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement sqlite3 (from versions: none)
ERROR: No matching distribution found for sqlite3


In [3]:
import sqlite3
import numpy as np
import faiss
import pickle
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
nltk.download('punkt')

# Load Sentence Transformer Model for Tamil-English
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# ✅ Step 1: Load Preprocessed Text Chunks from SQLite
def load_text_chunks():
    """Load all preprocessed text chunks from SQLite."""
    conn = sqlite3.connect("processed_texts_1.db")
    cursor = conn.cursor()
    cursor.execute("SELECT content FROM texts;")
    text_chunks = [row[0] for row in cursor.fetchall()]
    conn.close()
    return text_chunks

text_chunks = load_text_chunks()
print(f"✅ Loaded {len(text_chunks)} text chunks from database.")

# ✅ Step 2: Convert Text Chunks into Embeddings
print("🔄 Generating embeddings for text chunks...")
embeddings = model.encode(text_chunks, convert_to_numpy=True)
print(f"✅ Generated {len(embeddings)} embeddings of size {embeddings.shape[1]}.")

# ✅ Step 3: Index Embeddings Using FAISS
embedding_size = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_size)
index.add(embeddings)

# Save FAISS index to disk
faiss.write_index(index, "faiss_index.bin")
print("✅ FAISS index created and saved.")

# ✅ Step 4: Implement BM25 for Keyword Search
print("🔄 Building BM25 index...")
tokenized_corpus = [word_tokenize(text.lower()) for text in text_chunks]
bm25 = BM25Okapi(tokenized_corpus)
print("✅ BM25 keyword search index created.")

# ✅ Step 5: Implement Hybrid Search (BM25 + FAISS)
def hybrid_search(query, top_n=5):
    """Retrieve results using both FAISS and BM25 for best accuracy."""
    
    # Convert query into embedding
    query_embedding = model.encode([query], convert_to_numpy=True)

    # FAISS semantic search
    D, I = index.search(query_embedding, top_n)
    faiss_results = [text_chunks[i] for i in I[0]]

    # BM25 keyword search
    tokenized_query = word_tokenize(query.lower())
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[-top_n:][::-1]
    bm25_results = [text_chunks[i] for i in bm25_top_indices]

    # Merge results
    combined_results = list(set(faiss_results + bm25_results))

    return combined_results[:top_n]

# ✅ Step 6: Test Retrieval System
query = "தமிழக தொல்லியல் ஆய்வுகள்"  # Example Tamil query
results = hybrid_search(query, top_n=3)

print("\n🔍 Query Results:")
for i, result in enumerate(results, 1):
    print(f"{i}. {result[:200]}...")  # Print first 200 characters

# ✅ Step 7: Save FAISS & BM25 Models for Future Use
with open("bm25_model.pkl", "wb") as f:
    pickle.dump(bm25, f)

with open("text_chunks.pkl", "wb") as f:
    pickle.dump(text_chunks, f)

print("✅ FAISS, BM25, and text data saved.")

# ✅ Step 8: Function to Load Models in Future
def load_models():
    """Load FAISS index, BM25 model, and text chunks."""
    index = faiss.read_index("faiss_index.bin")

    with open("bm25_model.pkl", "rb") as f:
        bm25 = pickle.load(f)

    with open("text_chunks.pkl", "rb") as f:
        text_chunks = pickle.load(f)

    print("✅ FAISS & BM25 models loaded.")
    return index, bm25, text_chunks


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


model.safetensors:  58%|#####7    | 273M/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Loaded 28125 text chunks from database.
🔄 Generating embeddings for text chunks...
✅ Generated 28125 embeddings of size 384.
✅ FAISS index created and saved.
🔄 Building BM25 index...
✅ BM25 keyword search index created.

🔍 Query Results:
1. EARS ASSIS POST PREP SEL PSE SEL EAE தத SiLteegse Seep eee er pe PRIESTESS OS லய கதத SESS T ESTERS SS es eton terete lteter tires sever teass soe te tees a PPIT ELISE LOSES TNS ரதத par os தத LOSS SSIS...
2. நப ம ய LEMAR பப பமப மமம அமவ ஸ றகள க த க பங she ந ந ககக acts ட ந பயககக [ற ன த a னனர TA லவவ கம ம i 2 பங ந ப வ ர a ள he vs a 3 ay 3 ne ca my x pula பமடம த ள oat ra bi ie eae ங ப Besson ய தய 4" a rina eed...
3. ஸர வரதபடடர ஆணட வக எஞச கட[வப] பதவ[கக 2. ர தரவடடர வககக ப]டரய வடததடனன ஊ 3. 1] மல கறனன மனகட வர இலரக 94 த. ந. ௮. தலலயலதற தடர எண 509 2004 மவடடம கனனயககமர ஆடச ஆணட Soo வடடம கலகளம வரலறற ஆணட கலலம 919 க.ப. 1744 ஊ...
✅ FAISS, BM25, and text data saved.


In [2]:
pip install faiss-cpu


Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp39-cp39-win_amd64.whl (13.7 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Note: you may need to restart the kernel to use updated packages.




In [2]:
pip install sentence-transformers


Collecting sentence-transformers
  Using cached sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Collecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0.2-cp39-cp39-win_amd64.whl (162 kB)
Collecting fsspec>=2023.5.0
  Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
Collecting safetensors>=0.4.1
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: pyyaml, fsspec, huggingface-hub, tokenizers, safetensors, transformers, sentence-transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
  Attempting uninstall: fsspec
    Found existing installation: f

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.4 requires pathlib, which is not installed.
chatterbot-corpus 1.2.0 requires PyYAML<4.0,>=3.12, but you have pyyaml 6.0.2 which is incompatible.


In [2]:
pip install rank-bm25


Collecting rank-bm25Note: you may need to restart the kernel to use updated packages.
  Using cached rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2





In [1]:
import sqlite3
import numpy as np
import faiss
import pickle
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk

# Ensure required NLTK data is available
nltk.download("punkt")

# ✅ Load Pre-trained Multilingual Embedding Model
print("🔄 Loading Sentence Transformer Model...")
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# ✅ Step 1: Load Text Data from SQLite
def load_text_chunks(db_path="processed_texts_final_3.db"):
    """Load preprocessed text chunks from SQLite database."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT content FROM texts")
    text_chunks = [row[0] for row in cursor.fetchall()]
    conn.close()
    return text_chunks

print("✅ Loading text chunks from database...")
text_chunks = load_text_chunks()
print(f"✅ Loaded {len(text_chunks)} text chunks from database.")

# ✅ Step 2: Convert Text to Vector Embeddings
print("🔄 Generating embeddings for text chunks...")
embeddings = model.encode(text_chunks, convert_to_numpy=True, show_progress_bar=True)
print(f"✅ Generated {len(embeddings)} embeddings of size {embeddings.shape[1]}.")

# ✅ Step 3: Build FAISS Index
print("🔄 Creating FAISS index...")
embedding_dim = embeddings.shape[1]  # Get the embedding dimension (384)
index = faiss.IndexFlatL2(embedding_dim)  # L2 (Euclidean) distance-based index
index.add(embeddings)  # Add embeddings to FAISS index
faiss.write_index(index, "faiss_index.bin")  # Save FAISS index
print("✅ FAISS index created and saved.")

# ✅ Step 4: Build BM25 Index for Keyword Search
print("🔄 Building BM25 index...")
tokenized_corpus = [word_tokenize(text.lower()) for text in text_chunks]
bm25 = BM25Okapi(tokenized_corpus)  # Create BM25 index
with open("bm25_index.pkl", "wb") as f:
    pickle.dump(bm25, f)  # Save BM25 model
print("✅ BM25 keyword search index created.")

# ✅ Save Processed Text Chunks for Retrieval
with open("text_chunks.pkl", "wb") as f:
    pickle.dump(text_chunks, f)

print("✅ FAISS, BM25, and text data saved.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔄 Loading Sentence Transformer Model...
✅ Loading text chunks from database...
✅ Loaded 25184 text chunks from database.
🔄 Generating embeddings for text chunks...


Batches:   0%|          | 0/787 [00:00<?, ?it/s]

✅ Generated 25184 embeddings of size 384.
🔄 Creating FAISS index...
✅ FAISS index created and saved.
🔄 Building BM25 index...
✅ BM25 keyword search index created.
✅ FAISS, BM25, and text data saved.


In [2]:
import os

faiss_index_path = "faiss_index.bin"

# Check if the file exists and its size
if os.path.exists(faiss_index_path):
    print(f"✅ File exists: {faiss_index_path}")
    print(f"📦 File size: {os.path.getsize(faiss_index_path)} bytes")
else:
    print(f"❌ FAISS index file not found!")


✅ File exists: faiss_index.bin
📦 File size: 38682669 bytes


In [3]:
import faiss

faiss_index_path = "faiss_index.bin"

try:
    index = faiss.read_index(faiss_index_path)
    print(f"✅ FAISS index successfully loaded with {index.ntotal} vectors")
except Exception as e:
    print(f"❌ Error loading FAISS index: {e}")


✅ FAISS index successfully loaded with 25184 vectors


In [4]:
import numpy as np

# Example: Create a dummy query vector (replace with an actual embedding later)
query_vector = np.random.rand(1, index.d).astype(np.float32)

# Perform FAISS search
D, I = index.search(query_vector, 5)  # Retrieve top 5 results
print(f"🔍 Top 5 retrieved indexes: {I}")
print(f"🔢 Distances: {D}")


🔍 Top 5 retrieved indexes: [[14153  1186  1193  8366 18918]]
🔢 Distances: [[117.71161  117.820435 118.11324  118.19989  118.267395]]


In [5]:
import pickle
import numpy as np

# Load BM25 model
with open("bm25_index.pkl", "rb") as f:
    bm25 = pickle.load(f)

# Example query
query_text = "ancient Tamil inscriptions on Stone Pillar"
tokenized_query = query_text.lower().split()

# Perform BM25 search
bm25_scores = bm25.get_scores(tokenized_query)
top_bm25_results = np.argsort(bm25_scores)[::-1][:5]

print(f"🔍 Top 5 BM25 results: {top_bm25_results}")
print(f"🔢 BM25 Scores: {[bm25_scores[i] for i in top_bm25_results]}")


🔍 Top 5 BM25 results: [8736 1238  907 5783 5798]
🔢 BM25 Scores: [24.456559612229086, 19.928549824957127, 18.59563451309987, 18.51387805492081, 18.51387805492081]


In [6]:
import numpy as np

def hybrid_search(query_embedding, query_text, faiss_weight=0.7, bm25_weight=0.3):
    """Combine FAISS (semantic) + BM25 (keyword) search"""

    # FAISS search
    D, faiss_results = index.search(query_embedding, 5)
    
    # BM25 search
    tokenized_query = query_text.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_results = np.argsort(bm25_scores)[::-1][:5]

    # Combine results (weighted sum)
    hybrid_scores = {}
    
    for i, idx in enumerate(faiss_results[0]):
        hybrid_scores[idx] = faiss_weight * (1 / (D[0][i] + 1e-5))  # Avoid division by zero

    for i, idx in enumerate(bm25_results):
        if idx in hybrid_scores:
            hybrid_scores[idx] += bm25_weight * bm25_scores[idx]
        else:
            hybrid_scores[idx] = bm25_weight * bm25_scores[idx]

    # Sort by final scores
    sorted_results = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    
    return [r[0] for r in sorted_results[:5]]

# Example hybrid search
query_vector = np.random.rand(1, index.d).astype(np.float32)  # Replace with real query embedding
query_text = "ancient Tamil inscriptions on Stone Pillar"

top_hybrid_results = hybrid_search(query_vector, query_text)
print(f"🔍 Hybrid Search Top Results: {top_hybrid_results}")


🔍 Hybrid Search Top Results: [8736, 1238, 907, 5783, 5798]


In [20]:
import torch
import pickle
import numpy as np
import faiss
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load FAISS, BM25, and Text Chunks
with open("bm25_index.pkl", "rb") as f:
    bm25 = pickle.load(f)

with open("text_chunks.pkl", "rb") as f:
    text_chunks = pickle.load(f)

faiss_index = faiss.read_index("faiss_index.bin")

# Load SentenceTransformer (MiniLM multilingual)
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Load Flan-T5 for reranking and summarization (base, good on CPU)
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", resume_download=True)

# Hybrid Retriever (FAISS + BM25)
def hybrid_search(query_text, top_k=10, faiss_weight=0.7, bm25_weight=0.3):
    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True)

    # FAISS Search
    D, faiss_indices = faiss_index.search(query_embedding, top_k)
    faiss_scores = 1 / (D[0] + 1e-5)  # Similarity from L2 distance

    # BM25 Search
    tokenized_query = word_tokenize(query_text.lower())
    bm25_scores_all = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores_all)[::-1][:top_k]
    
    # Combine scores
    hybrid_scores = {}
    for i, idx in enumerate(faiss_indices[0]):
        hybrid_scores[idx] = faiss_weight * faiss_scores[i]

    for idx in bm25_top_indices:
        hybrid_scores[idx] = hybrid_scores.get(idx, 0) + bm25_weight * bm25_scores_all[idx]

    # Sort and return top-k
    top_indices = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    return [(idx, text_chunks[idx]) for idx, _ in top_indices[:top_k]]

# Rerank Results using Flan-T5
def rerank_with_t5(query, candidates, top_n=5):
    rerank_inputs = [
        f"Query: {query}\nPassage: {text}" for _, text in candidates
    ]
    
    scores = []
    for input_text in rerank_inputs:
        inputs = t5_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to("cpu")
        with torch.no_grad():
            outputs = t5_model.generate(**inputs, max_length=5)
        score_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        score = 1 if "yes" in score_text.lower() else 0
        scores.append(score)

    # Sort candidates by scores
    reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    return [text for (idx, text), _ in reranked[:top_n]]

# Summarize Top Passages
def summarize_passages(passages):
    combined_text = " ".join(passages)
    input_text = f"summarize: {combined_text}"
    inputs = t5_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to("cpu")

    with torch.no_grad():
        summary_ids = t5_model.generate(**inputs, max_length=100)
    
    summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Run End-to-End Search with Summary
def run_search(query):
    print(f"Query: {query}\n")

    hybrid_results = hybrid_search(query, top_k=10)
    reranked = rerank_with_t5(query, hybrid_results, top_n=3)

    for i, passage in enumerate(reranked, 1):
        print(f"{i}. {passage}\n")

    summary = summarize_passages(reranked)
    print(f"Summary:\n{summary}")



In [21]:
run_search("தமிழகத்தில் உள்ள பழமையான கல்வெட்டுகள்")
run_search("Ancient inscriptions in Tamil Nadu")

🔍 Query: தமிழகத்தில் உள்ள பழமையான கல்வெட்டுகள்

1. . தமிழகத்தில் மட்டும் சுமார் 25,000 கல்வெட்டுகள் . வரலாற்றுக்கு தேவையான அடிப்படைச் சான்றுகளை இக்கல்வெட்டுகளின் அருமை காரணமாக அவற்றைப்படிக்க வேண்டி " கல்வெட்டு முனைப்புத் திட்டம் ' சிறப்புத்திட்டத்தின் கீழ் 2004 செப்டம்பர் மாதம் தொடங்கி கல்வெட்டுகள் படியெடுக்கப்பட்டு வருகின்றன . இக்கல்வெட்டு முனைப்புத் திட்டத்தின் வாயிலாக ஆகஸ்ட் 2009 வரை 14,531 கலீவெட்டுகளும் ஏற்கனவே படியெடுத்த 7,833 கல்வெட்டுகளும் ஆக மொத்தம் 22,364 கல்வெட்டுகளும் இதுவரை இத்துறையில் படியெடுக்கப்பட்டுள்ளன . இத்துறையின் ஓய்வு பெற்ற இயக்குநர்கள் திரு . நடன . காசிநாதன் மற்றும் திரு . கு . தாமோதரன் செப்பேடுகள் , பிராமி கல்வெட்டுகள் நடுகல் கல்வெட்டுகள் , பாடல் கல்வெட்டுகள் தமிழ் கல்வெட்டுகளில் முக்கியமானவற்றைத் தொகுத்து உரிய விளக்கவுரையுடன் சிறப்புற எழுதியுள்ளனர் . , " கல்வெட்டு அறிமுகம் . இந்நூல் வரலாற்று ஆய்வு மாணவர்கள் வரலாற்று ஆர்வலர்களிடையே உள்ள வரவேற்புக்கிணங்க மறுபதிப்பு சிறப்புடன்

2. ஊன்றியுமுன்ளனர் . வலக்கை தர்சினி முத்திரையிலும் , இடக்கை கதையின் மீதும் . இதன் காலம்