<a href="https://colab.research.google.com/github/Srabontideb/Explainable-Dual-stage-Rag/blob/main/Dense_Retriever_(Contriever)_WITH_FASIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DRIVE MOUNT**

In [None]:
from google.colab import drive

# Mount Google Drive; later cells read the dataset CSV from, and write FAISS
# index files to, paths under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# **Setup and Install Dependencies**

In [None]:
!pip install transformers faiss-cpu

# **Load Contriever Model**

In [None]:
!pip install pillow==10.2.0

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Tokenizer and encoder are both loaded from the same Hugging Face checkpoint.
contriever_checkpoint = "facebook/contriever"
tokenizer = AutoTokenizer.from_pretrained(contriever_checkpoint)
# .eval() switches the module to inference mode and returns the same module.
model = AutoModel.from_pretrained(contriever_checkpoint).eval()

def embed_text(texts):
    """Embed one string or a list of strings with the module-level Contriever model.

    Sentence vectors are the attention-mask-weighted mean of the last hidden
    states (mean pooling). This is the pooling shown in the facebook/contriever
    model card; taking only the first-token ([CLS]) vector does not give the
    representation the model card uses.

    NOTE: vectors produced here are only comparable with vectors that were
    pooled the same way. Any stored index built from first-token embeddings
    must be rebuilt before it is searched with these vectors.

    Args:
        texts: a string or list of strings accepted by ``tokenizer``.

    Returns:
        numpy array with one row per input text.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
        # Zero out padding positions so they do not contribute to the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = summed / token_counts
    return embeddings.cpu().numpy()

# **Load libraries**

In [None]:
import pandas as pd
import torch
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Dataset: read the CSV from the mounted Drive and keep the non-null values of
# the "response_clean" column as the retrieval corpus (a plain Python list).
csv_path = (
    "/content/drive/MyDrive/Colab Notebooks/"
    "cleaned_and_tokenized_and_entity-defined_healthcaremagic.csv"
)
df = pd.read_csv(csv_path)
response_column = df["response_clean"]
corpus = response_column.dropna().tolist()

# Encoder: tokenizer and model come from the same Contriever checkpoint.
contriever_checkpoint = "facebook/contriever"
tokenizer = AutoTokenizer.from_pretrained(contriever_checkpoint)
model = AutoModel.from_pretrained(contriever_checkpoint)
# The model is never moved to another device in this cell, so it stays on CPU.
model.eval()

# **Embed Corpus + Create FAISS Index**

In [None]:
import os
import faiss
import numpy as np
from tqdm import tqdm

# Directory under the mounted Drive where the per-chunk FAISS index files go.
save_dir = "/content/drive/MyDrive/faiss_chunks"
os.makedirs(name=save_dir, exist_ok=True)  # no error if it already exists

# ✅ Define embedding function if not defined yet
def embed_texts(texts, batch_size=32):
    """Embed a list of strings with the module-level Contriever model.

    Texts are tokenized and encoded ``batch_size`` at a time. Each sentence
    vector is the attention-mask-weighted mean of the last hidden states (mean
    pooling). This is the pooling shown in the facebook/contriever model card;
    taking only the first-token ([CLS]) vector does not give the
    representation the model card uses.

    NOTE: vectors produced here are only comparable with vectors that were
    pooled the same way. FAISS indexes built from first-token embeddings must
    be rebuilt before they are searched with these vectors.

    Args:
        texts: list of strings to embed.
        batch_size: number of texts per forward pass.

    Returns:
        numpy array with one row per input text; an empty
        ``(0, hidden_size)`` array when ``texts`` is empty.
    """
    if len(texts) == 0:
        # np.vstack([]) raises ValueError, so return an empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding chunk"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            token_embeddings = model(**inputs).last_hidden_state
            # Zero out padding positions so they do not contribute to the mean.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            output = summed / mask.sum(dim=1).clamp(min=1)
        embeddings.append(output.cpu().numpy())
    return np.vstack(embeddings)


# **RUN MODEL IN 10K CHUNKS (STARTING FROM INDEX 0)**

In [None]:
import os

# ✅ Parameters
chunk_size = 10000
embedding_dim = 768  # For Contriever

# Embed the corpus in consecutive chunks of `chunk_size` texts and save one
# FAISS index file per chunk. A chunk whose index file already exists is
# skipped, so re-running this cell resumes instead of re-embedding everything.
for i in range(0, len(corpus), chunk_size):
    chunk_texts = corpus[i:i + chunk_size]
    chunk_end = i + len(chunk_texts)
    chunk_path = f"{save_dir}/faiss_chunk_{i}_{chunk_end}.faiss"

    if os.path.exists(chunk_path):
        print(f"⏩ Skipping existing chunk → {chunk_path}")
        continue

    print(f"🔹 Processing chunk {i} to {chunk_end}")

    # Embed chunk
    chunk_embeddings = embed_texts(chunk_texts)

    # Create new FAISS index for this chunk
    # NOTE(review): IndexFlatL2 ranks by L2 distance, while the Contriever
    # model card scores pairs with a dot product — confirm L2 is intended.
    chunk_index = faiss.IndexFlatL2(embedding_dim)
    chunk_index.add(chunk_embeddings)

    # Save index chunk: write under a temporary name and rename afterwards, so
    # an interrupted write never leaves a truncated file under the final name
    # (which the skip check above would otherwise treat as complete).
    tmp_path = chunk_path + ".tmp"
    faiss.write_index(chunk_index, tmp_path)
    os.replace(tmp_path, chunk_path)
    print(f"✅ Saved chunk index → {chunk_path}")


# **RUN MODEL FOR ANY NEXT 10K (JUST CHANGE THE STARTING INDEX)**

In [None]:
import os

# ✅ Parameters
chunk_size = 10000
embedding_dim = 768  # For Contriever
save_dir = "/content/drive/MyDrive/faiss_chunks"  # Your save location

# ✅ Manually set where to start
# Use the END index of the last saved chunk, not "+ 1": chunk files are named
# faiss_chunk_<start>_<end>.faiss with an exclusive end, so after
# faiss_chunk_0_10000.faiss the next start is 10000.
start_idx = 10000
end_idx = len(corpus)  # or set manually if testing

# Embed corpus[start_idx:end_idx] in chunks of `chunk_size` texts and save one
# FAISS index file per chunk. A chunk whose index file already exists is
# skipped, so re-running this cell does not re-embed finished chunks.
for i in range(start_idx, end_idx, chunk_size):
    chunk_texts = corpus[i:i + chunk_size]
    chunk_end = i + len(chunk_texts)
    chunk_path = f"{save_dir}/faiss_chunk_{i}_{chunk_end}.faiss"

    if os.path.exists(chunk_path):
        print(f"⏩ Skipping existing chunk → {chunk_path}")
        continue

    print(f"🔹 Processing chunk {i} to {chunk_end}")

    # Embed chunk
    chunk_embeddings = embed_texts(chunk_texts)

    # Create new FAISS index for this chunk
    chunk_index = faiss.IndexFlatL2(embedding_dim)
    chunk_index.add(chunk_embeddings)

    # Save index chunk: write under a temporary name and rename afterwards, so
    # an interrupted write never leaves a truncated file under the final name
    # (which the skip check above would otherwise treat as complete).
    tmp_path = chunk_path + ".tmp"
    faiss.write_index(chunk_index, tmp_path)
    os.replace(tmp_path, chunk_path)
    print(f"✅ Saved chunk index → {chunk_path}")


# **NEXT EXTRA**

In [None]:
import os
import faiss
from tqdm import tqdm
import numpy as np

# ✅ Parameters
chunk_size = 10000
embedding_dim = 768  # For Contriever
# Number of texts handed to embed_texts per call; the vectors from each call
# are added to the chunk's index before the next call, so at most this many
# embeddings are held outside the index at once.
batch_size = 256
save_dir = "/content/drive/MyDrive/faiss_chunks"
os.makedirs(save_dir, exist_ok=True)

# ✅ Where to resume
# Use the END index of the last saved chunk, not "+ 1": chunk files are named
# faiss_chunk_<start>_<end>.faiss with an exclusive end, so after
# faiss_chunk_0_10000.faiss the next start is 10000.
start_idx = 10000
end_idx = len(corpus)

for i in range(start_idx, end_idx, chunk_size):
    chunk_texts = corpus[i:i + chunk_size]
    chunk_filename = f"faiss_chunk_{i}_{i + len(chunk_texts)}.faiss"
    chunk_path = os.path.join(save_dir, chunk_filename)

    if os.path.exists(chunk_path):
        print(f"⏩ Skipping existing chunk → {chunk_path}")
        continue

    print(f"🔹 Processing chunk {i} to {i + len(chunk_texts)}")

    # The index is written under a temporary name and renamed only after the
    # write finishes, so the skip check above never sees a truncated file.
    tmp_path = chunk_path + ".tmp"

    try:
        # Create empty FAISS index
        chunk_index = faiss.IndexFlatL2(embedding_dim)

        # Embed in mini-batches
        for j in tqdm(range(0, len(chunk_texts), batch_size), desc="Embedding mini-batches"):
            batch_texts = chunk_texts[j:j + batch_size]
            batch_embeddings = embed_texts(batch_texts)  # returns np.array
            chunk_index.add(batch_embeddings)

        # Save this chunk
        faiss.write_index(chunk_index, tmp_path)
        os.replace(tmp_path, chunk_path)
        print(f"✅ Saved chunk index → {chunk_path}")

    except Exception as e:
        print(f"❌ Error processing chunk {i} → {e}")
        # Drop any partially written temporary file before stopping.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        break
