In [None]:
# Mount Google Drive at /content/drive. The data and vector-store paths used
# by later cells all live under this mount point, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install --upgrade huggingface_hub sentence-transformers

Collecting huggingface_hub
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading huggingface_hub-0.33.2-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.4/515.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━

In [None]:
# ✅ Clean install to fix version conflicts
!pip uninstall -y transformers sentence-transformers
!pip install -U sentence-transformers==2.2.2 transformers==4.31.0


Found existing installation: transformers 4.31.0
Uninstalling transformers-4.31.0:
  Successfully uninstalled transformers-4.31.0
Found existing installation: sentence-transformers 2.2.2
Uninstalling sentence-transformers-2.2.2:
  Successfully uninstalled sentence-transformers-2.2.2
Collecting sentence-transformers==2.2.2
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting transformers==4.31.0
  Using cached transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
Using cached transformers-4.31.0-py3-none-any.whl (7.4 MB)
Installing collected packages: transformers, sentence-transformers
Successfully installed sentence-transformers-2.2.2 transformers-4.31.0


In [None]:
import os

# Send signal 9 to the kernel's own process. The process is terminated
# immediately, which forces the runtime to start a fresh kernel that will
# import the packages installed above. Nothing after this line runs.
current_pid = os.getpid()
os.kill(current_pid, 9)


In [None]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
from tqdm import tqdm

# ---- Tunable parameters ----
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
CHUNK_SIZE = 500       # passed to the text splitter as chunk_size
CHUNK_OVERLAP = 50     # passed to the text splitter as chunk_overlap
BATCH_SIZE = 128       # texts handed to the encoder per call (author's note: safer for a T4 GPU; 256 may also work)

# ---- Input / output locations on the mounted Drive ----
DATA_PATH = "/content/drive/MyDrive/complaint-insightbot/filtered_complaints.csv"
VECTOR_STORE_DIR = "/content/drive/MyDrive/complaint-insightbot/vector_store"
INDEX_PATH = os.path.join(VECTOR_STORE_DIR, "faiss_index")
METADATA_PATH = os.path.join(VECTOR_STORE_DIR, "metadata.pkl")

# Make sure the output folder exists before anything tries to write into it;
# this is a no-op when the folder is already there.
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)

# ✅ Load data
print("📥 Loading data...")
df = pd.read_csv(DATA_PATH)

# ✅ Chunking
# Split every complaint narrative into overlapping character chunks and record
# one metadata dict per chunk in `flattened`. The position of a dict in
# `flattened` is what later ties it to its embedding row.
print("✂️ Chunking texts...")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
)

# Pull the needed columns out once. Calling `df.iloc[idx]` inside the loop
# builds a full row Series on every access, which dominates the runtime on a
# frame with hundreds of thousands of rows.
n_rows = len(df)
narratives = df['cleaned_narrative'].tolist()
# Same fallbacks as a per-row `.get(...)`: the positional index stands in for
# a missing complaint_id column, None for a missing product_mapped column.
complaint_ids = df['complaint_id'].tolist() if 'complaint_id' in df.columns else list(range(n_rows))
products = df['product_mapped'].tolist() if 'product_mapped' in df.columns else [None] * n_rows

flattened = []
skipped_rows = 0
rows = zip(narratives, complaint_ids, products)
for idx, (text, complaint_id, product) in tqdm(enumerate(rows), total=n_rows, desc="🔨 Splitting"):
    # Missing narratives are read by pandas as float NaN; the splitter cannot
    # handle them, so skip them explicitly and report a single total below
    # instead of printing one warning per row.
    if not isinstance(text, str):
        skipped_rows += 1
        continue
    try:
        chunks = splitter.split_text(text)
    except Exception as e:
        # Best effort: one bad narrative must not abort the whole run.
        print(f"⚠️ Failed to split text at row {idx}: {e}")
        continue
    for i, chunk in enumerate(chunks):
        flattened.append({
            "chunk_id": f"{complaint_id}_{i}",
            "complaint_id": complaint_id,
            "product": product,
            "chunk_text": chunk
        })

if skipped_rows:
    print(f"⚠️ Skipped {skipped_rows} rows with a missing or non-text narrative")

# ✅ Embedding
# Encode every chunk text into a vector. Row k of `embeddings` must describe
# `flattened[k]`, because the index built from `embeddings` and the metadata
# list are only related by position.
print("🔍 Embedding chunks...")
texts = [item["chunk_text"] for item in flattened]
if not texts:
    # Fail early with a clear message instead of loading the model and then
    # crashing inside np.vstack on an empty list.
    raise ValueError("No chunks to embed: `flattened` is empty.")

model = SentenceTransformer(EMBEDDING_MODEL_NAME, device='cuda')

all_embeddings = []
kept_positions = []  # positions in `texts`/`flattened` that were embedded
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="🚀 Encoding"):
    batch = texts[i:i + BATCH_SIZE]
    try:
        # Pass batch_size explicitly: otherwise encode() re-splits the slice
        # using its own default batch size and BATCH_SIZE has no effect on
        # what is actually sent to the GPU.
        emb = model.encode(batch, batch_size=BATCH_SIZE, convert_to_numpy=True)
    except Exception as e:
        # Best effort: keep going, but remember which rows are missing.
        print(f"❌ Failed to embed batch {i}-{i + len(batch)}: {e}")
        continue
    all_embeddings.append(emb)
    kept_positions.extend(range(i, i + len(batch)))

if not all_embeddings:
    raise RuntimeError("Every embedding batch failed; there is nothing to index.")

embeddings = np.vstack(all_embeddings)

if len(kept_positions) != len(flattened):
    # Some batches failed. Drop their metadata (and texts) so that positions
    # in `flattened` still match rows in `embeddings`.
    dropped = len(flattened) - len(kept_positions)
    flattened = [flattened[pos] for pos in kept_positions]
    texts = [texts[pos] for pos in kept_positions]
    print(f"⚠️ Dropped {dropped} chunks whose batches failed to embed")

assert embeddings.shape[0] == len(flattened), "embeddings and metadata are misaligned"

# ✅ FAISS Index
# Build an exact L2 index over the chunk embeddings and persist it together
# with the per-chunk metadata list.
print("📦 Building FAISS index...")

# The index and the metadata list are only linked by position, so refuse to
# build or save anything when their lengths disagree (for example after an
# embedding batch was skipped). Writing them out would silently pair search
# hits with the wrong complaints.
if embeddings.shape[0] != len(flattened):
    raise RuntimeError(
        f"Embedding/metadata mismatch: {embeddings.shape[0]} vectors vs "
        f"{len(flattened)} metadata records; not saving a misaligned vector store."
    )

# FAISS expects a C-contiguous float32 matrix; this is a no-op when the
# encoder already returned one.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# ✅ Save
print("💾 Saving index and metadata...")
faiss.write_index(index, INDEX_PATH)
# NOTE: the metadata is pickled; only load this file back from a trusted
# location, since unpickling can execute arbitrary code.
with open(METADATA_PATH, "wb") as f:
    pickle.dump(flattened, f)

print("✅ All done! Vector store is ready.")


📥 Loading data...
✂️ Chunking texts...


🔨 Splitting: 100%|██████████| 459583/459583 [01:54<00:00, 4006.29it/s]


🔍 Embedding chunks...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🚀 Encoding: 100%|██████████| 8947/8947 [26:38<00:00,  5.60it/s]


📦 Building FAISS index...
💾 Saving index and metadata...
✅ All done! Vector store is ready.


In [None]:
!pip install -q faiss-cpu sentence-transformers tqdm