In [1]:
# ---------- installs ----------
!pip install -q sentence-transformers faiss-gpu transformers[sentencepiece] accelerate
!pip install -q faiss-cpu

# ---------- imports ----------
import os, re, json
from tqdm import tqdm
import pandas as pd
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# ---------- CONFIG ----------
INPUT_CSV = "/content/drive/MyDrive/Data Folder/processed_text.csv"  # your CSV path
TEXT_COL = None   # auto-detect if None
CHUNK_SIZE = 300     # words per chunk
CHUNK_STRIDE = 50
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
FAISS_INDEX_PATH = "/content/drive/MyDrive/Data Folder/faiss_index.bin"
METADATA_PATH = "/content/drive/MyDrive/Data Folder/chunks_metadata.json"
BATCH_SIZE = 64
TOP_K = 5
SUMMARIZER_MODEL = "facebook/bart-base"

# ---------- mount drive ----------
from google.colab import drive
drive.mount('/content/drive')

# ---------- helper: detect text column ----------
def detect_text_column(df):
    for c in ("text", "content", "message", "body", "tweet", "review"):
        if c in df.columns:
            return c
    obj_cols = [c for c in df.columns if df[c].dtype == "object"]
    if not obj_cols:
        raise ValueError("No object columns found.")
    best, best_len = None, -1
    for c in obj_cols:
        try:
            avg = df[c].dropna().astype(str).map(len).mean()
            if avg > best_len:
                best_len = avg; best = c
        except:
            continue
    return best

# ---------- load CSV ----------
print("Loading CSV:", INPUT_CSV)
df = pd.read_csv(INPUT_CSV, dtype=str, low_memory=False)
if TEXT_COL is None:
    TEXT_COL = detect_text_column(df)
print("Using text column:", TEXT_COL)

# ---------- chunking ----------
def chunk_text(text, chunk_size=CHUNK_SIZE, stride=CHUNK_STRIDE):
    if not isinstance(text, str) or text.strip() == "":
        return []
    words = text.split()
    chunks = []
    i = 0
    n = len(words)
    while i < n:
        chunk = words[i:i+chunk_size]
        chunks.append(" ".join(chunk))
        if i + chunk_size >= n:
            break
        i += chunk_size - stride
    return chunks

# ---------- build chunks and metadata ----------
all_chunks = []
metadata = []
doc_ids = df.index.tolist()
print("Creating chunks...")
for doc_idx in tqdm(doc_ids):
    raw = df.loc[doc_idx, TEXT_COL]
    chunks = chunk_text(raw)
    for cid, ch in enumerate(chunks):
        metadata.append({"doc_idx": int(doc_idx), "chunk_id": cid})
        all_chunks.append(ch)
print("Total chunks:", len(all_chunks))

# ---------- embeddings ----------
device_str = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading embedder on device:", device_str)
embedder = SentenceTransformer(EMBED_MODEL_NAME, device=device_str)

embeddings = []
for i in tqdm(range(0, len(all_chunks), BATCH_SIZE), desc="Embedding batches"):
    batch = all_chunks[i:i+BATCH_SIZE]
    embs = embedder.encode(batch, show_progress_bar=False, convert_to_numpy=True)
    embeddings.append(embs)
embeddings = np.vstack(embeddings).astype('float32')
print("Embeddings shape:", embeddings.shape)

# ---------- FAISS index ----------
dim = embeddings.shape[1]
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
print("FAISS index size:", index.ntotal)

faiss.write_index(index, FAISS_INDEX_PATH)
with open(METADATA_PATH, "w", encoding="utf-8") as f:
    json.dump(metadata, f)
print("Saved FAISS index and metadata.")

# ---------- summarizer ----------
device = 0 if torch.cuda.is_available() else -1
print("Loading summarizer on device:", "GPU" if device == 0 else "CPU")
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=device)

# ---------- retrieval + summarization functions ----------
def retrieve_topk_for_text(text, top_k=TOP_K):
    chunks = chunk_text(text)
    if not chunks:
        return []
    ch_embs = embedder.encode(chunks, convert_to_numpy=True)
    faiss.normalize_L2(ch_embs)
    D, I = index.search(ch_embs, top_k)
    ids = np.unique(I.flatten())
    retrieved = [all_chunks[int(i)] for i in ids if int(i) < len(all_chunks)]
    return retrieved

def summarize_from_retrieved(retrieved_chunks, max_length=130, min_length=30):
    summaries = []
    for chunk in retrieved_chunks:
        words = chunk.split()
        if len(words) > 200:   # truncate each chunk to 200 words
            chunk = " ".join(words[:200])
        chunk = re.sub(r"[^ -~\n]", " ", chunk)  # keep ASCII printable chars
        summ = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(summ[0]['summary_text'])
    return " ".join(summaries)

# ---------- test summarization ----------
n = min(3, len(df))
for i in range(n):
    doc_text = df.loc[df.index[i], TEXT_COL]
    retrieved = retrieve_topk_for_text(doc_text)
    print("\n\n=== Document", i, "retrieved chunks count:", len(retrieved))
    print("First 300 chars of retrieved context:\n", " ".join(retrieved)[:300])
    summ = summarize_from_retrieved(retrieved)
    print("\nSummary:\n", summ)

print("Done. Now you can query and summarize any text using this pipeline.")


[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Loading CSV: /content/drive/MyDrive/Data Folder/processed_text.csv
Using text column: text
Creating chunks...


100%|██████████| 14235/14235 [00:00<00:00, 56792.46it/s]


Total chunks: 14236
Loading embedder on device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding batches: 100%|██████████| 223/223 [00:39<00:00,  5.69it/s]


Embeddings shape: (14236, 384)
FAISS index size: 14236
Saved FAISS index and metadata.
Loading summarizer on device: GPU


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




=== Document 0 retrieved chunks count: 5
First 300 chars of retrieved context:
 Bid Details Bid End Date Time 20-02-2025 15:00:00 Bid Opening Date Time 20-02-2025 15:30:00 Bid Offer Validity (From End Date) ( ) 90 (Days) Ministry State Name ! Ministry Of Housing & Urban Affairs (mohua) Department Name Na Organisation Name Kochi Metro Rail Limited Office Name % Kochi Metro Rail 


Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Summary:
 Bid Details Bid End Date Time 20-02-2025 15:00:00 Bid Opening Date Time 10-02/2025 10:30:00 bid Offer Validity (From End Date) ( ) 90 (Days) Ministry State Name ! Ministry Of Housing & Urban Affairs (mohua) Department Name Na Organisation Name Kochi Metro Rail Limited Office Name % KochiMetro Rail Limited & Buyer Email mmt.kmrl@kerala.gov.in Item Category Financial Audit Services - As per ATC; Audit Firm Contract Period 3 Year(s) MSE Exemption for Years of Experience and Turnover , No % MSE (From Beginning to End Date), 0 % No Document required from seller & 0 Certificate (Requested in ATC),Additional Doc 2 (Requesting in A TC),Additional Certificates,Additional Doc 3 (Requester),Additional Document 4 ( Requested in aTC), Additional Doc 4 (Requeste in Atc) In case any bidder is seeking exemption from Experience Turnover Criteria, the supporting documents to prove his eligibility for exemption must be uploaded for evaluation by the buyer Do you want to show documents uploaded

Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Summary:
 Bid Details Bid End Date Time 20-02-2025 15:00:00 Bid Opening Date Time 10-02/2025 10:30:00 bid Offer Validity (From End Date) ( ) 90 (Days) Ministry State Name ! Ministry Of Housing & Urban Affairs (mohua) Department Name Na Organisation Name Kochi Metro Rail Limited Office Name % KochiMetro Rail Limited & Buyer Email mmt.kmrl@kerala.gov.in Item Category Financial Audit Services - As per ATC; Audit Firm Contract Period 3 Year(s) MSE Exemption for Years of Experience and Turnover , No % MSE (From Beginning to End Date), 0 % No Document required from seller & 0 Certificate (Requested in ATC),Additional Doc 2 (Requesting in A TC),Additional Certificates,Additional Doc 3 (Requester),Additional Document 4 ( Requested in aTC), Additional Doc 4 (Requeste in Atc) In case any bidder is seeking exemption from Experience Turnover Criteria, the supporting documents to prove his eligibility for exemption must be uploaded for evaluation by the buyer Do you want to show documents uploaded

Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=130) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Summary:
 Financial Document Indicating Price Breakup Required 6 % B 0 D F Yes Arbitration Clause No Mediation Clause No Bid Details EMD Detail Required F No ePBG Detail Required M No MII Compliance M II Compliance Yes MSE Purchase Preference MSEPurchase Preference Yes 1. Purchase preference to Micro and Small Enterprises (MSEs): Purchase preference will be given to MSEs as defined in Public Procurement Policy for Micro- Small Enterprises and its subsequent Orders Notifications issued by Ministry of Micro, Small and Medium Enterprises. If the bidder wants to avail the Purchase preference for services, the bidder must be the Service provider of the offered Service. Relevant documentary evidence in this regard shall be uploaded along with the bid in respect of the offering service. If L-1 is not an MSE and MSE Service Provider (s) has have quoted price within L-2 15% of margin of purchase preference price band as per the terms and conditions of the offer. with the bid in respect of the 