### Project Title: Document Summarization using Retrieval-Augmented Generation (RAG)

In [6]:
import numpy as np
import faiss
import time
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re

# Download punkt tokenizer (for word_tokenize)
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load sentence embedding model
EMBED_MODEL = SentenceTransformer("all-MiniLM-L6-v2")

# === Constants ===
TOP_K = 5               # Chunks to retrieve
CHUNK_SIZE = 500        # Words per chunk
MAX_SUMMARY_CHARS = 4000  # Max characters for LLM context

# === Utility Functions ===
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def estimate_tokens(text):
    try:
        return len(word_tokenize(text))
    except:
        return len(text.split())

# === Step 1: Chunking ===
def chunk_text(text, chunk_size=CHUNK_SIZE):
    words = text.split()
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# === Step 2: Embedding ===
def embed_chunks(chunks):
    return EMBED_MODEL.encode(chunks, convert_to_numpy=True)

# === Step 3: FAISS Index ===
def build_faiss_index(embeddings):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index

# === Step 4: Retrieval ===
def retrieve_chunks(query, chunks, embeddings, index, top_k=TOP_K):
    query_embed = EMBED_MODEL.encode([query], convert_to_numpy=True)
    D, I = index.search(query_embed, top_k)
    similarities = cosine_similarity(query_embed, embeddings)[0]
    return [(chunks[i], similarities[i]) for i in I[0]], query_embed

# === Step 5: Summarization ===
def summarize_with_huggingface(context_chunks, max_char_len=MAX_SUMMARY_CHARS):
    combined = clean_text(" ".join(context_chunks))
    if len(combined) > max_char_len:
        combined = combined[:max_char_len]
    summary = summarizer(combined, max_length=300, min_length=60, do_sample=False)[0]['summary_text']
    return summary, combined

# === Step 6: Full Pipeline per Article ===
def summarize_article(article_text):
    print("üß© Chunking...")
    start = time.time()
    chunks = chunk_text(article_text)
    print(f"‚è±Ô∏è Chunking took: {time.time() - start:.2f}s")

    print("üîé Embedding...")
    start = time.time()
    embeddings = embed_chunks(chunks)
    print(f"‚è±Ô∏è Embedding took: {time.time() - start:.2f}s")

    print("üì¶ Building vector index...")
    start = time.time()
    index = build_faiss_index(np.array(embeddings))
    print(f"‚è±Ô∏è Indexing took: {time.time() - start:.2f}s")

    print("üì• Retrieving relevant chunks...")
    start = time.time()
    retrieved, _ = retrieve_chunks("Summarize this document", chunks, embeddings, index)
    print(f"‚è±Ô∏è Retrieval took: {time.time() - start:.2f}s")

    print("\nüîç Top Retrieved Chunks with Similarity Scores:")
    context_chunks = []
    for i, (chunk, score) in enumerate(retrieved):
        print(f"\n--- Chunk {i+1} | Similarity: {score:.4f} ---")
        print(chunk[:300] + ("..." if len(chunk) > 300 else ""))
        context_chunks.append(chunk)

    print("\nüìù Generating summary...")
    start = time.time()
    summary, input_text = summarize_with_huggingface(context_chunks)
    print(f"‚è±Ô∏è Summarization took: {time.time() - start:.2f}s")

    print("\n‚úÖ Summary:")
    print(summary)

    print("\nüìä Stats:")
    print(f"- Approx. input tokens: {estimate_tokens(input_text)}")
    print(f"- Approx. summary tokens: {estimate_tokens(summary)}")

# === Main ===
if __name__ == "__main__":
    print("üìö Loading dataset...")
    ds = load_dataset("abisee/cnn_dailymail", "1.0.0", split="test")
    articles = [sample['article'] for sample in ds.select(range(3))]

    for idx, article in enumerate(articles):
        print(f"\n================= Article {idx + 1} =================")
        summarize_article(article)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\riyya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cpu


üìö Loading dataset...

üß© Chunking...
‚è±Ô∏è Chunking took: 0.00s
üîé Embedding...
‚è±Ô∏è Embedding took: 0.25s
üì¶ Building vector index...
‚è±Ô∏è Indexing took: 0.00s
üì• Retrieving relevant chunks...
‚è±Ô∏è Retrieval took: 0.03s

üîç Top Retrieved Chunks with Similarity Scores:

--- Chunk 1 | Similarity: 0.0943 ---
office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecu...

--- Chunk 2 | Similarity: -0.0110 ---
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the cou...

--- Chunk 3 | Similarity