In [None]:
pip install transformers sentence-transformers faiss-cpu pypdf nltk


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting pypdf
  Downloading pypdf-6.5.0-py3-none-any.whl.metadata (7.1 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.8/23.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf-6.5.0-py3-none-any.whl (329 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m329.6/329.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, faiss-cpu
Successfully installed faiss-cpu-1.13.2 pypdf-6.5.0


In [None]:
import os
import nltk
import faiss
import numpy as np

from pypdf import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer




In [None]:
# sent_tokenize needs the Punkt sentence-boundary models. NLTK 3.9 and later
# look them up under the "punkt_tab" resource, while older releases use
# "punkt", so fetch both to make the notebook work on either.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize


In [None]:
def load_document(file_path):
    """Read a PDF or plain-text document and return its text content.

    Args:
        file_path: Path to a ``.pdf`` or ``.txt`` file. The extension check
            is case-insensitive, so ``REPORT.PDF`` is accepted as well.

    Returns:
        str: For PDFs, the text extracted from every page joined with
        newlines; for text files, the file content decoded as UTF-8.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.txt``.
    """
    lowered_path = str(file_path).lower()

    if lowered_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        # A page may yield no text (e.g. an image-only scan); treat it as "".
        # Joining with a newline keeps the last word of one page from fusing
        # with the first word of the next, which would corrupt sentence
        # tokenization downstream.
        return "\n".join((page.extract_text() or "") for page in reader.pages)

    if lowered_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

    raise ValueError("Unsupported file format")


In [None]:
def chunk_text(text, max_sentences=5):
    """Group the sentences of ``text`` into consecutive, non-overlapping chunks.

    Args:
        text: The document text to split.
        max_sentences: Maximum number of sentences per chunk.

    Returns:
        list[str]: Chunks in document order; each chunk is its sentences
        joined with a single space. The final chunk may hold fewer than
        ``max_sentences`` sentences.
    """
    all_sentences = sent_tokenize(text)
    window_starts = range(0, len(all_sentences), max_sentences)
    return [
        " ".join(all_sentences[start:start + max_sentences])
        for start in window_starts
    ]


In [None]:
# Model identifiers, kept in one place so they are easy to swap.
SUMMARIZATION_MODEL_NAME = "facebook/bart-large-cnn"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Summarization pipeline (loaded once and shared by the cells below).
summarizer = pipeline("summarization", model=SUMMARIZATION_MODEL_NAME)

# Sentence-embedding model used to vectorize chunks and questions.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [None]:
def summarize_document(text, max_chars=1024, max_length=150, min_length=50):
    """Summarize the beginning of a document with the shared ``summarizer``.

    Only the first ``max_chars`` characters of ``text`` are passed to the
    model, so the summary reflects the opening of the document rather than
    the whole of it.

    Args:
        text: Full document text.
        max_chars: Number of leading characters of ``text`` to summarize.
        max_length: Upper bound on the generated summary length, forwarded
            to the pipeline.
        min_length: Lower bound on the generated summary length, forwarded
            to the pipeline.

    Returns:
        str: The generated summary, or ``""`` when ``text`` is empty or
        whitespace-only (the model is not invoked in that case).
    """
    # Nothing to summarize (e.g. a scanned PDF with no extractable text):
    # skip the model instead of generating text from an empty prompt.
    if not text or not text.strip():
        return ""

    result = summarizer(
        text[:max_chars],
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Let the tokenizer clip the input to the model's limit so a larger
        # max_chars cannot overflow it.
        truncation=True,
    )
    return result[0]["summary_text"]


In [None]:
def generate_tags(summary, max_tags=5, min_length=7):
    """Pick simple keyword tags from a summary.

    A word qualifies when, after lowercasing and stripping surrounding
    punctuation, it is purely alphabetic and at least ``min_length``
    characters long.

    Args:
        summary: Text to extract tags from.
        max_tags: Maximum number of tags to return.
        min_length: Minimum number of characters a word needs to qualify.

    Returns:
        list[str]: Up to ``max_tags`` unique lowercase words, in order of
        first appearance, so the same summary always yields the same tags.
    """
    surrounding_punctuation = ".,;:!?\"'()[]{}"
    candidates = (
        token.strip(surrounding_punctuation)
        for token in summary.lower().split()
    )
    # dict.fromkeys de-duplicates while preserving first-seen order; a set
    # would make both the selection and the order vary between runs.
    unique_words = dict.fromkeys(
        word for word in candidates
        if len(word) >= min_length and word.isalpha()
    )
    return list(unique_words)[:max_tags]


In [None]:
def build_vector_store(chunks):
    """Embed ``chunks`` and index them in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks.

    Returns:
        tuple: ``(index, embeddings)`` where ``index`` is a
        ``faiss.IndexFlatL2`` holding one vector per chunk (in chunk order)
        and ``embeddings`` is the float32 array that was added to it.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to embed
            and the embedding dimension cannot be determined.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot build a vector store from an empty list of chunks")

    # FAISS works on float32 matrices; make the dtype explicit rather than
    # relying on the encoder's default output.
    embeddings = np.array(embedding_model.encode(chunks), dtype=np.float32)
    dimension = embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, embeddings


In [None]:
def retrieve_chunks(question, chunks, index, top_k=3):
    """Return the chunks whose embeddings are nearest to ``question``.

    Args:
        question: The user query to embed and search with.
        chunks: Text chunks, in the same order as the vectors in ``index``.
        index: Search index exposing ``search(query, k)``.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: At most ``min(top_k, len(chunks))`` chunks, in the order
        returned by the index. Empty when there are no chunks or ``top_k``
        is not positive.
    """
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([question])
    _, indices = index.search(query_embedding, k)

    # FAISS pads missing neighbours with -1; indexing chunks[-1] would
    # silently return the last chunk as a bogus hit, so drop anything that
    # is not a valid position.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]


In [None]:
def answer_question(question, chunks, index):
    """Answer ``question`` from the most relevant chunks of the document.

    The retrieved chunks are placed in a prompt together with the question
    and passed to the shared ``summarizer`` pipeline.

    Args:
        question: The user's question.
        chunks: Text chunks of the processed document.
        index: Vector index built over ``chunks``.

    Returns:
        str: The generated text, or ``"SORRY I DON'T KNOW"`` when the
        question is blank or no chunk could be retrieved.
    """
    fallback = "SORRY I DON'T KNOW"
    max_prompt_chars = 1024

    if not question or not question.strip():
        return fallback

    retrieved = retrieve_chunks(question, chunks, index)

    if not retrieved:
        return fallback

    context = " ".join(retrieved)

    header = (
        "Answer ONLY from the context below.\n"
        "If the answer is not in the context, say 'SORRY I DON'T KNOW'.\n\n"
        "Context:\n"
    )
    footer = f"\n\nQuestion:\n{question}"

    # The question sits at the end of the prompt, so truncating the whole
    # prompt would cut it off whenever the context is long. Trim the context
    # instead so the instructions and the question always fit.
    context_budget = max(0, max_prompt_chars - len(header) - len(footer))
    prompt = header + context[:context_budget] + footer

    response = summarizer(
        prompt[:max_prompt_chars],  # still bounds an extremely long question
        max_length=120,
        min_length=40,
        do_sample=False
    )

    return response[0]["summary_text"]


In [None]:
def process_document(file_path):
    """Load a document, summarize and tag it, and build its vector index.

    Progress messages, the summary and the tags are printed to stdout.

    Args:
        file_path: Path to the document, passed to ``load_document``.

    Returns:
        tuple: ``(chunks, index)`` — the text chunks and the vector index
        built over them, ready to pass to ``answer_question``.

    Raises:
        ValueError: If no text could be extracted from the document (for
            example an image-only PDF), or if ``load_document`` rejects the
            file format.
    """
    print("📄 Loading document...")
    text = load_document(file_path)

    # Fail early with a clear message: with no text there is nothing to
    # chunk, summarize or index, and the later steps would fail obscurely.
    if not text or not text.strip():
        raise ValueError(f"No extractable text found in document: {file_path}")

    print("✂️ Chunking text...")
    chunks = chunk_text(text)

    print("🧠 Summarizing document...")
    summary = summarize_document(text)

    print("🏷️ Generating tags...")
    tags = generate_tags(summary)

    print("📦 Creating embeddings...")
    index, _ = build_vector_store(chunks)

    print("\n✅ Document Processed Successfully\n")
    print("SUMMARY:\n", summary)
    print("\nTAGS:", tags)

    return chunks, index


In [None]:
# Interactive demo: process one document, then answer questions about it
# until the user types 'exit'.
if __name__ == "__main__":
    file_path = "documents/sample.pdf"

    chunks, index = process_document(file_path)

    print("\n💬 Ask questions (type 'exit' to quit)\n")

    while True:
        try:
            query = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the cell was interrupted: end the session
            # cleanly instead of surfacing a traceback.
            print()
            break

        # Ignore blank lines rather than sending an empty question to the model.
        if not query:
            continue

        if query.lower() == "exit":
            break

        answer = answer_question(query, chunks, index)
        print("AI:", answer, "\n")
