In [1]:
# Run once in Colab
!pip install --quiet scikit-learn pypdf2 nltk streamlit faiss-cpu
# faiss-cpu is optional; we don't need it strictly, but it's useful if you want faster retrieval later


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m93.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
from pathlib import Path
import pickle
import io
from typing import List, Tuple

# Text processing
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# PDF loader
from PyPDF2 import PdfReader

# Vectorizer and similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Small utility
STOPWORDS = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
def load_pdf_text(path: str) -> str:
    text_chunks = []
    reader = PdfReader(path)
    for page in reader.pages:
        try:
            text = page.extract_text()
        except Exception:
            text = ""
        if text:
            text_chunks.append(text)
    return "\n".join(text_chunks)

def load_txt(path: str) -> str:
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

def clean_text(text: str) -> str:
    # Basic clean-up
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def chunk_text_by_sentences(text: str, max_sentences:int=6, overlap:int=1) -> List[str]:
    # Make chunks of up to max_sentences sentences, with small overlap
    sents = sent_tokenize(text)
    chunks = []
    i = 0
    while i < len(sents):
        chunk = sents[i:i+max_sentences]
        chunks.append(" ".join(chunk))
        i += max_sentences - overlap
    return chunks


In [4]:
from google.colab import files
uploaded = files.upload()  # choose PDF or .txt files
print("Uploaded:", list(uploaded.keys()))


Saving 12345.txt to 12345.txt
Uploaded: ['12345.txt']


In [5]:
os.makedirs("documents", exist_ok=True)
for fname in uploaded.keys():
    with open(os.path.join("documents", fname), "wb") as f:
        f.write(uploaded[fname])


In [7]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
doc_dir = Path("documents")
all_chunks = []   # will hold tuples (doc_name, chunk_text)
for fp in doc_dir.glob("*"):
    fname = fp.name
    ext = fp.suffix.lower()
    if ext in ['.pdf']:
        raw = load_pdf_text(str(fp))
    elif ext in ['.txt']:
        raw = load_txt(str(fp))
    else:
        # skip unknown file types (or add handlers)
        print("Skipping unknown file type:", fname)
        continue

    raw = clean_text(raw)
    chunks = chunk_text_by_sentences(raw, max_sentences=6, overlap=2)
    for i, ch in enumerate(chunks):
        if len(ch.strip()) < 50:
            continue
        all_chunks.append({
            "doc": fname,
            "chunk_id": f"{fname}__{i}",
            "text": ch
        })

print("Total chunks created:", len(all_chunks))
# Optionally inspect first chunk
if all_chunks:
    print("Example chunk:", all_chunks[0])


Total chunks created: 3
Example chunk: {'doc': '12345.txt', 'chunk_id': '12345.txt__0', 'text': 'leave policy: Employees can take 12 casual leaves and 12 sick leaves per year. work hours: The standard working hours are 9 AM to 6 PM. overtime policy: Overtime is compensated with extra pay or time-off. dress code: Employees are expected to follow business casual attire. salary credit: Salaries are credited on the last working day of every month. insurance: All employees receive health insurance coverage after probation.'}


In [10]:
corpus = [c["text"] for c in all_chunks]

# Vectorizer: you can tweak ngram_range, max_features etc.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.9,
    min_df=1,
    ngram_range=(1,2)
)

tfidf_matrix = vectorizer.fit_transform(corpus)  # shape: (n_chunks, n_features)
print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Save index & metadata for reuse
os.makedirs("kb_index", exist_ok=True)
with open("kb_index/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
with open("kb_index/chunks.pkl", "wb") as f:
    pickle.dump(all_chunks, f)
with open("kb_index/tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)
print("Saved vectorizer + chunks + matrix to kb_index/")


TF-IDF matrix shape: (3, 113)
Saved vectorizer + chunks + matrix to kb_index/


In [11]:
import numpy as np

# if you prefer, load from pickle (here we already have them in memory)
# with open("kb_index/vectorizer.pkl","rb") as f: vectorizer = pickle.load(f)
# with open("kb_index/chunks.pkl","rb") as f: all_chunks = pickle.load(f)
# with open("kb_index/tfidf_matrix.pkl","rb") as f: tfidf_matrix = pickle.load(f)

def retrieve(query: str, top_k:int=3) -> List[Tuple[float, dict]]:
    q = clean_text(query)
    q_vec = vectorizer.transform([q])
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]  # array of similarities
    top_idx = np.argsort(sims)[::-1][:top_k]
    results = []
    for idx in top_idx:
        results.append((float(sims[idx]), all_chunks[idx]))
    return results

# Quick test
print(retrieve("What are the leave policies?", top_k=3))


[(0.1076801377777102, {'doc': '12345.txt', 'chunk_id': '12345.txt__0', 'text': 'leave policy: Employees can take 12 casual leaves and 12 sick leaves per year. work hours: The standard working hours are 9 AM to 6 PM. overtime policy: Overtime is compensated with extra pay or time-off. dress code: Employees are expected to follow business casual attire. salary credit: Salaries are credited on the last working day of every month. insurance: All employees receive health insurance coverage after probation.'}), (0.0, {'doc': '12345.txt', 'chunk_id': '12345.txt__2', 'text': 'probation period: The probation period is 3 months for new employees.'}), (0.0, {'doc': '12345.txt', 'chunk_id': '12345.txt__1', 'text': 'salary credit: Salaries are credited on the last working day of every month. insurance: All employees receive health insurance coverage after probation. holiday list: The company follows 12 national holidays per year. transport: Office transport is available from major city points. prob

In [12]:
def answer_query(query:str, top_k:int=3, show_scores:bool=True) -> str:
    results = retrieve(query, top_k=top_k)
    if not results:
        return "No relevant content found in the knowledge base."
    lines = []
    for score, chunk in results:
        title = f"[{chunk['doc']} | {chunk['chunk_id']}]"
        if show_scores:
            lines.append(f"{title}  (score={score:.3f})\n{chunk['text']}\n")
        else:
            lines.append(f"{title}\n{chunk['text']}\n")
    return "\n---\n".join(lines)

# Example
print(answer_query("How many sick leaves are allowed?", top_k=2))


[12345.txt | 12345.txt__0]  (score=0.249)
leave policy: Employees can take 12 casual leaves and 12 sick leaves per year. work hours: The standard working hours are 9 AM to 6 PM. overtime policy: Overtime is compensated with extra pay or time-off. dress code: Employees are expected to follow business casual attire. salary credit: Salaries are credited on the last working day of every month. insurance: All employees receive health insurance coverage after probation.

---
[12345.txt | 12345.txt__2]  (score=0.000)
probation period: The probation period is 3 months for new employees.

