In [None]:
!pip install sentence-transformers pypdf numpy langchain langchain-community langchain-text-splitters


In [None]:
import numpy as np
import re
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from langchain_text_splitters import RecursiveCharacterTextSplitter


In [None]:
# Checkpoint name passed to SentenceTransformer on the next line.
EMBED_MODEL = "all-MiniLM-L6-v2"     # lightweight, fast
# Loaded once at notebook level; later cells use this shared instance to embed
# both the resume chunks and the search query.
embedder = SentenceTransformer(EMBED_MODEL)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF joined into one string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The page texts in page order, each one preceded by a newline.
        Pages whose ``extract_text()`` result is empty or ``None`` contribute
        nothing, so a document with no extractable text yields ``""``.
    """
    per_page = (page.extract_text() for page in PdfReader(pdf_path).pages)
    # Falsy results are dropped so they do not leave bare newlines behind.
    return "".join("\n" + extracted for extracted in per_page if extracted)


In [None]:
def clean_text(text: str) -> str:
    """Normalize whitespace in extracted text.

    Tabs become spaces, runs of spaces collapse to a single space, runs of
    newlines collapse to a single newline, and leading/trailing whitespace
    is stripped from the final result.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized text.
    """
    normalized = text.replace('\t', ' ')
    # Order matters: tabs are already spaces by the time the space-run pass executes.
    for pattern, replacement in ((r' +', ' '), (r'\n+', '\n')):
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()


In [None]:
def chunk_text(text, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (presumably measured in characters -- confirm against the
            splitter's length function).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_text`` returns for ``text`` (a list of chunk strings
        in the notebook's usage).
    """
    # Separators handed to the splitter, coarsest boundary first.
    boundary_preference = ["\n\n", "\n", ". ", " ", ""]
    return RecursiveCharacterTextSplitter(
        separators=boundary_preference,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_text(text)


In [None]:
def embed_chunks(chunks, model):
    """Encode ``chunks`` with ``model`` and return the result as a NumPy array.

    Args:
        chunks: The texts to embed, passed to ``model.encode`` unchanged.
        model: Any object exposing ``encode(texts, convert_to_tensor=...,
            normalize_embeddings=...)``; the notebook passes a
            ``SentenceTransformer``.

    Returns:
        ``np.array`` built from the ``encode`` result.
    """
    # Request normalized embeddings and a non-tensor result from encode().
    encode_options = {"convert_to_tensor": False, "normalize_embeddings": True}
    return np.array(model.encode(chunks, **encode_options))


In [None]:
def retrieve(query, chunks, chunk_embeddings, model, top_k=3):
    """Return the ``top_k`` chunks most similar to ``query``, best match first.

    Args:
        query: The search text; encoded with ``model.encode`` using
            ``normalize_embeddings=True``.
        chunks: Sequence of chunk texts. Entry ``i`` is assumed to correspond
            to row ``i`` of ``chunk_embeddings`` -- the caller must keep the
            two aligned.
        chunk_embeddings: 2-D array of chunk vectors handed to
            ``cosine_similarity`` together with the query vector.
        model: Object exposing the same ``encode`` interface used by
            ``embed_chunks``.
        top_k: Maximum number of results. Fewer are returned when there are
            fewer chunks; a value of zero or less yields an empty list.

    Returns:
        A list of dicts ordered by descending similarity, each with
        ``"chunk_index"`` (built-in ``int``), ``"chunk_text"`` and
        ``"similarity"`` (built-in ``float``). Only built-in types are used
        so the list can be passed to ``json.dump`` directly.
    """
    # Nothing to rank: return early instead of encoding the query and
    # computing similarities against an empty corpus.
    if len(chunks) == 0 or top_k <= 0:
        return []

    query_emb = np.array(
        model.encode([query], convert_to_tensor=False, normalize_embeddings=True)
    )

    # One query row in, so row 0 holds its similarity to every chunk.
    sims = cosine_similarity(query_emb, chunk_embeddings)[0]

    # argsort is ascending; reverse for best-first, then keep the leading top_k.
    top_idx = np.argsort(sims)[::-1][:top_k]

    return [
        {
            # argsort yields NumPy integers; cast so the value is a plain int.
            "chunk_index": int(idx),
            "chunk_text": chunks[int(idx)],
            "similarity": float(sims[idx]),
        }
        for idx in top_idx
    ]


In [None]:
def build_rag_for_resume(pdf_path: str, model=None):
    """Run the extract -> clean -> chunk -> embed pipeline for one PDF.

    Args:
        pdf_path: Path handed to ``extract_text_from_pdf``.
        model: Embedding model forwarded to ``embed_chunks``. When omitted,
            the notebook-level ``embedder`` is used, so existing
            single-argument calls behave as before.

    Returns:
        A dict with four keys:
        ``"raw_text"`` (output of ``extract_text_from_pdf``),
        ``"clean_text"`` (output of ``clean_text``),
        ``"chunks"`` (output of ``chunk_text`` with its default settings) and
        ``"embeddings"`` (output of ``embed_chunks`` for those chunks).
    """
    if model is None:
        # Looked up at call time, so the global is only needed when no model is supplied.
        model = embedder

    # Step 1: extract the raw page text.
    raw = extract_text_from_pdf(pdf_path)

    # Step 2: normalize whitespace.
    clean = clean_text(raw)

    # Step 3: split the cleaned text into chunks.
    chunks = chunk_text(clean)

    # Step 4: embed each chunk.
    embeddings = embed_chunks(chunks, model)

    return {
        "raw_text": raw,
        "clean_text": clean,
        "chunks": chunks,
        "embeddings": embeddings
    }


In [None]:
# Build the in-memory index for the resume PDF; the bare filename is resolved
# against the notebook's current working directory.
resume_data = build_rag_for_resume("vinay_resume_dec.pdf")

# Sanity check: report how many chunks the splitter produced.
print("Total Clean Chunks:", len(resume_data["chunks"]))


Total Clean Chunks: 7


In [None]:
# Example query run against the index held in resume_data.
query = "machine learning experience"

# Ask for up to five matches; fewer come back when the index has fewer chunks.
results = retrieve(
    query,
    resume_data["chunks"],
    resume_data["embeddings"],
    embedder,
    top_k=5

)

# Print each hit with its 1-based rank, its chunk index and its similarity score.
for i, r in enumerate(results, start=1):
    print(f"\n--- Retrieved Chunk {i} (index={r['chunk_index']}, similarity={r['similarity']:.4f}) ---")
    print(r["chunk_text"])



--- Retrieved Chunk 1 (index=6, similarity=0.3501) ---
Technical Skills
Programming Languages: Python, C++, C, Java, MATLAB, SQL
Web Development: HTML, CSS, JavaScript, React, Angular, Flask, Node.js
Databases: MySQL, PostgreSQL, SQLite
Tools/Frameworks: VS Code, Eclipse, Material-UI, Docker, Kubernetes, Linux, Git
AI/Data Science: Machine Learning, Scikit-learn, Pandas, Matplotlib, Data Analysis
Coding Profile
Solved 300+ problems on LeetCode in Data Structures and Algorithms, demonstrating strong problem-solving
and algorithmic skills.

--- Retrieved Chunk 2 (index=1, similarity=0.3069) ---
Core Computer Science: Data Structures and Algorithms, Operating Systems, Database Management
Systems, Computer Networks, Theory of Computation, Software Engineering and Project Management
AI and Data Science: Machine Learning, Artificial Intelligence, Cognitive Sciences,NLP
Cloud Software Engineering: Cloud Computing, DevOps
Projects
Plagiarism Detection System — NLP, Machine Learning [GitHub] O

In [14]:
import pickle

# Persist the whole resume_data dict (raw text, cleaned text, chunks, embeddings).
# NOTE(review): only unpickle this file from a trusted location -- pickle.load
# can execute arbitrary code embedded in the file.
with open("resume_rag.pkl", "wb") as f:
    pickle.dump(resume_data, f)


In [15]:
import json

# JSON export of the index. Only the chunks and their embeddings are written
# (raw_text and clean_text are left out). The embeddings array is converted
# with .tolist() because json cannot serialize a NumPy array directly.
data_to_save = {
    "chunks": resume_data["chunks"],
    "embeddings": resume_data["embeddings"].tolist()
}

with open("resume_rag.json", "w") as f:
    json.dump(data_to_save, f)
