In [26]:
# !unzip Resume-markdown-docling-zip.zip

Archive:  Resume-markdown-docling-zip.zip
replace Resume-markdown-docling/(Shivam Dubey)-Resume - Shivam Shailendra Dubey.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

### Installations

In [1]:
# !pip install docling
# !pip install -U sentence-transformers
# !pip install rank-bm25
# !pip install faiss-cpu
# !pip install langchain
# !pip install openai
# !pip install -U sentence-transformers

### Imports

In [3]:
import os
import json
from pathlib import Path
from typing import Dict, List, Optional, Any
import hashlib
import statistics

from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from sentence_transformers import SentenceTransformer
import pickle
import re
from typing import List, Dict, Any
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document

import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity



import openai

from openai import OpenAI



# Constants
RESUME_PDF_PATH = "Submit your resume or CV (File responses)"
RESUME_MARKDOWN_PATH = "Resume-markdown-docling"
MAX_RESUMES = 200

### PDF to Markdown Conversion using docling and saving it

In [5]:
# @title
# # Basic PDF to Markdown conversion
# from pathlib import Path

# # Setup paths
# pdf_dir = Path("Submit your resume or CV (File responses)")
# output_dir = Path("Resume-markdown-docling")
# output_dir.mkdir(exist_ok=True)

# # Get first 200 PDFs
# pdf_files = list(pdf_dir.glob("*.pdf"))[:200]
# print(f"Found {len(pdf_files)} PDFs to convert")

# # Convert PDFs
# converter = DocumentConverter()
# successful = 0

# for pdf_file in pdf_files:
#     try:
#         result = converter.convert(str(pdf_file))
#         markdown_content = result.document.export_to_markdown()

#         output_file = output_dir / f"{pdf_file.stem}.md"
#         output_file.write_text(markdown_content, encoding='utf-8')

#         successful += 1
#         if successful % 20 == 0:
#             print(f"Converted {successful} files")

#     except Exception as e:
#         print(f"Failed: {pdf_file.name}")

# print(f"Conversion complete: {successful}/{len(pdf_files)} successful")

### Chunking the markdowns based on custom rules
=> 3+ separated capital letters to be joined => due to conversion between pdf and markdown

=> splitting based on common headers in resume such as experience , summary etc using regex + it needs to be after ## => thats how docling does it


In [4]:

markdown_dir=Path(r"C:\Users\psura\Repositories\ResumeRAG\rag-resume-screening\data\processed\markdown\Resume-markdown-docling")
def fix_spaced_caps(s: str) -> str:
    pattern = re.compile(r'(?<!\w)(?:[A-Z]\s+){2,}[A-Z](?!\w)')  # 3+ capital letters separated by spaces
    def _join(m):
        return m.group(0).replace(' ', '')
    return pattern.sub(_join, s)

def container_chunking(content: str, resume_id: str) -> List[Document]:
    # Remove image tags
    content = re.sub(r'<!-- image -->', '', content)
    #Removing spacing S K I L L -> SKILL
    # Fix spaced out words like "e x p e r i e n c e" or "E X P E R I E N C E"
    # content = re.sub(r'\b(\w)\s+(\w)\s+(\w)(\s+\w)*\b', lambda m: re.sub(r'\s+', '', m.group()), content)
    content = fix_spaced_caps(content)

    containers = [
        r'about\s*me', r'summary', r'profile', r'experience', r'work\s+experience',
        r'education', r'skill[s]?', r'project[s]?', r'achievement[s]?', r'award[s]?',
        r'publication[s]?', r'competition[s]?', r'hackathon[s]?', r'certification[s]?',
    ]
    container_alt = '|'.join(containers)

    # Anchor to line start, any H1–H6, match only the heading line
    pattern = rf'(?=^#{{1,6}}\s*(?:.*\b(?:{container_alt})\b).*$)'
    chunks = re.split(pattern, content, flags=re.IGNORECASE | re.MULTILINE)

    # Filter empty chunks and create documents
    docs = []
    for i, chunk in enumerate(chunks):
        chunk = chunk.strip()
        if chunk and len(chunk) > 50:
            doc = Document(
                page_content=chunk,
                metadata={'resume_id': resume_id, 'chunk_id': i}
            )
            docs.append(doc)

    return docs

# Process all resumes
all_chunks = []
for md_file in markdown_dir.glob("*.md"):
    content = md_file.read_text(encoding='utf-8')
    resume_id = md_file.stem
    chunks = container_chunking(content, resume_id)
    all_chunks.extend(chunks)

print(f"Created {len(all_chunks)} chunks from {len(list(markdown_dir.glob('*.md')))} resumes")
print(f"Average chunks per resume: {len(all_chunks) / len(list(markdown_dir.glob('*.md'))):.1f}")

Created 263 chunks from 52 resumes
Average chunks per resume: 5.1


Embedding Models

Creating embeddings of each chunk in the document(resume) list and storing them in a pickle file (so don't have to keep running it again)
PS only works on colab due to version differences. generate on vscode separately

In [6]:

#CREATING OPEN AI EMBEDDINGS FOR ALL CHUNKS
# load
# os.environ["OPENAI_API_KEY"] = (
#     userdata.get("OPENAI_API_KEY") or userdata.get("openai_api_key")
# )
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env file")

print("✅ API key loaded successfully")
client=OpenAI()
def get_openai_embeddings(texts: list[str], model: str = "text-embedding-3-small", batch_size: int = 100) -> np.ndarray:
    embeddings: list[np.ndarray] = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch = texts[i : i + batch_size]
        resp = client.embeddings.create(model=model, input=batch)
        embeddings.extend([np.array(item.embedding, dtype=np.float32) for item in resp.data])
    return np.vstack(embeddings) if embeddings else np.zeros((0, 0), dtype=np.float32)

# prepare chunks (prefer in-memory all_chunks, else load saved chunks)
try:
    chunks = all_chunks  # type: ignore[name-defined]
except NameError:
    if os.path.exists("resume_chunks.pkl"):
        with open("resume_chunks.pkl", "rb") as f:
            chunks = pickle.load(f)
    else:
        raise RuntimeError("No in-memory chunks and resume_chunks.pkl not found.")

chunk_texts = [doc.page_content for doc in chunks]
print(f"Generating OpenAI embeddings for {len(chunk_texts)} chunks...")

# choose "text-embedding-3-small" or "text-embedding-3-large"
embeddings = get_openai_embeddings(chunk_texts, model="text-embedding-3-small", batch_size=100)

# ====================================================================================================================
# SAVING THE EMBEDDINGS OF CHUNKS AND STORING IN PICKLE FILE
with open("resume_chunks_openai.pkl", "wb") as f:
    pickle.dump(chunks, f)

with open("resume_embeddings_openai.pkl", "wb") as f:
    pickle.dump(embeddings, f)

print(f"Saved {len(chunks)} chunks and embeddings -> resume_chunks_openai.pkl, resume_embeddings_openai.pkl")
print(f"Embeddings shape: {embeddings.shape}")






✅ API key loaded successfully
Generating OpenAI embeddings for 263 chunks...


Embedding batches:   0%|          | 0/3 [00:00<?, ?it/s]2025-10-22 16:01:14,484 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding batches:  33%|███▎      | 1/3 [00:02<00:05,  2.76s/it]2025-10-22 16:01:16,474 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding batches:  67%|██████▋   | 2/3 [00:04<00:02,  2.15s/it]2025-10-22 16:01:17,975 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding batches: 100%|██████████| 3/3 [00:05<00:00,  2.00s/it]

Saved 263 chunks and embeddings -> resume_chunks_openai.pkl, resume_embeddings_openai.pkl
Embeddings shape: (263, 1536)





Trying Hybrid Retrieval


For resume based querying, just semantic search doesn't cut it, a lot of the times, the recruiters are looking for some keywords, with semantic similarity it can't match exact keywords hence we need to combine approaches; keyword matching +semantic similarity

In [7]:
 # Load saved chunks and embeddings
with open('resume_chunks_openai.pkl', 'rb') as f:
    chunks = pickle.load(f)

with open('resume_embeddings_openai.pkl', 'rb') as f:
    embeddings = pickle.load(f)

### BM25 scoring

In [None]:

_CLEAN_RX = re.compile(r'[^\w\s]')
_WS_RX = re.compile(r'\s+')

def clean_and_tokenize(text: str) -> List[str]:
    text = text.lower()
    text = _CLEAN_RX.sub(' ', text)     # remove punctuation
    text = _WS_RX.sub(' ', text).strip()
    return [t for t in text.split(' ') if t]

class BM25Index:
    def __init__(self):
        self.bm25 = None
        self.docs: List[Document] = []
        self.doc_tokens: List[List[str]] = []

    def fit(self, chunks: List[Document]):
        self.docs = chunks
        self.doc_tokens = [clean_and_tokenize(d.page_content) for d in chunks]
        self.bm25 = BM25Okapi(self.doc_tokens)

    def search(self, query: str, top_k: int = 200) -> List[Dict[str, Any]]:
        if self.bm25 is None:
            raise RuntimeError("Call fit(chunks) before search().")
        q_tokens = clean_and_tokenize(query)
        scores = self.bm25.get_scores(q_tokens)
        # top-k indices by score
        top_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
        out = []
        for rank, i in enumerate(top_idx, 1):
            d = self.docs[i]
            out.append({
                "rank": rank,
                "bm25_score": float(scores[i]),
                "resume_id": (d.metadata or {}).get("resume_id"),
                "chunk_id": (d.metadata or {}).get("chunk_id"),
                "preview": d.page_content[:400]
            })
        return out
bm25 = BM25Index()

In [None]:
#checking the retrieval

bm25.fit(chunks)  # chunks = your List[Document] from the chunker

query = "Urban Water Logging Detection for Timely Intervention and Mitigation"
hits = bm25.search(query, top_k=200)

print(f"BM25 hits: {len(hits)}")
for h in hits[:5]:
    print(f"[{h['rank']:>2}] {h['bm25_score']:.3f}  resume={h['resume_id']}  chunk={h['chunk_id']}")
    print("   ", (h["preview"] or "").splitlines()[0][:540], "...")


BM25 hits: 200
[ 1] 30.712  resume=Resume Mollika - Mollika Garg  chunk=3
    ## SELECTED PROJECTS ...
[ 2] 16.043  resume=Resume Mollika - Mollika Garg  chunk=4
    ## ACHIEVEMENTS ...
[ 3] 10.347  resume=Resume.PunithHM - Punith H M  chunk=2
    ## Projects ...
[ 4] 9.850  resume=Zulfikar_resume - Zulfikar Charoliya  chunk=2
    ## PROJECTS ...
[ 5] 5.875  resume=Basit_s AI Resume - Basit Ali  chunk=2
    ## PERSONAL PROJECTS ...


### Dense Retrieval

In [None]:
#using openai dense embeddings for retrieval
os.environ["OPENAI_API_KEY"] = (
    userdata.get("OPENAI_API_KEY") or userdata.get("openai_api_key")
)
client=OpenAI()


class DenseIndexSimple:
    def __init__(self):
        self.docs: List[Document] = []
        self.Xn: np.ndarray | None = None  # L2-normalized embeddings (N, D)
        self.dim: int | None = None
        self.model_name: str | None = None

    def fit(self, chunks: List[Document], embeddings: np.ndarray, model_name: str):
        """
        chunks: your List[Document]
        embeddings: np.ndarray of shape (N, D) aligned with chunks
        model_name: the embedding model used (e.g., 'text-embedding-3-small')
        """
        if embeddings.ndim != 2 or len(chunks) != embeddings.shape[0]:
            raise ValueError("Embeddings must be 2D and aligned with chunks.")
        X = embeddings.astype(np.float32)
        X = np.ascontiguousarray(X)
        norms = np.linalg.norm(X, axis=1, keepdims=True) + 1e-12
        self.Xn = X / norms
        self.dim = self.Xn.shape[1]
        self.docs = chunks
        self.model_name = model_name

    def _embed_query(self, query: str) -> np.ndarray:
        if not self.model_name:
            raise RuntimeError("Index not initialized with a model_name. Call fit() first.")
        resp = client.embeddings.create(model=self.model_name, input=[query])
        q = np.array(resp.data[0].embedding, dtype=np.float32)
        if q.shape[0] != self.dim:
            raise ValueError(f"Query dim {q.shape[0]} != index dim {self.dim}. "
                             f"Use the same embedding model as indexing.")
        q = q / (np.linalg.norm(q) + 1e-12)
        return q  # (D,)

    def search(self, query: str, top_k: int = 200) -> list[dict]:
        if self.Xn is None:
            raise RuntimeError("Index empty. Call fit() first.")

        # embed + normalize query
        q = self._embed_query(query)                 # (D,)
        sims = self.Xn @ q                           # (N,)
        n = sims.shape[0]
        k = max(1, min(top_k, n))                    # clamp to [1, N]

        # For small N, argsort is fine and simpler
        top_idx = np.argsort(sims)[::-1][:k]

        results = []
        for rank, i in enumerate(top_idx, 1):
            d = self.docs[i]
            results.append({
                "rank": rank,
                "dense_score": float(sims[i]),
                "resume_id": (d.metadata or {}).get("resume_id"),
                "chunk_id": (d.metadata or {}).get("chunk_id"),
                "preview": d.page_content[:400],
            })
        return results



In [None]:
#checking dense retrieval 
dense = DenseIndexSimple()
dense.fit(chunks, embeddings, model_name="text-embedding-3-small")

query = "mumbai university"
dense_hits = dense.search(query, top_k=200)

print(f"Dense hits: {len(dense_hits)}")
for h in dense_hits[:5]:
    print(f"[{h['rank']:>2}] {h['dense_score']:.4f} resume={h['resume_id']} chunk={h['chunk_id']}")


Dense hits: 200
[ 1] 0.4335 resume=VES_Ganesh_Deulkar_Resume - Ganesh Deulkar chunk=1
[ 2] 0.3864 resume=SHAHNAWAZ_RESUME (4) - shahnawaz Shaikh chunk=2
[ 3] 0.3800 resume=Saaquib Motiwala Resume-6 (1) - saaquib motiwala chunk=3
[ 4] 0.3674 resume=Mohit_CV - Mohit Lohani chunk=1
[ 5] 0.3562 resume=Ankit Patil Resume - Ankit Patil chunk=1


### Combining BM25 and Dense retrieval using RRF

In [10]:
import hashlib
from typing import List, Dict, Any, Optional

def _make_key(hit: Dict[str, Any]) -> str:
    """Stable key to identify a chunk across lists."""
    rid = (hit.get("resume_id") or "").strip()
    cid = str(hit.get("chunk_id") or "").strip()
    if rid or cid:
        return f"{rid}::{cid}"
    # Fallback: hash preview if metadata missing
    prev = (hit.get("preview") or "")[:256]
    return "hash::" + hashlib.md5(prev.encode("utf-8")).hexdigest()

def rrf_fuse(
    bm25_hits: Optional[List[Dict[str, Any]]],
    dense_hits: Optional[List[Dict[str, Any]]],
    k: int = 60,
    top_k: int = 180,
    weights: Dict[str, float] = None,
) -> List[Dict[str, Any]]:
    """
    Reciprocal Rank Fusion (RRF): score = sum_s w_s * 1/(k + rank_s)
    - bm25_hits / dense_hits: lists with at least {'rank', 'resume_id', 'chunk_id'} (or 'preview').
    - k: stabilization constant (common choices: 60, 100).
    - top_k: number of fused results to return.
    - weights: optional per-source weights, e.g., {'bm25': 1.0, 'dense': 1.0}
    """
    weights = weights or {"bm25": 1.0, "dense": 1.0}
    pool: Dict[str, Dict[str, Any]] = {}

    def add_source(hits: Optional[List[Dict[str, Any]]], label: str):
        if not hits:
            return
        for h in hits:
            key = _make_key(h)
            rec = pool.setdefault(key, {
                "resume_id": h.get("resume_id"),
                "chunk_id": h.get("chunk_id"),
                "preview": h.get("preview"),
                # keep original per-source info if present
                "bm25_rank": None, "bm25_score": None,
                "dense_rank": None, "dense_score": None,
                "rrf_score": 0.0,
            })
            r = h.get("rank")
            if isinstance(r, int) and r >= 1:
                rec["rrf_score"] += weights.get(label, 1.0) * (1.0 / (k + r))
            # stash per-source details (first occurrence wins)
            rank_key = f"{label}_rank"
            score_key = f"{label}_score"
            if rec[rank_key] is None:
                rec[rank_key] = r
            if rec[score_key] is None:
                # hit may have 'bm25_score' or 'dense_score'
                val = h.get(score_key) or h.get("bm25_score") or h.get("dense_score")
                rec[score_key] = float(val) if val is not None else None

    add_source(bm25_hits, "bm25")
    add_source(dense_hits, "dense")

    fused = sorted(pool.values(), key=lambda x: x["rrf_score"], reverse=True)[:top_k]
    # add final rank
    for i, rec in enumerate(fused, 1):
        rec["rank"] = i
    return fused


In [None]:
# Checking RRF fusion
query="Urban Water Logging Detection for Timely Intervention and Mitigation"
bm25_hits = bm25.search(query, top_k=200)
dense_hits = dense.search(query, top_k=200)

fused = rrf_fuse(bm25_hits, dense_hits, k=60, top_k=180, weights={"bm25": 2.0, "dense": 1.0})
for r in fused[:10]:
    print(f"[{r['rank']:>2}] RRF={r['rrf_score']:.5f}  bm25_r={r['bm25_rank']}  dense_r={r['dense_rank']}")
    print("   ", (r["preview"] or "").splitlines()[0][:120], "...")


[ 1] RRF=0.04918  bm25_r=1  dense_r=1
    ## SELECTED PROJECTS ...
[ 2] RRF=0.04788  bm25_r=3  dense_r=2
    ## Projects ...
[ 3] RRF=0.04572  bm25_r=7  dense_r=3
    ## (Google Earth Engine, EDA, LSTM, Project Management)                                                                  ...
[ 4] RRF=0.04559  bm25_r=2  dense_r=15
    ## ACHIEVEMENTS ...
[ 5] RRF=0.04505  bm25_r=5  dense_r=10
    ## PERSONAL PROJECTS ...
[ 6] RRF=0.04501  bm25_r=6  dense_r=8
    ## Professional Experience ...
[ 7] RRF=0.04340  bm25_r=12  dense_r=4
    ## COMPETITION/CONFERENCE ...
[ 8] RRF=0.04156  bm25_r=10  dense_r=17
    ## Projects ...
[ 9] RRF=0.04149  bm25_r=9  dense_r=20
    ## Experience ...
[10] RRF=0.04116  bm25_r=15  dense_r=9
    ## Publications ...


### Reranking using Cross Encoder

In [13]:
from sentence_transformers import CrossEncoder
import torch
with open('resume_chunks_openai.pkl', 'rb') as f:
    all_chunks = pickle.load(f)
# Load cross-encoder reranker
reranker = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

def rerank_with_cross_encoder(query: str, fused_results: List[Dict[str, Any]], all_chunks: List[Document], top_k: int = 180) -> List[Dict[str, Any]]:
    """Rerank fused results using cross-encoder for better relevance"""

    # Get full chunk content for each result
    pairs = []
    valid_results = []

    for r in fused_results:
        resume_id = r.get("resume_id")
        chunk_id = r.get("chunk_id")

        # Find matching chunk
        matching_chunk = next(
            (c for c in all_chunks if c.metadata.get("resume_id") == resume_id and c.metadata.get("chunk_id") == chunk_id),
            None
        )

        if matching_chunk:
            pairs.append((query, matching_chunk.page_content))
            valid_results.append(r)

    # Get cross-encoder scores
    ce_scores = reranker.predict(pairs).tolist()

    # Add cross-encoder scores to results
    for result, score in zip(valid_results, ce_scores):
        result["ce_score"] = float(score)

    # Sort by cross-encoder score
    reranked = sorted(valid_results, key=lambda x: x["ce_score"], reverse=True)[:top_k]

    # Update ranks
    for i, r in enumerate(reranked, 1):
        r["rerank_position"] = i

    return reranked



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [None]:
#Checking RERANKING USING CROSS ENCODER
query = "Candidates who are strictly ML people, no development"
bm25_hits = bm25.search(query, top_k=200)
dense_hits = dense.search(query, top_k=200)
fused = rrf_fuse(bm25_hits, dense_hits, k=60, top_k=180)

# Rerank top 50 results to get final top 10
reranked = rerank_with_cross_encoder(query, fused, all_chunks, top_k=180)
for r in reranked[:5]:
  print(r.get("resume_id", ""))
# print("RERANKED RESULTS:")
# for r in reranked:
#     print(f"[{r['rerank_position']:>2}] CE={r['ce_score']:.4f} RRF={r['rrf_score']:.5f} resume={r['resume_id']}")

Deekshith_Kuchana (1) - Deekshith kuchana
VES_Ganesh_Deulkar_Resume - Ganesh Deulkar
Harshit_Resume_one (1) - HARSHIT SRIVASTAVA
resume_intern-1 - Akshitha B
Dev Vrat Sharma - Resume - Dev Vrat Sharma


### Generating LLM Summary

In [None]:

from openai import OpenAI
from typing import List, Dict, Any

os.environ["OPENAI_API_KEY"] = (
    userdata.get("OPENAI_API_KEY") or userdata.get("openai_api_key")
)
client=OpenAI()

with open('resume_chunks_openai.pkl', 'rb') as f:
    all_chunks = pickle.load(f)

def gather_full_resumes(fused_results: List[Dict[str, Any]], all_chunks: List[Document]) -> Dict[str, str]:
    """Collect all chunks for each resume_id mentioned in fused results."""
    resume_ids = {r["resume_id"] for r in fused_results if r.get("resume_id")}

    resume_content = {}
    for rid in resume_ids:
        # Get all chunks for this resume
        resume_chunks = [c.page_content for c in all_chunks if c.metadata.get("resume_id") == rid]
        resume_content[rid] = "\n\n".join(resume_chunks)

    return resume_content

def summarize_resumes_with_llm(
    query: str,
    fused_results: List[Dict[str, Any]],
    all_chunks: List[Document],
    top_n: int = 5,
    max_resume_chars: int = 6000,
    max_context_chars: int = 2000
):
    """Generate summaries with both retrieved chunks and full resume context."""

    # Get top N unique resume IDs
    top_resume_ids = []
    resume_matched_chunks = {}  # Store which chunks matched for each resume

    for r in fused_results:
        rid = r.get("resume_id")
        if rid:
            # Track matched chunks for this resume
            if rid not in resume_matched_chunks:
                resume_matched_chunks[rid] = []
            resume_matched_chunks[rid].append(r)

            # Add to top list if not already there
            if rid not in top_resume_ids:
                top_resume_ids.append(rid)

            if len(top_resume_ids) >= top_n:
                break

    # Gather full resume content
    resume_content = gather_full_resumes(fused_results, all_chunks)

    summaries = []
    for rid in top_resume_ids:
        full_resume = resume_content.get(rid, "")
        matched_chunks = resume_matched_chunks.get(rid, [])

        # Get the actual text of matched chunks
        matched_texts = []
        for m in matched_chunks[:3]:  # Top 3 matched chunks
            chunk = next(
                (c for c in all_chunks
                 if c.metadata.get("resume_id") == rid
                 and c.metadata.get("chunk_id") == m.get("chunk_id")),
                None
            )
            if chunk:
                matched_texts.append(chunk.page_content)  # Change here to limit the size of chunks being added to the context

        # Build context string
        matched_context = "\n---\n".join(matched_texts) if matched_texts else "N/A"

        # Smart truncation: prioritize important sections
        resume_sections = split_resume_into_sections(full_resume)
        truncated_resume = smart_truncate_resume(resume_sections, max_resume_chars)

        prompt = f"""You are a recruiter assistant. Analyze this resume and provide:
1. A brief summary of the candidate's profile (2-3 sentences)
2. How this candidate matches the query: "{query}"
3. Key strengths relevant to the query

Resume ID: {rid}

MOST RELEVANT SECTIONS (from search):
{matched_context[:max_context_chars]}

FULL RESUME:
{truncated_resume}

Focus on the relevant sections above, but use the full resume for complete context.
Provide a concise response."""

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=400
        )

        summary = response.choices[0].message.content
        summaries.append({
            "resume_id": rid,
            "summary": summary,
            "rrf_score": next((r["rrf_score"] for r in fused_results if r.get("resume_id") == rid), None),
            "ce_score": next((r.get("ce_score") for r in fused_results if r.get("resume_id") == rid), None),
            "matched_sections": len(matched_chunks)
        })

    return summaries


def split_resume_into_sections(resume_text: str) -> Dict[str, str]:
    """Split resume into sections based on markdown headers."""
    sections = {}
    current_section = "header"
    current_content = []

    for line in resume_text.split('\n'):
        if line.startswith('##'):
            # Save previous section
            if current_content:
                sections[current_section] = '\n'.join(current_content)
            # Start new section
            current_section = line.strip('# ').lower()
            current_content = [line]
        else:
            current_content.append(line)

    # Save last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)

    return sections


def smart_truncate_resume(sections: Dict[str, str], max_chars: int) -> str:
    """Intelligently truncate resume, prioritizing important sections."""

    # Priority order for resume sections
    priority_sections = [
        'education', 'experience', 'work experience', 'skills',
        'projects', 'summary','publications', 'certifications','about',
        'achievements', 'awards'
    ]

    result = []
    current_length = 0

    # Add sections by priority
    for section_name in priority_sections:
        # Find matching section (case-insensitive partial match)
        for key, content in sections.items():
            if section_name in key.lower():
                section_length = len(content)
                if current_length + section_length <= max_chars:
                    result.append(content)
                    current_length += section_length
                elif current_length < max_chars:
                    # Add partial content
                    remaining = max_chars - current_length
                    result.append(content[:remaining] + "\n...[truncated]")
                    current_length = max_chars
                break

        if current_length >= max_chars:
            break

    # Add any remaining important sections not in priority list
    for key, content in sections.items():
        if current_length >= max_chars:
            break
        if key not in [s for s in priority_sections]:
            section_length = len(content)
            if current_length + section_length <= max_chars:
                result.append(content)
                current_length += section_length

    return '\n\n'.join(result)

In [81]:
# reranked

In [28]:
# Usage
query = "Find people comfortable with PyTorch for deep learning and who can explain model training and evaluation trade-offs."
summaries = summarize_resumes_with_llm(query, reranked, all_chunks, top_n=5)

for i, s in enumerate(summaries, 1):
    # print(f"\n{'='*60}")
    # print(f"RANK {i} | Resume ID: {s['resume_id']} | RRF Score: {s['rrf_score']:.5f}")
    # print(f"{'='*60}")

    print(s['summary'])

## Objective

Aspiring machine learning engineer with a strong foundation in applied mathematics, specializing in NLP, deep learning, and generative AI. Highly skilled in leveraging machine learning frameworks and data science tools to solve complex problems. Seeking opportunities to apply technical expertise and research-driven insights to innovative AI projects and contribute to impactful solutions.
## SUMMARY

As a future data scientist, I have a strong background in deep learning, machine learning, and data analytics in addition to real-world backend API development expertise. I have successfully created and put into use predictive models and user-friendly online apps that have successfully solved real-world issues. My excitement for investigating novel approaches and procedures in the area is fueled by my love for data-driven decision-making.

I'm looking for internships right now where I can use my technical expertise and add to creative initiatives. I can take on challenging tas