In [1]:
import os
import tempfile
import typing as t
import math
import shutil
import re

from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the sentence-transformers checkpoint used for every embedding below.
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Single shared embedding client, configured for CPU. It is built once when this
# cell runs and reused by the FAISS index construction and by embed_text().
EMBEDDINGS = HuggingFaceEmbeddings(model_name=EMB_MODEL, model_kwargs={"device": "cpu"})

# Defaults handed to RecursiveCharacterTextSplitter by chunk_text().
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# Default number of documents requested from the retriever in retrieve_top_text().
RETRIEVE_K = 6

# In-memory store, keyed by JD name: { jd_name: { "best": {...}, "history": [ {...}, ... ] } }
# It is a plain module-level dict, so its contents live only as long as this process.
MEMORY_STORE: dict = {}

# FastAPI application object that the /evaluate route is registered on.
app = FastAPI(title="JD-Resume RAG Evaluator (in-memory LangServe-like)")

  EMBEDDINGS = HuggingFaceEmbeddings(model_name=EMB_MODEL, model_kwargs={"device": "cpu"})


In [3]:
def extract_pdf_text(path: str) -> str:
    """Return the text of the PDF at ``path`` with pages joined by newlines.

    Each page's ``extract_text()`` is called once, in page order. Pages that
    yield nothing (``None`` or an empty string) are left out of the result.
    """
    page_texts = (page.extract_text() for page in PdfReader(path).pages)
    return "\n".join(content for content in page_texts if content)

def chunk_text(text: str, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Split ``text`` with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter; the value returned is whatever its ``split_text`` produces.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

def create_rag_store_from_text(text: str):
    """Chunk ``text`` and index the chunks in a FAISS vector store.

    Returns a dict with two keys:
      - "vectorstore": the FAISS index built with the shared EMBEDDINGS client,
        or ``None`` when chunking produced nothing to index.
      - "chunks": the list of chunk strings (an empty list in the ``None`` case).
    """
    pieces = chunk_text(text)
    # Only build an index when there is at least one chunk to embed.
    index = FAISS.from_texts(texts=pieces, embedding=EMBEDDINGS) if pieces else None
    return {"vectorstore": index, "chunks": pieces or []}

def create_rag_store_from_pdf(path: str):
    """Build a RAG store from the PDF at ``path``.

    The PDF's text is extracted with ``extract_pdf_text`` and passed straight
    to ``create_rag_store_from_text``, whose result is returned as-is.
    """
    return create_rag_store_from_text(extract_pdf_text(path))

def retrieve_top_text(store: dict, query: str, k=RETRIEVE_K) -> str:
    """Return the page contents of the top ``k`` matches for ``query``, newline-joined.

    ``store`` is a dict holding a "vectorstore" entry. An empty string is
    returned when that entry is missing/``None`` or when the retriever comes
    back with no documents.
    """
    index = store.get("vectorstore")
    if index is None:
        return ""
    # The retriever is invoked as a runnable, hence .invoke() on it.
    hits = index.as_retriever(search_kwargs={"k": k}).invoke(query)
    if not hits:
        return ""
    return "\n".join([hit.page_content for hit in hits])

In [4]:
def cosine_sim(a: t.List[float], b: t.List[float]) -> float:
    """Cosine similarity of two vectors given as sequences of numbers.

    The dot product is taken over ``zip(a, b)``, so a longer vector is
    truncated for the dot product while its norm still uses every element.
    Returns 0.0 when either vector has zero norm (including empty input).
    """
    inner = sum(p * q for p, q in zip(a, b))
    norm_a = math.sqrt(sum(v * v for v in a))
    norm_b = math.sqrt(sum(v * v for v in b))
    # A zero-length vector has no direction; report "no similarity" instead of dividing by zero.
    return inner / (norm_a * norm_b) if norm_a != 0 and norm_b != 0 else 0.0

def embed_text(text: str):
    """Embed ``text`` as a single vector using the shared EMBEDDINGS client.

    Falsy input (empty string or ``None``) short-circuits to an empty list
    without calling the embedding model; otherwise the result of
    ``EMBEDDINGS.embed_query(text)`` is returned.
    """
    return EMBEDDINGS.embed_query(text) if text else []

def extract_keywords(text: str, top_n=20):
    """Return up to ``top_n`` of the most frequent keyword tokens in ``text``.

    Naive extraction: the text is lower-cased and split into runs of letters
    plus ``+ # . -`` (so "c++", ".net" and "node.js" survive as single tokens).
    Trailing ``.``/``-`` and leading ``-`` are then trimmed, so sentence
    punctuation does not create a second spelling of the same word
    ("python." vs "python") and pure punctuation runs ("---", "...") are not
    counted as keywords. Tokens shorter than three characters, tokens without
    any letter, and stopwords are ignored.

    Args:
        text: Source text; falsy input yields an empty list.
        top_n: Maximum number of keywords to return.

    Returns:
        Keywords ordered by descending frequency; ties keep first-seen order.
    """
    if not text:
        return []
    stop = {"the","and","for","with","that","this","from","your","you","have","are","will","his","her","our","a","an","to","in","on","of","by"}
    freq = {}
    for raw in re.findall(r"[A-Za-z+#\.\-]+", text.lower()):
        # Keep a leading "." (".net") but drop sentence/bullet punctuation around the word.
        tkn = raw.rstrip(".-").lstrip("-")
        if len(tkn) < 3 or tkn in stop:
            continue
        # Reject leftovers such as "+++" or "###" that contain no letter at all.
        if not re.search(r"[a-z]", tkn):
            continue
        freq[tkn] = freq.get(tkn, 0) + 1
    # sorted() is stable, so equal counts stay in first-seen order.
    ranked = sorted(freq.items(), key=lambda item: -item[1])
    return [word for word, _ in ranked[:top_n]]

def skill_match_score(jd_text: str, resume_text: str) -> float:
    """Fraction of the JD's keywords that also appear among the resume's keywords.

    Up to 40 keywords are taken from the JD and up to 80 from the resume via
    ``extract_keywords``. The result is ``|JD ∩ resume| / |JD|``, or 0.0 when
    the JD yields no keywords.
    """
    wanted = set(extract_keywords(jd_text, top_n=40))
    offered = set(extract_keywords(resume_text, top_n=80))
    if len(wanted) == 0:
        return 0.0
    shared = wanted & offered
    return float(len(shared)) / float(len(wanted))

def experience_score_from_resume(resume_text: str) -> float:
    """Map the experience hinted at in ``resume_text`` to a score in [0, 1].

    Crude heuristic, evaluated in this order:

    1. The first "N years" / "N year" / "N+ years" mention (N is a standalone
       one- or two-digit number) is bucketed:
       >= 10 -> 1.0, >= 5 -> 0.8, >= 2 -> 0.6, otherwise 0.4.
    2. Otherwise a seniority word decides: "senior" -> 0.95,
       "mid-level"/"mid level"/"midlevel" -> 0.75, "junior"/"entry" -> 0.35.
    3. Otherwise the unknown-baseline 0.5 is returned.

    Args:
        resume_text: Resume text; empty or ``None`` input returns the baseline.

    Returns:
        The heuristic experience score.
    """
    if not resume_text:
        return 0.5  # nothing to inspect -> unknown baseline
    # (?<!\d) keeps "120 years" from being read as "20 years";
    # \+? accepts the common "5+ years" phrasing.
    m = re.search(r"(?<!\d)(\d{1,2})\+?\s+years?", resume_text.lower())
    if m:
        years = int(m.group(1))
        if years >= 10:
            return 1.0
        if years >= 5:
            return 0.8
        if years >= 2:
            return 0.6
        return 0.4
    # fallback: look for "senior|mid|junior"
    if re.search(r"\bsenior\b", resume_text, re.I):
        return 0.95
    if re.search(r"\bmid[- ]?level\b|\bmidlevel\b", resume_text, re.I):
        return 0.75
    if re.search(r"\bjunior\b|\bentry\b", resume_text, re.I):
        return 0.35
    return 0.5  # unknown baseline

def compute_suitability(jd_store: dict, resume_store: dict) -> dict:
    """Score how well a resume store matches a JD store.

    The scoring context is the newline-joined *first* 10 JD chunks and first
    20 resume chunks (taken in stored order, not retrieved by similarity).
    Three sub-scores are computed from those contexts and blended with fixed
    weights: semantic similarity 40%, keyword overlap 40%, experience 20%.

    Returns a dict with "semantic_similarity", "skill_overlap" and
    "experience_score" (each rounded to 4 places) plus "suitability_score",
    the weighted blend multiplied by 100 and rounded to 2 places.
    """
    jd_chunks = jd_store.get("chunks")
    resume_chunks = resume_store.get("chunks")
    jd_context = "\n".join(jd_chunks[:10]) if jd_chunks else ""
    resume_context = "\n".join(resume_chunks[:20]) if resume_chunks else ""

    # Semantic similarity between the two embedded contexts; 0.0 if either is missing.
    jd_vec = embed_text(jd_context) or []
    resume_vec = embed_text(resume_context) or []
    if jd_vec and resume_vec:
        sem_sim = cosine_sim(jd_vec, resume_vec)
    else:
        sem_sim = 0.0

    # Keyword overlap and the years/seniority heuristic.
    skill_score = skill_match_score(jd_context, resume_context)
    exp_score = experience_score_from_resume(resume_context)

    # Weighted blend (tweakable), then scaled to a 0..100-style figure for readability.
    blended = 0.4 * sem_sim + 0.4 * skill_score + 0.2 * exp_score
    report = {
        "semantic_similarity": round(sem_sim, 4),
        "skill_overlap": round(skill_score, 4),
        "experience_score": round(exp_score, 4),
    }
    report["suitability_score"] = round(float(blended) * 100, 2)
    return report

In [5]:
def _safe_upload_name(upload_name, fallback: str) -> str:
    """Return only the final path component of a client-supplied filename.

    Backslashes are treated as separators too, so the result never carries a
    directory part; ``fallback`` is used when nothing usable remains.
    """
    leaf = os.path.basename((upload_name or "").replace("\\", "/"))
    return leaf or fallback


def _history_entry(metrics: dict, resume_path: str) -> dict:
    """Build the per-evaluation record appended to a JD's "history" list."""
    return {
        "suitability_score": metrics["suitability_score"],
        "resume_file": resume_path,
        "metrics": metrics,
    }


def _best_entry(metrics: dict, jd_path: str, resume_path: str) -> dict:
    """Build the record stored under a JD's "best" key."""
    return {
        "suitability_score": metrics["suitability_score"],
        "jd_file": jd_path,
        "resume_file": resume_path,
        "metrics": metrics,
    }


@app.post("/evaluate")
async def evaluate_jd_resume(
    jd_name: str = Form(...),
    jd_file: UploadFile = File(...),
    resume_file: UploadFile = File(...),
    replace_if_better: bool = Form(True)
):
    """
    Score a resume against a job description and compare it with the best
    resume remembered for that JD.

    Accepts multipart form:
      - jd_name: unique identifier for JD (string)
      - jd_file: PDF for job description
      - resume_file: PDF for candidate resume
      - replace_if_better: if true, store resume as best only if its score is better.

    Returns JSON with:
      - jd_name, metrics (output of compute_suitability)
      - verdict: "stored_as_best" (first resume for this JD),
        "better_than_previous", "worse_than_previous" or "same_as_previous"
      - replaced_best: whether this resume is now the stored best
      - previous_best: the best record that existed before this call (or null)
      - memory_summary: current best score and number of history entries

    Every evaluation is appended to the JD's history, whatever the verdict.
    Uploaded files are kept on disk only while they belong to the stored best;
    all other temporary directories are removed.

    Raises:
        HTTPException(500): any failure while saving, parsing or scoring.

    NOTE(review): PDF parsing and embedding are called synchronously inside
    this coroutine; consider offloading them if request concurrency matters.
    """
    tmp_dir = tempfile.mkdtemp(prefix="rag_evaluate_")
    keep_tmp_dir = False  # flipped once this request's files become the stored best
    try:
        # Filenames come from the client: keep only the leaf so a name such as
        # "../x.pdf" or "a/b.pdf" can neither leave tmp_dir nor break open().
        jd_path = os.path.join(
            tmp_dir, "jd_" + _safe_upload_name(jd_file.filename, "jd.pdf")
        )
        resume_path = os.path.join(
            tmp_dir, "resume_" + _safe_upload_name(resume_file.filename, "resume.pdf")
        )

        with open(jd_path, "wb") as f:
            f.write(await jd_file.read())
        with open(resume_path, "wb") as f:
            f.write(await resume_file.read())

        # Build RAG stores and score the pair
        jd_store = create_rag_store_from_pdf(jd_path)
        resume_store = create_rag_store_from_pdf(resume_path)
        metrics = compute_suitability(jd_store, resume_store)
        score = metrics["suitability_score"]

        # Comparison with memory
        record = MEMORY_STORE.get(jd_name)
        replaced = False
        previous = None
        stale_dir = None  # temp directory of a best entry that has just been superseded

        if record is None:
            # first one for this JD -> store as best
            record = {
                "best": _best_entry(metrics, jd_path, resume_path),
                "history": [_history_entry(metrics, resume_path)],
            }
            MEMORY_STORE[jd_name] = record
            verdict = "stored_as_best"
            replaced = True
        else:
            previous = record["best"]
            prev_best = previous["suitability_score"]
            # History records every evaluation, including a better score that
            # the caller chose not to promote.
            record["history"].append(_history_entry(metrics, resume_path))
            if score > prev_best:
                verdict = "better_than_previous"
                if replace_if_better:
                    record["best"] = _best_entry(metrics, jd_path, resume_path)
                    replaced = True
                    stale_dir = os.path.dirname(previous.get("resume_file") or "")
            elif score < prev_best:
                verdict = "worse_than_previous"
            else:
                verdict = "same_as_previous"

        keep_tmp_dir = replaced

        # The superseded best's files are no longer referenced as "best": drop
        # them, but only if the path really is one of our own temp directories.
        if (
            stale_dir
            and stale_dir != tmp_dir
            and os.path.basename(stale_dir).startswith("rag_evaluate_")
        ):
            shutil.rmtree(stale_dir, ignore_errors=True)

        # Build response
        resp = {
            "jd_name": jd_name,
            "metrics": metrics,
            "verdict": verdict,
            "replaced_best": replaced,
            "previous_best": previous,
            "memory_summary": {
                "best_score": record["best"]["suitability_score"],
                "history_count": len(record["history"]),
            },
        }

        return JSONResponse(resp)

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Keep tmp_dir only when its files were stored as the current best, so
        # the paths saved in MEMORY_STORE stay valid; otherwise clean up
        # (best effort - a failed removal must not mask the real outcome).
        if not keep_tmp_dir:
            shutil.rmtree(tmp_dir, ignore_errors=True)