In [8]:
import pdfplumber

def extract_text_from_pdf(file_path: str) -> str:
    """
    Read a PDF with pdfplumber and return the text of every readable page.

    Pages are visited in order. A page whose processing raises is reported
    on stdout and left out; a page that yields no text is left out silently.
    Each kept page is followed by a newline, and the combined text is
    stripped of leading/trailing whitespace before being returned.
    """
    collected = []
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            try:
                page_text = page.extract_text()
                if page_text:
                    collected.append(page_text + "\n")
            except Exception as e:
                # Best effort: one unreadable page must not abort the whole document.
                print(f"⚠️ Skipping page {page_number} due to error: {e}")
    return "".join(collected).strip()


In [12]:
import os
from dotenv import load_dotenv
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage

# Load variables from a local .env file (python-dotenv) so MISTRAL_API_KEY
# can be read from the environment below.
load_dotenv()

# Chat client for the skill-extraction step. The OpenAI-compatible LangChain
# wrapper is pointed at Mistral's endpoint via openai_api_base.
# os.getenv returns None when MISTRAL_API_KEY is not set.
llm = ChatOpenAI(
    model="mistral-medium",
    openai_api_key=os.getenv("MISTRAL_API_KEY"),
    openai_api_base="https://api.mistral.ai/v1",
    temperature=0.2
)

# Prompt with a single placeholder, {jd_text}. It explicitly tells the model
# to infer skills that the job description does not spell out.
prompt_template = PromptTemplate(
    input_variables=["jd_text"],
    template="""
You are an HR assistant. Read the job description and **infer** the following:

1. Bullet list of REQUIRED skills  
2. Bullet list of PREFERRED (nice-to-have) skills  

Even if the skills aren't explicitly written, guess based on the responsibilities and context.

Job Description:
---
{jd_text}
---
"""
)


def extract_skills_from_jd(jd_text: str) -> str:
    """
    Ask the Mistral chat model (through LangChain) which skills a JD implies.

    Args:
        jd_text: Plain text of the job description.

    Returns:
        The model's reply exactly as received (``response.content``). The
        prompt asks for REQUIRED and PREFERRED bullet lists, but the reply is
        not parsed or validated here. An empty string is returned, without
        calling the model, when ``jd_text`` is empty or whitespace-only.
    """
    # extract_text_from_pdf returns "" for a PDF with no extractable text.
    # The prompt tells the model to guess, so sending an empty JD would only
    # produce invented skills; skip the API call instead.
    if not jd_text or not jd_text.strip():
        return ""

    prompt = prompt_template.format(jd_text=jd_text)
    messages = [
        SystemMessage(content="You are an HR assistant skilled in parsing job descriptions."),
        HumanMessage(content=prompt),
    ]
    response = llm.invoke(messages)
    return response.content


In [13]:
# Step 1: Pull the raw text out of the job description PDF
jd_text = extract_text_from_pdf("sample_data/jd.pdf")

# Step 2: Ask Mistral for the required/preferred skills
skills = extract_skills_from_jd(jd_text)

# Step 3: Show the heading followed by the extracted skills
print("🔍 Extracted Skills from JD:\n", skills, sep="\n")


🔍 Extracted Skills from JD:

### Required Skills:
- Proficiency in Python programming
- Understanding of machine learning concepts and model training
- Experience with data analysis and data pipelines
- Knowledge of cloud infrastructure and services
- Ability to write production-quality code
- Familiarity with machine learning frameworks (e.g., TensorFlow, PyTorch)
- Basic understanding of software development principles
- Strong problem-solving skills
- Good communication and teamwork abilities

### Preferred (Nice-to-Have) Skills:
- Experience with Git for version control
- Knowledge of Docker for containerization
- Familiarity with SQL databases
- Experience with automated deployment workflows
- Understanding of scalable ML pipeline development
- Previous experience collaborating with research scientists and backend developers
- Knowledge of modern machine learning frameworks and tools
- Experience in a similar internship or project-based role


In [14]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK "punkt" data package at cell execution time.
# NOTE(review): none of the visible cells call an NLTK tokenizer — confirm
# this download (and the nltk import) is still needed.
nltk.download('punkt')

def clean_text(text: str) -> str:
    """
    Normalise text for bag-of-words comparison.

    The input is lower-cased, every character other than a-z, 0-9 or
    whitespace becomes a space, and runs of whitespace collapse to a single
    space with the ends trimmed.
    """
    lowered = text.lower()
    symbols_blanked = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', symbols_blanked).strip()

def match_resume_to_jd(resume_text: str, jd_skills_text: str) -> float:
    """
    Score how closely a resume matches the JD skill text.

    Both texts are normalised with ``clean_text``, vectorised with a
    TF-IDF vectorizer fitted on just these two documents, and compared with
    cosine similarity.

    Args:
        resume_text: Raw resume text.
        jd_skills_text: Skill description text for the job.

    Returns:
        The cosine similarity rounded to two decimals, or ``0.0`` when either
        text is empty after cleaning.
    """
    resume_clean = clean_text(resume_text)
    jd_clean = clean_text(jd_skills_text)

    # Nothing to compare. Without this guard, two empty documents make
    # TfidfVectorizer raise ValueError (empty vocabulary) instead of scoring 0.
    if not resume_clean or not jd_clean:
        return 0.0

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([jd_clean, resume_clean])
    similarity = cosine_similarity(vectors[0], vectors[1])
    return round(float(similarity[0][0]), 2)


[nltk_data] Downloading package punkt to C:\Users\Varun
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Extract the candidate's resume text for scoring.
# NOTE(review): this reads "resumes/resume1.pdf", while a later cell reads
# "sample_data/resumes/resume1.pdf" — confirm both point at the intended file.
resume_text = extract_text_from_pdf("resumes/resume1.pdf")


In [20]:
# Compare the resume against the LLM-extracted skill list (`skills`),
# not against the raw job description text.
score = match_resume_to_jd(resume_text, skills)
print(f"🧠 Resume Match Score: {score}")


🧠 Resume Match Score: 0.25


In [34]:
# agents/screening_agent.py

import os
from dotenv import load_dotenv
from langchain.agents import AgentType, initialize_agent
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Load variables from a local .env file so MISTRAL_API_KEY is available below.
load_dotenv()

# Chat client for the feedback step (temperature 0.3).
# NOTE(review): when this is run as a notebook cell it rebinds the global
# name `llm`. extract_skills_from_jd looks up `llm` at call time, so after
# this cell it uses this client instead of the earlier temperature-0.2 one.
llm = ChatOpenAI(
    model="mistral-medium",
    openai_api_key=os.getenv("MISTRAL_API_KEY"),
    openai_api_base="https://api.mistral.ai/v1",
    temperature=0.3
)

# ⏬ Prompt Template
# Two placeholders: {jd_text} and {resume_text}. The closing line asks the
# model to keep its feedback to 1-2 sentences.
feedback_prompt = PromptTemplate.from_template("""
You are an expert HR recruiter.

Given the following job description and a candidate's resume, provide a concise and professional feedback.

Mention:
- Key strengths of the candidate relevant to the JD
- Any major skill gaps or missing experience
- How well the resume matches the JD overall

Job Description:
{jd_text}

Resume:
{resume_text}

Your feedback (1-2 sentences):
""")

# 🔁 LLM Chain
# The calling cell reads the generated feedback from the "text" key of the
# dict returned by feedback_chain.invoke(...).
feedback_chain = LLMChain(llm=llm, prompt=feedback_prompt)


In [36]:
from agents.screening_agent import feedback_chain

# Re-read both documents so this cell does not depend on earlier cells.
jd_text = extract_text_from_pdf("sample_data/jd.pdf")
resume_text = extract_text_from_pdf("sample_data/resumes/resume1.pdf")

# The JD is sent in full; only the first 2500 characters of the resume are sent.
chain_inputs = {"jd_text": jd_text, "resume_text": resume_text[:2500]}
feedback = feedback_chain.invoke(chain_inputs)["text"]

print("💬 Feedback:\n", feedback.strip())


💬 Feedback:
 **Feedback:**

Amit Sharma demonstrates strong alignment with the JD, showcasing relevant skills in Python, ML (scikit-learn), SQL, and AWS, along with hands-on experience in model training and deployment. However, deeper expertise in production-grade ML pipelines and advanced cloud infrastructure could strengthen his fit; overall, his resume matches ~80% of the requirements, making him a promising candidate with minor gaps to address.

*(Note: Adjust percentages or specifics based on your evaluation scale.)*


In [37]:
import os
import heapq
from typing import List, Tuple, Dict

# Placeholder implementations (replace with the real text extraction, scoring, and feedback functions)
def get_resume_text(file_path: str) -> str:
    """Placeholder extractor: returns a canned sentence naming the file; the file is never opened."""
    _, file_name = os.path.split(file_path)
    return f"Sample text extracted from {file_name}"

def get_score(jd_skills: str, resume_text: str) -> float:
    """
    Placeholder scorer based on shared words.

    Both strings are lower-cased and split on whitespace; the number of
    distinct words they share is divided by 10 and rounded to two decimals.
    The result is not capped, so more than ten shared words gives a value
    above 1.0.
    """
    skill_words = set(jd_skills.lower().split())
    resume_words = set(resume_text.lower().split())
    shared_words = skill_words.intersection(resume_words)
    return round(len(shared_words) / 10, 2)

def get_feedback(jd_text: str, resume_text: str) -> str:
    """
    Placeholder feedback generator.

    Args:
        jd_text: Job description text. Unused by this stub; kept so the
            signature matches how build_leaderboard calls it.
        resume_text: Resume text; its first 50 characters are echoed back.

    Returns:
        ``"Feedback for <first 50 chars of resume_text> (truncated)"``.
    """
    # resume_text is document text, not a path. Passing it through
    # os.path.basename dropped everything up to the last "/" in the snippet
    # (e.g. "CI/CD pipelines" became "CD pipelines").
    snippet = resume_text[:50]
    return f"Feedback for {snippet} (truncated)"

# Ranking logic: every resume is scored, then the entries are ordered by
# descending score, with name and then feedback as tie-breakers.
def build_leaderboard(jd_text: str, jd_skills: str, resume_files: List[str]) -> List[Tuple[str, float, str]]:
    """
    Rank resumes against a job description.

    For each path in ``resume_files`` the candidate name is the file name
    without its extension; the text, score and feedback come from
    ``get_resume_text``, ``get_score`` and ``get_feedback``.

    Returns:
        A list of ``(name, score, feedback)`` tuples, highest score first.
        Equal scores are ordered by name, then by feedback.
    """
    entries = []
    for file_path in resume_files:
        candidate, _extension = os.path.splitext(os.path.basename(file_path))
        text = get_resume_text(file_path)
        entries.append((candidate, get_score(jd_skills, text), get_feedback(jd_text, text)))

    # Negating the score sorts it in descending order while the string
    # tie-breakers stay ascending.
    entries.sort(key=lambda entry: (-entry[1], entry[0], entry[2]))
    return entries

# Test with dummy files (replace with actual resume file paths).
# The placeholder get_resume_text never opens these paths, so they need not exist.
sample_jd_text = "We are looking for Python, ML, SQL, communication"
sample_jd_skills = "Python ML SQL communication"
sample_resumes = ["resumes/amit_sharma.pdf", "resumes/neha_kumar.pdf", "resumes/john_doe.pdf"]

# Bare last expression so the notebook displays the returned leaderboard.
# With the placeholder helpers the canned resume text shares no word with
# sample_jd_skills, so every score is 0.0 and the order falls back to name.
build_leaderboard(sample_jd_text, sample_jd_skills, sample_resumes)


[('amit_sharma',
  0.0,
  'Feedback for Sample text extracted from amit_sharma.pdf (truncated)'),
 ('john_doe',
  0.0,
  'Feedback for Sample text extracted from john_doe.pdf (truncated)'),
 ('neha_kumar',
  0.0,
  'Feedback for Sample text extracted from neha_kumar.pdf (truncated)')]