In [1]:
# Cell 1: Imports and Setup

import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

from utils.preprocess import prepare_document
from utils.universal_parser import parse_cv_text, extract_requirements
from utils.matcher import rule_based_score
from utils.decision import make_decision, save_results_to_csv
from utils.sentiment import classify_sentiment
from utils.embedding import compute_similarity
from utils.rl_agent import SimpleRLAgent

# Seaborn theme (applies to every matplotlib figure drawn below).
# `sns.set` is only an alias of `set_theme`; call the preferred name directly.
sns.set_theme(style="whitegrid")


In [2]:
# Cell 2: Load and preprocess CVs

cv_folder = "./data/sample_cvs"
# glob.glob() returns matches in arbitrary, filesystem-dependent order. Sort the
# list so that "CV n" refers to the same file on every run and every machine;
# later cells address CVs purely by their position in these lists.
cv_files = sorted(glob.glob(os.path.join(cv_folder, "*.pdf")))
print(f"Found {len(cv_files)} CV PDF files.")

# Extract and clean CV texts (one entry per file, in the same order as cv_files)
cv_texts = [prepare_document(f, remove_stopwords=True) for f in cv_files]

# Parse CVs for structured info, passing each file's base name along with its text
parsed_cvs = [parse_cv_text(os.path.basename(f), text) for f, text in zip(cv_files, cv_texts)]

print("Sample parsed CV data:")
parsed_cvs[0]


Found 18 CV PDF files.
Sample parsed CV data:


{'filename': 'cv1.pdf.pdf',
 'domain': 'IT',
 'degree': 'BTECH',
 'skills': ['data analysis',
  'deep learning',
  'financial analysis',
  'machine learning',
  'numpy',
  'pandas',
  'python',
  'tensorflow'],
 'experience': 0,
 'text': 'talha usmani talhausmanigmailcom mumbai hehim profile computer science engineering graduate strong foundation programming software development proficient python practical experience gained internships academic projects passionate problemsolving building realworld tech solutions keen interest emerging technologies like machine learning actively seeking opportunities contribute grow dynamic innovationdriven environment professional experience – devskillhub python programming intern completed handson internship focused python programming developed practical coding skills working realworld projects solving algorithmic challenges gained experience core python concepts data structures object oriented programming file handling libraries like numpy pandas col

In [3]:
# Cell 3: Load and preprocess JDs

jd_folder = "./data/sample_jds"
# glob.glob() gives no ordering guarantee. Sort so that a given jd_index always
# selects the same job description across runs and machines.
jd_files = sorted(glob.glob(os.path.join(jd_folder, "*.txt")))
print(f"Found {len(jd_files)} JD text files.")

# Clean each JD text, then extract its requirements (same order as jd_files)
jd_texts = [prepare_document(f, remove_stopwords=True) for f in jd_files]
parsed_jds = [extract_requirements(text) for text in jd_texts]

print("Sample parsed JD data:")
parsed_jds[0]


Found 3 JD text files.
Sample parsed JD data:


{'domain': 'IT',
 'degrees': ['BTECH', 'MTECH'],
 'skills': ['analysis proficiency sql database management familiarity cloud platforms aws',
  'deep learning',
  'engineer location bengaluru india experience',
  'feature',
  'frameworks required skills',
  'hyperparameter tuning',
  'java',
  'large datasets',
  'machine learning',
  'machine learning models',
  'models applications',
  'nlp',
  'numpy',
  'pandas',
  'position machine',
  'production',
  'python',
  'pytorch',
  'sql',
  'techniques collaborate software engineers',
  'tensorflow',
  'tensorflow pytorch',
  'understanding data structures algorithms experience nlp computer vision',
  'updated latest advancements',
  'years responsibilities design'],
 'text': 'position machine learning engineer location bengaluru india experience – years responsibilities design develop deploy machine learning models production work large datasets perform data cleaning preprocessing feature engineering optimize model performance using hyp

In [4]:
# Cell 4: Read the HR feedback file and classify the sentiment of each entry

feedback_file = "./data/feedbacks.txt"

# One feedback per non-blank line; surrounding whitespace is stripped.
feedbacks = []
with open(feedback_file, "r", encoding="utf-8") as handle:
    for raw_line in handle:
        entry = raw_line.strip()
        if entry:
            feedbacks.append(entry)
print(f"Loaded {len(feedbacks)} feedback entries.")

# Run the sentiment classifier over every feedback, preserving order
feedback_sentiments = list(map(classify_sentiment, feedbacks))
print("Sample sentiment results:")
feedback_sentiments[:5]


Loaded 20 feedback entries.
Sample sentiment results:


[('Neutral', 0.0),
 ('Positive', 0.49),
 ('Neutral', 0.13),
 ('Neutral', 0.08),
 ('Positive', 0.46)]

In [5]:
# Cell 5: Pick the job description that candidates are evaluated against

jd_index = 0  # Change to evaluate other JDs
jd_text, jd_req = jd_texts[jd_index], parsed_jds[jd_index]

# Summarise the selected JD; only the first 10 skills are shown to keep output short
skills_preview = jd_req['skills'][:10]
print(f"Selected JD {jd_index+1} domain: {jd_req['domain']}")
print(f"Required degrees: {jd_req['degrees']}")
print(f"Key skills: {skills_preview} ...")  # show first 10


Selected JD 1 domain: IT
Required degrees: ['BTECH', 'MTECH']
Key skills: ['analysis proficiency sql database management familiarity cloud platforms aws', 'deep learning', 'engineer location bengaluru india experience', 'feature', 'frameworks required skills', 'hyperparameter tuning', 'java', 'large datasets', 'machine learning', 'machine learning models'] ...


In [6]:
# Cell 6: Similarity of every CV to the selected JD, then rule-based adjustment

base_sim_scores = compute_similarity(cv_texts, jd_text)

# Adjust each base similarity using the CV text at the same position and the JD skills
adjusted_scores = []
for cv_pos, base_sim in enumerate(base_sim_scores):
    adjusted_scores.append(rule_based_score(base_sim, cv_texts[cv_pos], jd_req['skills']))

# Rank (file, score) pairs from highest to lowest score and keep the best five
ranked = sorted(zip(cv_files, adjusted_scores), key=lambda pair: pair[1], reverse=True)
top_scores = ranked[:5]
print("Top 5 candidates by adjusted similarity score:")
for cv_path, cv_score in top_scores:
    print(f"{os.path.basename(cv_path)}: {cv_score:.4f}")


Top 5 candidates by adjusted similarity score:
cv17 (14).pdf: 0.3668
cv1.pdf.pdf: 0.3616
cv15.pdf.pdf: 0.2974
cv14.pdf.pdf: 0.2954
cv12.pdf.pdf: 0.2941


In [7]:
# Cell 7: Build the RL agent and run the full decision pipeline

# Cut-offs handed to make_decision
similarity_threshold = 0.08
skill_match_threshold = 0.1

# The agent chooses among these decision labels
actions = ["Strong Hire", "Consider", "Needs Review", "Reject"]
rl_agent = SimpleRLAgent(actions)

# NOTE(review): cv_texts and feedbacks are passed as two separate lists; how
# make_decision pairs them is not visible in this notebook — confirm that their
# lengths and ordering line up.
decision_kwargs = dict(
    cv_texts=cv_texts,
    jd_text=jd_text,
    feedbacks=feedbacks,
    rl_agent=rl_agent,
    similarity_threshold=similarity_threshold,
    skill_match_threshold=skill_match_threshold,
)
results = make_decision(**decision_kwargs)
print("Rewards history:", rl_agent.get_reward_history())

# Tabulate the per-CV results and preview the first rows
results_df = pd.DataFrame(results)
results_df.head()


[CV 1] → Needs Review | Sim=16.2%, Sentiment=Neutral(0.00), DegreeMatch=True, Skills=24.0%, RL Action=Needs Review, RL Confidence=0%
[CV 2] → Reject | ❌ Similarity score below 0.08
[CV 3] → Reject | ❌ Similarity score below 0.08
[CV 4] → Strong Hire | Sim=9.4%, Sentiment=Neutral(0.08), DegreeMatch=True, Skills=20.0%, RL Action=Strong Hire, RL Confidence=0%
[CV 5] → Strong Hire | Sim=10.3%, Sentiment=Positive(0.46), DegreeMatch=True, Skills=12.0%, RL Action=Strong Hire, RL Confidence=0%
[CV 6] → Strong Hire | Sim=9.5%, Sentiment=Neutral(0.32), DegreeMatch=True, Skills=20.0%, RL Action=Strong Hire, RL Confidence=100.0%
[CV 7] → Strong Hire | Sim=9.7%, Sentiment=Negative(-0.42), DegreeMatch=True, Skills=20.0%, RL Action=Strong Hire, RL Confidence=0%
[CV 8] → Reject | ❌ Similarity score below 0.08
[CV 9] → Strong Hire | Sim=11.7%, Sentiment=Negative(-0.48), DegreeMatch=True, Skills=24.0%, RL Action=Strong Hire, RL Confidence=100.0%
[CV 10] → Reject | ❌ Similarity score below 0.08
[CV 11] →

Unnamed: 0,cv_index,similarity_score_%,skill_match_%,degree_match,match_score_%,sentiment_label,sentiment_score,rl_confidence_%,decision,explanation
0,1,16.2,24.0,True,46.7,Neutral,0.0,0.0,Needs Review,"Sim=16.2%, Sentiment=Neutral(0.00), DegreeMatc..."
1,2,7.5,20.0,True,42.5,Positive,0.49,0.0,Reject,❌ Similarity score below 0.08
2,3,7.3,16.0,True,41.1,Neutral,0.13,0.0,Reject,❌ Similarity score below 0.08
3,4,9.4,20.0,True,43.1,Neutral,0.08,0.0,Strong Hire,"Sim=9.4%, Sentiment=Neutral(0.08), DegreeMatch..."
4,5,10.3,12.0,True,40.8,Positive,0.46,0.0,Strong Hire,"Sim=10.3%, Sentiment=Positive(0.46), DegreeMat..."


In [9]:
# Cell 8: Save results to CSV and json
def save_results_to_json(results, filename):
    """Write `results` to `filename` as pretty-printed UTF-8 JSON.

    The payload is serialized in memory *before* the target file is opened, so
    a value that cannot be encoded raises without truncating or corrupting a
    file that already exists at `filename`.

    Args:
        results: Any JSON-serializable object (here, the decision results).
        filename: Path of the JSON file to create or overwrite.

    Raises:
        TypeError: If `results` contains a value `json` cannot encode.
        OSError: If the file cannot be opened or written.
    """
    # ensure_ascii=False keeps non-ASCII characters (e.g. "❌") readable in the file.
    payload = json.dumps(results, indent=4, ensure_ascii=False)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(payload)
# Persist the decisions in both formats, then report every file that was written
# (the message previously mentioned only the CSV).
save_results_to_csv(results, filename="final_decisions.csv")
save_results_to_json(results, "final_decisions.json")
print("Results saved to final_decisions.csv and final_decisions.json")


Results saved to final_decisions.csv


In [None]:
# Cell 9: Visualization - histogram (with KDE overlay) of candidate match scores

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df['match_score_%'], bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Candidate Match Scores (%)")
ax.set_xlabel("Match Score (%)")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Cell 10: Visualization - number of candidates per decision label

fig, ax = plt.subplots(figsize=(8, 5))
# `order=actions` fixes the bar order to the agent's action list
sns.countplot(data=results_df, x='decision', order=actions, ax=ax)
ax.set_title("Decision Counts by RL Agent")
ax.set_xlabel("Decision")
ax.set_ylabel("Number of Candidates")
plt.show()


In [None]:
# Show the rewards the RL agent has recorded so far
print(f"Rewards history: {rl_agent.get_reward_history()}")


In [None]:
# Cell 11: Visualization - RL Agent Reward History (simulated)

# Rewards as reported by the agent, plotted in the order returned
reward_history = rl_agent.get_reward_history()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(reward_history, marker='o', linestyle='-', color='teal')
ax.set_title("RL Agent Reward History Over Time")
ax.set_xlabel("Decision Step")
ax.set_ylabel("Reward")
ax.grid(True)
plt.show()


In [None]:
# Debug snippet: inspect similarity scores, extracted skills, and RL agent state

# 1. Raw similarity of every CV against the selected JD
sim_scores = compute_similarity(cv_texts, jd_text)
print("\nRaw similarity scores:")
for cv_number, raw_sim in enumerate(sim_scores, start=1):
    print(f"CV {cv_number}: {raw_sim:.4f}")

# 2. Skills parsed from the first (up to) five CVs, plus the JD's skill list
print("\nExtracted skills from first 5 CVs:")
for cv_number, cv_text in enumerate(cv_texts[:5], start=1):
    parsed = parse_cv_text(f"cv_{cv_number}", cv_text)
    print(f"CV {cv_number} skills: {parsed['skills']}")
print("JD required skills:", jd_req['skills'])

# 3. Hand-computed skill bonus: +0.05 for each JD skill found as a
#    case-insensitive substring of the CV text; the total is capped at 1.0
print("\nSkill bonuses and total scores:")
for cv_pos, cv_text in enumerate(cv_texts):
    bonus = sum(
        (0.05 for skill in jd_req['skills'] if skill.lower() in cv_text.lower()),
        0.0,
    )
    base = sim_scores[cv_pos]
    total = min(base + bonus, 1.0)
    print(f"CV {cv_pos+1}: base={base:.4f}, bonus={bonus:.4f}, total={total:.4f}")

# 4. Current contents of the RL agent's Q-table
print("\nRL agent Q-table snapshot:")
rl_agent.print_q_table()

# 5. Thresholds used by the decision cell
print(f"\nCurrent thresholds: similarity={similarity_threshold}, skill match={skill_match_threshold}")
