# Intelligent Resume Screening and Job Matching Analytics

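This notebook analyzes the resume screening system: it explores the resume and job datasets, compares skills and experience levels, evaluates the match model, and examines the predicted match scores.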

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import os
import sys

# Make the project code importable.
# Later cells import `src.ml.evaluate` and `src.ml.predict`; those names only
# resolve when the directory that *contains* `src` ('..') is on sys.path.
# '../src' is kept as well so anything that imports the modules by their
# top-level names keeps working.
# NOTE(review): both entries are relative to the kernel's working directory,
# which is assumed to be this notebook's folder - confirm.
for extra_path in ('..', '../src'):
    if extra_path not in sys.path:  # keeps re-runs of this cell idempotent
        sys.path.append(extra_path)

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Read the raw resume and job-description tables from CSV
resumes_df = pd.read_csv('../data/raw/resume_dataset.csv')
jobs_df = pd.read_csv('../data/raw/job_description_dataset.csv')

# Report each table's shape first, then each table's column names
shape_report = [
    ("Resume Dataset Shape:", resumes_df.shape),
    ("Job Dataset Shape:", jobs_df.shape),
]
column_report = [
    ("\nResume Dataset Columns:", list(resumes_df.columns)),
    ("Job Dataset Columns:", list(jobs_df.columns)),
]
for caption, value in shape_report + column_report:
    print(caption, value)

In [None]:
# Summary statistics (DataFrame.describe) for both tables, each under its own heading
for heading, frame in (("Resume Statistics:", resumes_df), ("\nJob Statistics:", jobs_df)):
    print(heading)
    print(frame.describe())

In [None]:
# Side-by-side bar charts of how often each experience level occurs
# in the resume table (left) and in the job table (right)
fig, (resume_ax, job_ax) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (resume_ax, resumes_df, 'Resume Experience Level Distribution'),
    (job_ax, jobs_df, 'Job Experience Level Distribution'),
)
for ax, frame, panel_title in panels:
    frame['experience_level'].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(panel_title)
    # Tilt the category labels so longer level names stay readable
    plt.setp(ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Analyze skills
from collections import Counter


def _flatten_skills(skill_column):
    """Flatten a column of comma-separated skill strings into one list of skills.

    Non-string cells (e.g. NaN for a missing value) are skipped. Each string is
    split on ',', whitespace around every token is stripped, and tokens that
    are empty after stripping (caused by trailing or doubled commas) are
    dropped so they are never counted as a blank "skill".

    Parameters
    ----------
    skill_column : iterable
        The cells of a skills column, e.g. ``resumes_df['skills']``.

    Returns
    -------
    list of str
        Every skill occurrence, in encounter order (duplicates kept).
    """
    return [
        token.strip()
        for cell in skill_column
        if isinstance(cell, str)
        for token in cell.split(',')
        if token.strip()
    ]


# Flatten skills from resumes
all_resume_skills = _flatten_skills(resumes_df['skills'])
resume_skill_counts = Counter(all_resume_skills)

# Flatten skills from jobs
all_job_skills = _flatten_skills(jobs_df['required_skills'])
job_skill_counts = Counter(all_job_skills)

# Bar charts of the ten most frequent skills: resumes on the left,
# job requirements on the right
top_resume_skills = dict(resume_skill_counts.most_common(10))
top_job_skills = dict(job_skill_counts.most_common(10))

fig, (resume_ax, job_ax) = plt.subplots(1, 2, figsize=(15, 6))

panels = (
    (resume_ax, top_resume_skills, 'Top 10 Skills in Resumes'),
    (job_ax, top_job_skills, 'Top 10 Required Skills in Jobs'),
)
for ax, skill_totals, panel_title in panels:
    ax.bar(skill_totals.keys(), skill_totals.values())
    ax.set_title(panel_title)
    # Rotate and right-align the skill names so they do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

fig.tight_layout()
plt.show()

In [None]:
# Load and analyze model performance
from src.ml.evaluate import load_model, evaluate_model
from sklearn.model_selection import train_test_split
import pandas as pd


def _skill_set(raw_skills):
    """Parse a comma-separated skill string into a set of stripped skill names.

    Non-string input (e.g. NaN for a missing cell) yields an empty set instead
    of raising. Splitting on ',' and stripping each token means "a, b" and
    "a,b" are parsed identically; empty tokens are dropped.
    """
    if not isinstance(raw_skills, str):
        return set()
    return {token.strip() for token in raw_skills.split(',') if token.strip()}


def _as_text(value):
    """Return ``value`` unchanged if it is a string, otherwise an empty string.

    Keeps string concatenation from failing on missing (NaN) text cells.
    """
    return value if isinstance(value, str) else ""


# Create test data (simplified): pair the first 20 resumes with the first 10 jobs.
# The label is a heuristic, not a ground-truth annotation: a pair counts as a
# match (1) when the resume and the job share at least one skill, else 0.

# Job text and skills do not depend on the resume, so parse them once up front.
sample_jobs = [
    (_as_text(job['job_description']), _skill_set(job['required_skills']))
    for _, job in jobs_df.head(10).iterrows()
]

test_data = []
for _, resume in resumes_df.head(20).iterrows():
    resume_text = _as_text(resume['resume_text'])
    resume_skills = _skill_set(resume['skills'])
    for job_text, job_skills in sample_jobs:
        label = 1 if resume_skills & job_skills else 0
        test_data.append({"text": resume_text + " " + job_text, "label": label})

# Naming the columns keeps 'text' and 'label' present even if no pairs were built
test_df = pd.DataFrame(test_data, columns=["text", "label"])
X_test = test_df['text']
y_test = test_df['label']

# Evaluate model
eval_results = evaluate_model(X_test, y_test)
print("Model Evaluation Results:")
print(f"Accuracy: {eval_results['accuracy']:.4f}")
print(f"Precision: {eval_results['precision']:.4f}")
print(f"Recall: {eval_results['recall']:.4f}")
print(f"F1 Score: {eval_results['f1_score']:.4f}")
print("\nClassification Report:")
print(eval_results['classification_report'])

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

model = load_model()
y_pred = model.predict(X_test)

# Pin the class order explicitly. Without `labels`, confusion_matrix only
# includes classes that actually occur in y_test / y_pred, so a sample with a
# single class would produce a 1x1 matrix that no longer lines up with the two
# tick labels below.
class_labels = [0, 1]
class_names = ['No Match', 'Match']

cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
# Analyze match predictions
from src.ml.predict import predict_match


def _match_record(candidate, posting):
    """Score one resume row against one job row and return a flat result record."""
    match_score = predict_match(candidate['resume_text'], posting['job_description'])
    return {
        'resume_id': candidate['id'],
        'job_id': posting['id'],
        'match_score': match_score,
        'resume_title': candidate['name'],
        'job_title': posting['title'],
    }


# Score up to the first ten resumes; the resume at position p is paired with
# the job at position p modulo the number of jobs, so jobs are reused in
# order when there are fewer jobs than sampled resumes.
sample_matches = [
    _match_record(resumes_df.iloc[position], jobs_df.iloc[position % len(jobs_df)])
    for position in range(min(10, len(resumes_df)))
]

matches_df = pd.DataFrame(sample_matches)
print("Sample Match Predictions:")
ranked_matches = matches_df.sort_values('match_score', ascending=False)
print(ranked_matches)

In [None]:
# Histogram of the sampled match scores (20 bins)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(matches_df['match_score'], bins=20, edgecolor='black')
ax.set_title('Distribution of Match Scores')
ax.set_xlabel('Match Score')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)  # faint grid to make bar heights easier to read
plt.show()

In [None]:
# Correlation analysis
# Ordinal rank for each experience level; levels not listed here fall back to 1.
level_rank = {'Entry Level': 1, 'Mid Level': 2, 'Senior Level': 3, 'Lead': 4, 'Principal': 5}


def _first_row_with_id(frame, wanted_id):
    """Return the first row of ``frame`` whose 'id' column equals ``wanted_id``."""
    return frame[frame['id'] == wanted_id].iloc[0]


correlation_data = []
for _, match_row in matches_df.iterrows():
    candidate = _first_row_with_id(resumes_df, match_row['resume_id'])
    posting = _first_row_with_id(jobs_df, match_row['job_id'])

    candidate_rank = level_rank.get(candidate['experience_level'], 1)
    required_rank = level_rank.get(posting['experience_level'], 1)

    correlation_data.append({
        'match_score': match_row['match_score'],
        # 1 when the candidate's level is at or above the level the job asks for
        'experience_match': int(candidate_rank >= required_rank),
        'salary': posting['salary'],
    })

corr_df = pd.DataFrame(correlation_data)
correlation_matrix = corr_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## Summary

This notebook provides a comprehensive analysis of the resume screening system, including:
- Dataset exploration and statistics
- Skills analysis
- Model performance evaluation
- Match prediction analysis
- Correlation insights

Key findings:
- [Add your key insights here based on the analysis]