Environment Setup




In [None]:
# Install required packages
!pip install -q transformers torch scikit-learn pandas numpy matplotlib seaborn plotly
!pip install -q sentence-transformers faiss-cpu wordcloud textstat
!pip install -q streamlit pyngrok gradio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

from sentence_transformers import SentenceTransformer
import torch
import re
import json
from collections import Counter
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

print(" All packages installed successfully!")

 All packages installed successfully!


Data Loading and Exploration


In [None]:
# Load the dataset
# The path is relative, so the CSV must be present in the notebook's working
# directory before this cell runs.
import io  # NOTE(review): `io` is not used by any visible cell
df = pd.read_csv('AI_Resume_Screening.csv')
# Report the frame's dimensions as a quick sanity check on the load.
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

Dataset loaded: 1000 rows, 11 columns


In [None]:
# Display basic info
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nFirst few rows:")
# Last expression of the cell: rendered by the notebook as a rich table.
df.head()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Resume_ID               1000 non-null   int64 
 1   Name                    1000 non-null   object
 2   Skills                  1000 non-null   object
 3   Experience (Years)      1000 non-null   int64 
 4   Education               1000 non-null   object
 5   Certifications          726 non-null    object
 6   Job Role                1000 non-null   object
 7   Recruiter Decision      1000 non-null   object
 8   Salary Expectation ($)  1000 non-null   int64 
 9   Projects Count          1000 non-null   int64 
 10  AI Score (0-100)        1000 non-null   int64 
dtypes: int64(5), object(6)
memory usage: 86.1+ KB
None

First few rows:


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


DATA PREPROCESSING

In [None]:
class ResumeDataProcessor:
    """Clean the raw resume table and derive model-ready feature columns.

    Intended call order is ``clean_data()`` -> ``extract_skills()`` ->
    ``create_features()``. The last two run ``clean_data()`` on demand when
    it has not been called yet.
    """

    # Ordinal encoding of the ``Education`` column; degrees that are not
    # listed here are encoded as 0 by ``create_features``.
    EDUCATION_LEVELS = {'B.Sc': 1, 'B.Tech': 2, 'MBA': 3, 'M.Tech': 4, 'PhD': 5}

    def __init__(self, df):
        # Private copy so the caller's frame is never mutated.
        self.df = df.copy()
        # Set by clean_data(); replaced by create_features() with the
        # feature-augmented frame.
        self.processed_df = None
        # Counter mapping skill -> number of occurrences; set by extract_skills().
        self.skills_vocab = None

    @staticmethod
    def _split_skills(skills_str):
        """Split a comma-separated skills string into stripped, non-empty tokens."""
        if not isinstance(skills_str, str):
            return []
        return [token.strip() for token in skills_str.split(',') if token.strip()]

    def _ensure_cleaned(self):
        """Return the cleaned frame, running ``clean_data()`` first if needed."""
        if self.processed_df is None:
            self.clean_data()
        return self.processed_df

    def clean_data(self):
        """Clean and preprocess the dataset.

        Fills missing ``Skills`` with '' and missing ``Certifications`` with
        the string 'None', lower-cases and strips ``Skills``, and strips
        ``Education`` and ``Job Role``.

        Returns:
            The cleaned DataFrame (also stored in ``self.processed_df``).
        """
        df = self.df.copy()

        # Handle missing values
        df['Skills'] = df['Skills'].fillna('')
        df['Certifications'] = df['Certifications'].fillna('None')

        # Standardize text columns
        df['Skills'] = df['Skills'].str.lower().str.strip()
        df['Education'] = df['Education'].str.strip()
        df['Job Role'] = df['Job Role'].str.strip()

        self.processed_df = df
        return df

    def extract_skills(self):
        """Count how often each skill occurs across all resumes.

        Empty tokens (e.g. produced by a trailing comma) are ignored.

        Returns:
            A ``Counter`` of skill -> occurrences (also stored in
            ``self.skills_vocab``).
        """
        cleaned = self._ensure_cleaned()
        self.skills_vocab = Counter(
            skill
            for skills_str in cleaned['Skills']
            for skill in self._split_skills(skills_str)
        )
        return self.skills_vocab

    def create_features(self):
        """Create additional features for ML models.

        Adds ``skills_count``, ``education_level``, ``has_certifications``
        and ``experience_category`` to the cleaned frame.

        Returns:
            The feature-augmented DataFrame (also stored in
            ``self.processed_df``).
        """
        df = self._ensure_cleaned().copy()

        # Skills count (empty tokens are not counted)
        df['skills_count'] = df['Skills'].apply(lambda s: len(self._split_skills(s)))

        # Education encoding; unknown or missing degrees become 0
        df['education_level'] = df['Education'].map(self.EDUCATION_LEVELS).fillna(0)

        # Has certifications ('None' is the fill value used by clean_data)
        df['has_certifications'] = (df['Certifications'] != 'None').astype(int)

        # Experience categories. pd.cut intervals are left-exclusive by default,
        # so include_lowest=True is required for 0 years to fall into 'Entry';
        # the open-ended last bin keeps very long careers in 'Expert' rather
        # than NaN.
        df['experience_category'] = pd.cut(df['Experience (Years)'],
                                           bins=[0, 2, 5, 10, float('inf')],
                                           labels=['Entry', 'Mid', 'Senior', 'Expert'],
                                           include_lowest=True)

        self.processed_df = df
        return df

In [None]:
# Initialize processor
# Runs the three preprocessing stages in order and keeps each result in a
# notebook-level name that later cells read:
#   df_clean     - cleaned copy of `df`
#   skills_vocab - Counter of skill occurrences (used by the EDA dashboard)
#   df_features  - cleaned frame plus the derived feature columns
processor = ResumeDataProcessor(df)
df_clean = processor.clean_data()
skills_vocab = processor.extract_skills()
df_features = processor.create_features()

print(" Data preprocessing completed!")

 Data preprocessing completed!


EXPLORATORY DATA ANALYSIS

In [None]:
def create_eda_dashboard():
    """Render a 2x2 Plotly overview of the resume dataset.

    Reads the notebook-level ``df_features`` frame and ``skills_vocab``
    counter and shows (without returning) a figure with four panels:
    job-role counts, recruiter-decision shares, experience vs. AI score,
    and the ten most frequent skills.
    """
    panel_titles = ('Job Role Distribution', 'Hiring Decision Analysis',
                    'Experience vs AI Score', 'Top Skills')
    panel_specs = [[{"type": "bar"}, {"type": "pie"}],
                   [{"type": "scatter"}, {"type": "bar"}]]
    dashboard = make_subplots(rows=2, cols=2,
                              subplot_titles=panel_titles,
                              specs=panel_specs)

    # Aggregate everything first, then describe each panel declaratively.
    roles = df_features['Job Role'].value_counts()
    decisions = df_features['Recruiter Decision'].value_counts()
    frequent_skills = skills_vocab.most_common(10)

    # (trace, row, col) in the same order the panels are laid out.
    panels = [
        (go.Bar(x=roles.values, y=roles.index,
                orientation='h', name='Job Roles'), 1, 1),
        (go.Pie(labels=decisions.index, values=decisions.values,
                name='Decisions'), 1, 2),
        (go.Scatter(x=df_features['Experience (Years)'],
                    y=df_features['AI Score (0-100)'],
                    mode='markers', name='Candidates'), 2, 1),
        (go.Bar(x=[skill for skill, _ in frequent_skills],
                y=[count for _, count in frequent_skills],
                name='Skills'), 2, 2),
    ]
    for trace, row, col in panels:
        dashboard.add_trace(trace, row=row, col=col)

    dashboard.update_layout(height=800, title_text="Resume Screening Dataset - EDA Dashboard")
    dashboard.show()

# Render the dashboard; the function shows the figure itself and returns None.
create_eda_dashboard()

NLP PROCESSING & EMBEDDINGS

In [None]:
class ResumeNLPProcessor:
    """Text featurization for resumes: sentence embeddings, TF-IDF and keywords."""

    # Columns concatenated (space-separated, in this order) into the text that
    # is embedded for each resume.
    TEXT_COLUMNS = ('Skills', 'Education', 'Job Role')

    def __init__(self):
        # Sentence-embedding model used by create_resume_embeddings().
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Re-fitted on every create_tfidf_features() call.
        self.tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

    def create_resume_embeddings(self, df):
        """Create embeddings for resume content.

        Args:
            df: DataFrame with 'Skills', 'Education' and 'Job Role' columns.

        Returns:
            Whatever ``self.model.encode`` returns for the list of per-row
            texts (one text per row of ``df``).
        """
        # Missing values are replaced by '' first: concatenating a string with
        # NaN yields NaN, which would hand a float to the encoder for that row.
        skills, education, job_role = (
            df[column].fillna('').astype(str) for column in self.TEXT_COLUMNS
        )
        resume_text = skills + ' ' + education + ' ' + job_role

        # Create sentence embeddings
        return self.model.encode(resume_text.tolist())

    def create_tfidf_features(self, df):
        """Create TF-IDF features from skills.

        Fits the vectorizer on ``df['Skills']`` (missing values treated as '')
        and returns the dense feature array.

        NOTE(review): the vectorizer is created with default tokenization, so
        multi-word skills are presumably split into separate word features -
        confirm this is intended.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(df['Skills'].fillna(''))
        return tfidf_matrix.toarray()

    def extract_keywords(self, text, top_k=10):
        """Extract key terms from text.

        Args:
            text: Input string; None or '' yields an empty result.
            top_k: Maximum number of terms to return.

        Returns:
            Dict of lower-cased alphabetic word -> frequency for the ``top_k``
            most frequent words.
        """
        if not text:
            return {}
        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
        return dict(Counter(words).most_common(top_k))

# Initialize NLP processor
# Constructing the processor loads the 'all-MiniLM-L6-v2' sentence-transformer
# model. Both feature matrices are built from `df_features` and kept in
# notebook-level names used by the matching and ML cells below.
nlp_processor = ResumeNLPProcessor()
resume_embeddings = nlp_processor.create_resume_embeddings(df_features)
tfidf_features = nlp_processor.create_tfidf_features(df_features)

print(" NLP processing completed!")
# Shapes are printed as a sanity check: one row per resume in each matrix.
print(f"Resume embeddings shape: {resume_embeddings.shape}")
print(f"TF-IDF features shape: {tfidf_features.shape}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 NLP processing completed!
Resume embeddings shape: (1000, 384)
TF-IDF features shape: (1000, 15)


JOB MATCHING ALGORITHM

In [None]:
class JobMatcher:
    """Score and rank candidates against a small set of hand-written job profiles.

    The overall score is a weighted sum of four components, each in [0, 1]
    apart from the cosine similarity, which can in principle be negative:
    embedding similarity, experience match, education match and salary
    compatibility.
    """

    # Weights of the score components; they sum to 1.0 so a perfect candidate
    # scores 1.0.
    SCORE_WEIGHTS = {
        'skills_similarity': 0.4,
        'experience_match': 0.3,
        'education_match': 0.2,
        'salary_match': 0.1,
    }
    # Salary score for a candidate whose expectation is outside the job's range.
    OUT_OF_RANGE_SALARY_SCORE = 0.7

    def __init__(self, df, embeddings, tfidf_features, nlp_processor=None):
        """
        Args:
            df: Feature frame; rows are addressed positionally and must line
                up with ``embeddings``.
            embeddings: Per-candidate embedding matrix (row i = candidate i).
            tfidf_features: Stored on the instance; not used by the scoring.
            nlp_processor: Optional object exposing ``.model.encode``. When
                omitted a new ``ResumeNLPProcessor`` is created, which loads
                the embedding model again; pass the existing one to avoid that.
        """
        self.df = df
        self.embeddings = embeddings
        self.tfidf_features = tfidf_features
        self.nlp_processor = nlp_processor if nlp_processor is not None else ResumeNLPProcessor()
        # required_skills text -> encoded job embedding, so each profile is
        # encoded once instead of once per candidate.
        self._job_embedding_cache = {}

    def create_job_profiles(self):
        """Create synthetic job postings for matching.

        Returns:
            Dict of job role -> profile with keys ``required_skills``
            (comma-separated string), ``min_experience`` (years),
            ``education_level`` (compared against the candidate's
            ``education_level`` column) and ``salary_range`` ((min, max)).
        """
        job_profiles = {
            'AI Researcher': {
                'required_skills': 'python, tensorflow, pytorch, machine learning, nlp, deep learning',
                'min_experience': 3,
                'education_level': 4,
                'salary_range': (80000, 150000)
            },
            'Data Scientist': {
                'required_skills': 'python, sql, machine learning, statistics, pandas, numpy',
                'min_experience': 2,
                'education_level': 3,
                'salary_range': (70000, 130000)
            },
            'Software Engineer': {
                'required_skills': 'python, java, c++, react, sql, git, software development',
                'min_experience': 1,
                'education_level': 2,
                'salary_range': (60000, 120000)
            },
            'Cybersecurity Analyst': {
                'required_skills': 'cybersecurity, ethical hacking, linux, networking, security',
                'min_experience': 2,
                'education_level': 2,
                'salary_range': (65000, 110000)
            }
        }
        return job_profiles

    @staticmethod
    def _capped_ratio(value, requirement):
        """Return ``value / requirement`` capped at 1.0 (1.0 if nothing is required)."""
        if requirement <= 0:
            return 1.0
        return min(value / requirement, 1.0)

    def _encode_job_profile(self, job_profile):
        """Return the (cached) embedding of the profile's required-skills text."""
        required_skills = job_profile['required_skills']
        if required_skills not in self._job_embedding_cache:
            self._job_embedding_cache[required_skills] = (
                self.nlp_processor.model.encode([required_skills])
            )
        return self._job_embedding_cache[required_skills]

    def calculate_job_match_score(self, candidate_idx, job_profile):
        """Calculate match score between candidate and job.

        Args:
            candidate_idx: Positional row index into ``self.df`` / ``self.embeddings``.
            job_profile: One profile dict as produced by ``create_job_profiles``.

        Returns:
            Dict with ``overall_score`` and its four components
            (``skills_similarity``, ``experience_match``, ``education_match``,
            ``salary_match``).
        """
        candidate = self.df.iloc[candidate_idx]

        # Skills matching: cosine similarity between the candidate's resume
        # embedding and the embedding of the required-skills text.
        job_embedding = self._encode_job_profile(job_profile)
        candidate_embedding = self.embeddings[candidate_idx].reshape(1, -1)
        skills_similarity = cosine_similarity(candidate_embedding, job_embedding)[0][0]

        # Experience and education: fraction of the requirement met, capped at
        # 1.0 so over-qualification cannot push the overall score above 1.
        exp_match = self._capped_ratio(candidate['Experience (Years)'],
                                       job_profile['min_experience'])
        edu_match = self._capped_ratio(candidate['education_level'],
                                       job_profile['education_level'])

        # Salary compatibility
        salary_min, salary_max = job_profile['salary_range']
        candidate_salary = candidate['Salary Expectation ($)']
        if salary_min <= candidate_salary <= salary_max:
            salary_match = 1.0
        else:
            salary_match = self.OUT_OF_RANGE_SALARY_SCORE

        scores = {
            'skills_similarity': skills_similarity,
            'experience_match': exp_match,
            'education_match': edu_match,
            'salary_match': salary_match,
        }
        # Weighted final score
        final_score = sum(scores[name] * weight
                          for name, weight in self.SCORE_WEIGHTS.items())

        return {'overall_score': final_score, **scores}

    def find_best_matches(self, job_role, top_k=10):
        """Find best candidates for a job role.

        Args:
            job_role: Key of ``create_job_profiles()``.
            top_k: Maximum number of candidates to return.

        Returns:
            List of candidate dicts sorted by ``overall_score`` (highest
            first), or None when ``job_role`` is not a known profile.
        """
        job_profile = self.create_job_profiles().get(job_role)
        if job_profile is None:
            return None

        matches = []
        for idx in range(len(self.df)):
            row = self.df.iloc[idx]
            matches.append({
                'candidate_id': row['Resume_ID'],
                'name': row['Name'],
                'skills': row['Skills'],
                'experience': row['Experience (Years)'],
                'education': row['Education'],
                'salary_expectation': row['Salary Expectation ($)'],
                **self.calculate_job_match_score(idx, job_profile)
            })

        # Sort by overall score
        matches.sort(key=lambda match: match['overall_score'], reverse=True)
        return matches[:top_k]

# Initialize job matcher
# Built from the notebook-level feature frame and the matrices computed in the
# NLP cell; `job_matcher` is reused by the explainability and Gradio cells.
job_matcher = JobMatcher(df_features, resume_embeddings, tfidf_features)

# Test matching for each job role
job_roles = ['AI Researcher', 'Data Scientist', 'Software Engineer', 'Cybersecurity Analyst']

print("🎯 Testing Job Matching Algorithm...")
for role in job_roles:
    # Five matches are requested, but only the first three are printed below.
    matches = job_matcher.find_best_matches(role, top_k=5)
    print(f"\nTop 3 matches for {role}:")
    for i, match in enumerate(matches[:3]):
        print(f"{i+1}. {match['name']} - Score: {match['overall_score']:.3f}")
        # Skills text is cut to 50 characters; "..." is appended unconditionally.
        print(f"   Skills: {match['skills'][:50]}...")

🎯 Testing Job Matching Algorithm...

Top 3 matches for AI Researcher:
1. Timothy Duncan - Score: 0.983
   Skills: python, tensorflow, pytorch, nlp...
2. Sarah Jones - Score: 0.982
   Skills: python, pytorch, nlp, tensorflow...
3. Dawn Tucker - Score: 0.982
   Skills: tensorflow, nlp, pytorch, python...

Top 3 matches for Data Scientist:
1. Mark Hughes - Score: 1.040
   Skills: python, machine learning, sql...
2. Nicholas Welch - Score: 1.039
   Skills: machine learning, sql, python...
3. Deborah Ramirez - Score: 1.023
   Skills: sql, machine learning, deep learning, python...

Top 3 matches for Software Engineer:
1. Colleen Bates - Score: 1.221
   Skills: java, c++, react, sql...
2. Karina Stone - Score: 1.221
   Skills: java, c++, react, sql...
3. Michael Wilson - Score: 1.220
   Skills: java, react, sql, c++...

Top 3 matches for Cybersecurity Analyst:
1. Emily Williamson - Score: 1.227
   Skills: ethical hacking, networking, linux...
2. Melanie Jones - Score: 1.225
   Skills: ethica

MACHINE LEARNING MODELS


In [None]:
class HiringPredictor:
    """Train and apply a hire/no-hire classifier and a salary regressor.

    Both models consume the same feature matrix: the numerical columns listed
    in ``NUMERICAL_FEATURES`` followed by the resume embeddings. Each model
    owns its scaler, because the two are fitted on different train splits and
    a shared scaler would be overwritten by whichever model trained last.
    """

    # Columns of ``df`` placed in front of the embedding columns.
    NUMERICAL_FEATURES = ['Experience (Years)', 'Projects Count', 'education_level',
                          'skills_count', 'has_certifications']

    def __init__(self, df, embeddings):
        """
        Args:
            df: Feature frame containing ``NUMERICAL_FEATURES``,
                'Recruiter Decision' and 'Salary Expectation ($)'.
            embeddings: Per-row embedding matrix, stacked next to the
                numerical features.
        """
        self.df = df
        self.embeddings = embeddings
        self.hiring_model = None
        self.salary_model = None
        self.hiring_scaler = StandardScaler()
        self.salary_scaler = StandardScaler()
        # Backward-compatible alias for code that still reads ``.scaler``.
        self.scaler = self.hiring_scaler

    def prepare_features(self):
        """Prepare features for ML models.

        Returns:
            Tuple ``(X_combined, numerical_features)``: the horizontally
            stacked numerical + embedding matrix, and the list of numerical
            column names.
        """
        numerical_features = list(self.NUMERICAL_FEATURES)

        X_numerical = self.df[numerical_features].values
        X_embeddings = self.embeddings

        # Combine features
        X_combined = np.hstack([X_numerical, X_embeddings])

        return X_combined, numerical_features

    def train_hiring_predictor(self):
        """Train model to predict hiring decisions.

        The target is 1 where 'Recruiter Decision' equals 'Hire' and 0
        otherwise. Prints the accuracy and a classification report for the
        20% stratified hold-out split.

        Returns:
            Hold-out accuracy.
        """
        X, feature_names = self.prepare_features()
        y = (self.df['Recruiter Decision'] == 'Hire').astype(int)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features (statistics come from the training part only)
        X_train_scaled = self.hiring_scaler.fit_transform(X_train)
        X_test_scaled = self.hiring_scaler.transform(X_test)

        # Train model
        self.hiring_model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.hiring_model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = self.hiring_model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Hiring Prediction Accuracy: {accuracy:.3f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        return accuracy

    def train_salary_predictor(self):
        """Train model to predict salary expectations.

        Prints the RMSE on the 20% hold-out split.

        Returns:
            Hold-out RMSE.
        """
        X, _ = self.prepare_features()
        y = self.df['Salary Expectation ($)'].values

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Scale features with the salary model's own scaler so the hiring
        # model's scaling statistics are left untouched.
        X_train_scaled = self.salary_scaler.fit_transform(X_train)
        X_test_scaled = self.salary_scaler.transform(X_test)

        # Train model
        self.salary_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        self.salary_model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = self.salary_model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        print(f"\nSalary Prediction RMSE: ${rmse:.2f}")

        return rmse

    def predict_candidate_success(self, candidate_features):
        """Predict hiring probability and salary for a candidate.

        Trains whichever model is still missing before predicting.

        Args:
            candidate_features: One feature vector laid out like a row of
                ``prepare_features()[0]``.

        Returns:
            Tuple ``(hire_prob, salary_pred)``.
        """
        if self.hiring_model is None:
            self.train_hiring_predictor()

        if self.salary_model is None:
            self.train_salary_predictor()

        features = [candidate_features]

        # Each model must see features scaled exactly as during its training.
        hire_prob = self.hiring_model.predict_proba(
            self.hiring_scaler.transform(features))[0][1]
        salary_pred = self.salary_model.predict(
            self.salary_scaler.transform(features))[0]

        return hire_prob, salary_pred

In [None]:
# Train ML models
# Each training call prints its own evaluation on a 20% hold-out split and
# returns the headline metric, kept here in notebook-level names.
predictor = HiringPredictor(df_features, resume_embeddings)
hiring_accuracy = predictor.train_hiring_predictor()
salary_rmse = predictor.train_salary_predictor()

Hiring Prediction Accuracy: 0.805

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.24      0.32        38
           1       0.84      0.94      0.89       162

    accuracy                           0.81       200
   macro avg       0.66      0.59      0.60       200
weighted avg       0.77      0.81      0.78       200


Salary Prediction RMSE: $25508.65


EXPLAINABILITY & INSIGHTS

In [None]:
class ExplainableAI:
    """Turn job-match scores into readable breakdowns, advice and a radar chart."""

    # (display label, key in the match-score dict) for the component scores,
    # in the order they appear in the breakdown and on the radar chart.
    _BREAKDOWN_FIELDS = (
        ('Skills Match', 'skills_similarity'),
        ('Experience Match', 'experience_match'),
        ('Education Match', 'education_match'),
        ('Salary Compatibility', 'salary_match'),
    )

    def __init__(self, job_matcher, predictor):
        # Provides create_job_profiles(), calculate_job_match_score() and .df.
        self.job_matcher = job_matcher
        # Stored for callers; not used by the methods of this class.
        self.predictor = predictor

    def explain_match(self, candidate_idx, job_role):
        """Provide detailed explanation for job match.

        Args:
            candidate_idx: Positional row index into ``job_matcher.df``.
            job_role: Key of ``job_matcher.create_job_profiles()``.

        Returns:
            None for an unknown ``job_role``; otherwise a dict with
            ``candidate`` (basic profile), ``match_breakdown`` (scores
            formatted as percentages, 'Overall Score' last), ``raw_scores``
            (the unformatted score dict) and ``recommendations``.
        """
        job_profile = self.job_matcher.create_job_profiles().get(job_role)
        if job_profile is None:
            return None

        candidate = self.job_matcher.df.iloc[candidate_idx]
        match_scores = self.job_matcher.calculate_job_match_score(candidate_idx, job_profile)

        breakdown = {label: f"{match_scores[key]:.1%}"
                     for label, key in self._BREAKDOWN_FIELDS}
        breakdown['Overall Score'] = f"{match_scores['overall_score']:.1%}"

        explanation = {
            'candidate': {
                'name': candidate['Name'],
                'skills': candidate['Skills'],
                'experience': candidate['Experience (Years)'],
                'education': candidate['Education'],
                'salary': candidate['Salary Expectation ($)']
            },
            'match_breakdown': breakdown,
            # Unformatted values, so consumers need not parse the percentages.
            'raw_scores': dict(match_scores),
            'recommendations': self.generate_recommendations(candidate, job_profile, match_scores)
        }

        return explanation

    def generate_recommendations(self, candidate, job_profile, scores):
        """Generate improvement recommendations.

        Args:
            candidate: Mapping with 'Skills' (comma-separated string) and
                'Experience (Years)'.
            job_profile: Mapping with 'required_skills' (comma-separated
                string) and 'min_experience'.
            scores: Mapping with 'skills_similarity', 'experience_match' and
                'education_match'.

        Returns:
            List of recommendation strings (possibly empty).
        """
        recommendations = []

        if scores['skills_similarity'] < 0.7:
            candidate_skills_text = candidate['Skills']
            if not isinstance(candidate_skills_text, str):
                candidate_skills_text = ''
            # Split on ',' and strip, so "a,b" and "a, b" are treated alike.
            candidate_skills = {skill.strip().lower()
                                for skill in candidate_skills_text.split(',')
                                if skill.strip()}
            # A list (not a set) keeps the job profile's order, so the same
            # three skills are suggested on every run.
            missing_skills = [skill.strip()
                              for skill in job_profile['required_skills'].split(',')
                              if skill.strip() and skill.strip().lower() not in candidate_skills]
            if missing_skills:
                recommendations.append(f"Consider developing skills in: {', '.join(missing_skills[:3])}")

        if scores['experience_match'] < 0.8:
            recommendations.append(f"Gain more experience (current: {candidate['Experience (Years)']} years, preferred: {job_profile['min_experience']}+ years)")

        if scores['education_match'] < 0.8:
            recommendations.append("Consider pursuing advanced education or relevant certifications")

        return recommendations

    def create_match_visualization(self, candidate_idx, job_role):
        """Create visualization for match explanation.

        Returns:
            A Plotly figure with a radar chart of the four component scores
            (the overall score is excluded), or None for an unknown job role.
        """
        explanation = self.explain_match(candidate_idx, job_role)
        if not explanation:
            return None

        # Plot the raw scores instead of re-parsing the formatted percentages.
        raw_scores = explanation['raw_scores']
        categories = [label for label, _ in self._BREAKDOWN_FIELDS]
        values = [float(raw_scores[key]) for _, key in self._BREAKDOWN_FIELDS]
        # Widen the axis if any component exceeds 1 so nothing is clipped.
        axis_max = max([1.0] + values)

        fig = go.Figure()

        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories,
            fill='toself',
            name=f'Match for {job_role}',
            line=dict(color='blue')
        ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, axis_max]
                )),
            showlegend=True,
            title=f"Match Analysis: {explanation['candidate']['name']} → {job_role}"
        )

        return fig

In [None]:
# Initialize explainable AI
# Wraps the notebook-level `job_matcher` and `predictor` created in earlier cells.
explainer = ExplainableAI(job_matcher, predictor)

# Demonstrate explanation for a sample candidate
sample_candidate = 0  # positional row index into the matcher's frame
sample_job = 'AI Researcher'
# explain_match returns None for an unknown role; 'AI Researcher' is one of
# the built-in profiles, so the result is indexed directly below.
explanation = explainer.explain_match(sample_candidate, sample_job)

print(" Sample Match Explanation:")
print(f"Candidate: {explanation['candidate']['name']}")
print(f"Job Role: {sample_job}")
print("\nMatch Breakdown:")
for metric, score in explanation['match_breakdown'].items():
    print(f"  {metric}: {score}")

print("\nRecommendations:")
for rec in explanation['recommendations']:
    print(f"  • {rec}")

# Create visualization
# The helper returns None when no explanation is available, hence the guard.
match_viz = explainer.create_match_visualization(sample_candidate, sample_job)
if match_viz:
    match_viz.show()


 Sample Match Explanation:
Candidate: Ashley Ali
Job Role: AI Researcher

Match Breakdown:
  Skills Match: 75.8%
  Experience Match: 100.0%
  Education Match: 25.0%
  Salary Compatibility: 100.0%
  Overall Score: 75.3%

Recommendations:
  • Consider pursuing advanced education or relevant certifications


INTERACTIVE DEMO

In [None]:
# Create a simple Gradio interface for testing
import gradio as gr

def demo_job_matching(job_role, top_k):
    """Demo function for Gradio interface.

    Uses the notebook-level ``job_matcher``.

    Args:
        job_role: Role name; must be one of the matcher's job profiles.
        top_k: Number of candidates to list. May be a float (slider value);
            it is truncated to an int.

    Returns:
        A formatted multi-line string with the ranked candidates, or a short
        message when ``job_role`` is not a known profile.
    """
    # Validate against the matcher's own profiles instead of a second
    # hard-coded list that could drift out of sync.
    if job_role not in job_matcher.create_job_profiles():
        return "Please select a valid job role."

    limit = int(top_k)
    # find_best_matches returns None for an unknown role; treat as "no matches".
    matches = job_matcher.find_best_matches(job_role, top_k=limit) or []

    # Use the integer count so a slider value of 5.0 is shown as "Top 5".
    results = f"Top {limit} candidates for {job_role}:\n\n"
    for rank, match in enumerate(matches, start=1):
        skills = match['skills']
        # Only add the ellipsis when the text was actually shortened.
        if len(skills) > 50:
            skills = skills[:50] + '...'
        results += f"{rank}. {match['name']} (Score: {match['overall_score']:.3f})\n"
        results += f"   Skills: {skills}\n"
        results += f"   Experience: {match['experience']} years\n"
        results += f"   Education: {match['education']}\n\n"

    return results

In [None]:
# Create Gradio interface
# The dropdown choices mirror the job profiles known to the matcher.
demo = gr.Interface(
    fn=demo_job_matching,
    inputs=[
        gr.Dropdown(['AI Researcher', 'Data Scientist', 'Software Engineer', 'Cybersecurity Analyst'],
                   label="Job Role"),
        # step=1 keeps the slider on whole numbers; a candidate count is an
        # integer and a fractional value makes no sense here.
        gr.Slider(1, 10, value=5, step=1, label="Number of candidates")
    ],
    outputs=gr.Textbox(label="Matching Results", lines=15),
    title="AI Resume Matching System Demo",
    description="Select a job role and see the best matching candidates from our database."
)

print("✅ Project setup complete!")
print("\n" + "="*60)
print("🎉 AI-POWERED RESUME MATCHING SYSTEM READY!")
print("="*60)

print("\nNext steps:")
print("1. Run the Gradio demo: demo.launch()")
print("2. Experiment with different job roles and parameters")
print("3. Extend with real job posting data")
print("4. Deploy to production environment")

✅ Project setup complete!

🎉 AI-POWERED RESUME MATCHING SYSTEM READY!

Next steps:
1. Run the Gradio demo: demo.launch()
2. Experiment with different job roles and parameters
3. Extend with real job posting data
4. Deploy to production environment


In [None]:
# Start the Gradio app defined above. NOTE(review): on a hosted notebook the
# captured output shows a public share URL being created automatically, so
# candidate names become reachable from outside - confirm this is acceptable.
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e1d0803b0fc5c13514.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


