# Project: Automated Resume Screening
Created By Group_19

# Step 1: Data Collection and View

In [1]:
import pandas as pd
# Load the resume dataset from CSV into a DataFrame
df = pd.read_csv('Final_Resumes.csv')
# Preview the first 10 rows
df.head(10)

Unnamed: 0,Category,Resume
0,Product Manager,"Skills: Market Analysis, Stakeholder Managemen..."
1,IT Support Specialist,"Skills: Troubleshooting, Active Directory, Tic..."
2,Full Stack Developer,"Skills: Node.js, SQL, JavaScript, HTML/CSS. Ex..."
3,Full Stack Developer,"Skills: HTML/CSS, JavaScript, React, Node.js. ..."
4,Network Engineer,"Skills: Network Security, Routing and Switchin..."
5,Full Stack Developer,"Skills: JavaScript, SQL, React, Node.js. Exper..."
6,DevOps Engineer,"Skills: AWS, CI/CD, Jenkins, Docker. Experienc..."
7,System Administrator,"Skills: Linux, Networking, Scripting, Windows...."
8,Product Manager,"Skills: Agile, Product Roadmap, Market Analysi..."
9,Back End Developer,"Skills: API Development, Database Management, ..."


In [None]:
# Summarize column names, dtypes and non-null counts to check whether any values are missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  500 non-null    object
 1   Resume    500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


# Step 2: Data Preprocessing

In [2]:
import pandas as pd

# Load the same CSV again under the name `data`; the preprocessing cells below
# add their derived columns to this frame.
data = pd.read_csv('Final_Resumes.csv')

# Display the first 10 rows of the dataset to understand its structure
print(data.head(10))


                Category                                             Resume
0        Product Manager  Skills: Market Analysis, Stakeholder Managemen...
1  IT Support Specialist  Skills: Troubleshooting, Active Directory, Tic...
2   Full Stack Developer  Skills: Node.js, SQL, JavaScript, HTML/CSS. Ex...
3   Full Stack Developer  Skills: HTML/CSS, JavaScript, React, Node.js. ...
4       Network Engineer  Skills: Network Security, Routing and Switchin...
5   Full Stack Developer  Skills: JavaScript, SQL, React, Node.js. Exper...
6        DevOps Engineer  Skills: AWS, CI/CD, Jenkins, Docker. Experienc...
7   System Administrator  Skills: Linux, Networking, Scripting, Windows....
8        Product Manager  Skills: Agile, Product Roadmap, Market Analysi...
9     Back End Developer  Skills: API Development, Database Management, ...


In [7]:
# Count missing values per column
print(data.isnull().sum())

# Drop any rows that contain missing values (modifies `data` in place)
data.dropna(inplace=True)

Category    0
Resume      0
dtype: int64


In [None]:
import spacy
import re

# Load spaCy's small English pipeline. `clean_text` below uses it for
# tokenization, stop-word flags and lemmas. The 'en_core_web_sm' model has to be
# installed separately (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

# Function to clean and preprocess text
def clean_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    Every character that is not an ASCII letter or whitespace is deleted (not
    replaced by a space, so "Node.js" becomes "nodejs"), the result is
    lower-cased and run through the notebook-level spaCy pipeline `nlp`.
    Stop words and tokens whose length is 2 or less are discarded; the lemma
    of each remaining token is kept.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Strip digits, punctuation and other symbols, then lower-case
    letters_only = re.sub(r'[^A-Za-z\s]', '', text).lower()

    # Walk the spaCy tokens, skipping stop words and very short tokens
    lemmas = []
    for tok in nlp(letters_only):
        if tok.is_stop or len(tok) <= 2:
            continue
        lemmas.append(tok.lemma_)

    return ' '.join(lemmas)

# Clean the 'Resume' text. The dataset has no separate job-description column,
# so the cleaned 'Category' label is stored as 'Job_Description_cleaned'.
data['Resume_cleaned'] = data['Resume'].apply(clean_text)
data['Job_Description_cleaned'] = data['Category'].apply(clean_text)

# Display the first rows of the two cleaned columns
print(data[['Resume_cleaned', 'Job_Description_cleaned']].head())

                                      Resume_cleaned Job_Description_cleaned
0  skill market analysis stakeholder management p...         product manager
1  skill troubleshoot active directory ticketing ...      support specialist
2  skill nodejs sql javascript htmlcss experience...         stack developer
3  skill htmlcss javascript react nodejs experien...         stack developer
4  skill network security routing switch tcpip ci...        network engineer


In [7]:
# Combine cleaned 'Resume' and 'Job Description' into a single text field
# NOTE(review): 'Job_Description_cleaned' is built from data['Category'], and the
# target `y` is later encoded from that same column, so every feature row
# contains its own label text. This likely explains the 1.0 accuracy reported by
# every model below; consider training on 'Resume_cleaned' alone.
data['text_combined'] = data['Resume_cleaned'] + ' ' + data['Job_Description_cleaned']

# Display the first 10 combined strings
print(data['text_combined'].head(10))

0    skill market analysis stakeholder management p...
1    skill troubleshoot active directory ticketing ...
2    skill nodejs sql javascript htmlcss experience...
3    skill htmlcss javascript react nodejs experien...
4    skill network security routing switch tcpip ci...
5    skill javascript sql react nodejs experience w...
6    skill aws cicd jenkins docker experience work ...
7    skill linux network script window experience w...
8    skill agile product roadmap market analysis st...
9    skill api development database management pyth...
Name: text_combined, dtype: object


#### 2.5 Convert Text into Numerical Features (Vectorization)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to the top 5000 features

# Fit and transform the combined text data
# NOTE(review): the vectorizer is fit on the full dataset before the train/test
# split below, so the vocabulary and IDF weights are learned from the test rows
# too. Fitting on the training portion only would avoid that.
X = tfidf_vectorizer.fit_transform(data['text_combined'])

# Display the shape of the resulting feature matrix (rows, vocabulary size)
print(X.shape)

(500, 236)


#### 2.6 Target Variable Preparation

In [3]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Convert job categories to integer labels; `label_encoder` keeps the mapping
# so predictions can be turned back into category names with inverse_transform
y = label_encoder.fit_transform(data['Category'])

# Display the first 20 encoded labels
print(y[:20])

[13 11 10 10 12 10  7 18 13  1  7 11 10 10  9 16 18  4 11  5]


In [10]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets (70% training, 30% testing).
# random_state=42 fixes the shuffle so the split is the same on every run.
# NOTE(review): no `stratify=y` is passed, so the per-class counts in the test
# set are left to chance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shape of the split data
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(350, 236) (150, 236) (350,) (150,)


In [11]:
# Persist the fitted TF-IDF vectorizer and label encoder to disk so that later
# code can reuse the same vocabulary and label mapping.
import joblib
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

# Step 3: Model Selection and Training

#### 3.1: Model Selection

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [13]:
# Load the cleaned resume data
# NOTE(review): no visible cell writes "cleaned_resumes.csv", and `df` is not
# referenced by the cells that follow (the models train on X_train / y_train).
# Confirm this file exists, or that this cell is still needed.
df = pd.read_csv("cleaned_resumes.csv")

In [14]:
# Initialize Logistic Regression model; max_iter=1000 raises the solver's
# iteration limit to 1000
log_reg = LogisticRegression(max_iter=1000)

# Train the model on the training split
log_reg.fit(X_train, y_train)

# Predict on test data
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model: overall accuracy, then per-class precision/recall/F1
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))


Logistic Regression Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00         7
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         9
          12       1.00      1.00      1.00         4
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00         6
          15       1.00      1.00      1.00         5
          16       1.00      1.00      1.00    

In [15]:
# Initialize Support Vector Classifier with its default hyperparameters
svm = SVC()

# Train the model on the training split
svm.fit(X_train, y_train)

# Predict on test data
y_pred_svm = svm.predict(X_test)

# Evaluate the model: overall accuracy, then per-class precision/recall/F1
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00         7
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         9
          12       1.00      1.00      1.00         4
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00         6
          15       1.00      1.00      1.00         5
          16       1.00      1.00      1.00         6
         

In [16]:
# Initialize Random Forest model. A fixed random_state (the same seed used for
# the train/test split) makes the fitted forest, and therefore the model that
# is saved later, identical on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)

# Train the model on the training split
rf.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf.predict(X_test)

# Evaluate the model: overall accuracy, then per-class precision/recall/F1
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00         7
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         9
          12       1.00      1.00      1.00         4
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00         6
          15       1.00      1.00      1.00         5
          16       1.00      1.00      1.00         6

In [17]:
# Initialize Multinomial Naive Bayes model with its default hyperparameters
nb = MultinomialNB()

# Train the model on the training split
nb.fit(X_train, y_train)

# Predict on test data
y_pred_nb = nb.predict(X_test)

# Evaluate the model: overall accuracy, then per-class precision/recall/F1
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00         7
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         9
          12       1.00      1.00      1.00         4
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00         6
          15       1.00      1.00      1.00         5
          16       1.00      1.00      1.00         6
 

In [18]:
# Initialize K-Nearest Neighbors model with its default hyperparameters
knn = KNeighborsClassifier()

# Train the model on the training split
knn.fit(X_train, y_train)

# Predict on test data
y_pred_knn = knn.predict(X_test)

# Evaluate the model: overall accuracy, then per-class precision/recall/F1
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

KNN Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00         7
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         9
          12       1.00      1.00      1.00         4
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00         6
          15       1.00      1.00      1.00         5
          16       1.00      1.00      1.00         6
         

#### 3.4 Model Evaluation and Selection

In [19]:
# Summary of model performance: map a display name to each fitted model
models = {
    "Logistic Regression": log_reg,
    "SVM": svm,
    "Random Forest": rf,
    "Naive Bayes": nb,
    "KNN": knn
}

# Re-predict on the test split with every model and print the same metrics
# side by side, separated by a dashed line.
# NOTE(review): every model scores 1.0. The features in `text_combined` include
# the cleaned category name, so these numbers probably reflect label leakage
# rather than real predictive skill.
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


Model: Logistic Regression
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00        13
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00         7
          10       1.00      1.00      1.00         8
          11       1.00      1.00      1.00         9
          12       1.00      1.00      1.00         4
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00         6
          15       1.00      1.00      1.00         5
          16       1.00      1.00      1

In [20]:
import joblib

# Save the Random Forest model to disk as the model used for screening.
# NOTE(review): all five models report the same accuracy above, so the metrics
# shown do not by themselves single out Random Forest as "best".
joblib.dump(rf, 'resume_screening_model.pkl')

['resume_screening_model.pkl']

#### Step 5: Enhancing with Resume Accuracy & Skill Suggestions


#### 5.1.1 Code for Resume Accuracy (Cosine Similarity)

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_resume_accuracy(resume_text, job_description, vectorizer):
    """Score how similar a resume is to a job description, as a percentage.

    Both texts are transformed with `vectorizer` (which must already be
    fitted), and the cosine similarity between the two resulting vectors is
    scaled to 0-100.

    Args:
        resume_text: The resume as a single string.
        job_description: The job description as a single string.
        vectorizer: A fitted object exposing `transform(list_of_str)`.

    Returns:
        The cosine similarity multiplied by 100 and rounded to 2 decimals.
    """
    # One single-document transform per text, resume first
    vectors = [vectorizer.transform([doc]) for doc in (resume_text, job_description)]

    # cosine_similarity returns a 1x1 matrix for two single-row inputs
    similarity_matrix = cosine_similarity(vectors[0], vectors[1])
    percentage = similarity_matrix[0][0] * 100

    return round(percentage, 2)


#### 5.2 Skill Suggestions

In [22]:
# Example list of skills relevant to the job description
job_skills = ['Python', 'Machine Learning', 'Data Analysis', 'SQL', 'Deep Learning', 'NLP']

# Extract skills from resume (this can be improved with NER or pattern matching)
def extract_skills_from_resume(resume_text, known_skills=None):
    """Return the known skills that are actually mentioned in `resume_text`.

    Each candidate skill is searched for as a whole word or phrase, ignoring
    case, so "data analysis" matches 'Data Analysis' but "Rare" does not match
    'R'. Uses the `re` module imported earlier in the notebook.

    Args:
        resume_text: The resume as a string. Empty or None yields [].
        known_skills: Optional iterable of skill names to look for. Defaults
            to the module-level `job_skills` list.

    Returns:
        The matched skill names (spelled as in the skill list), ordered by
        where they first appear in the resume.
    """
    vocabulary = job_skills if known_skills is None else known_skills
    if not resume_text:
        return []

    hits = []
    for skill in vocabulary:
        # Lookarounds instead of \b so skills ending in symbols (e.g. "C++") still match
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        match = re.search(pattern, resume_text, flags=re.IGNORECASE)
        if match:
            hits.append((match.start(), skill))

    # Report skills in the order the resume mentions them
    hits.sort(key=lambda hit: hit[0])
    return [skill for _, skill in hits]

# Function to suggest missing skills
def suggest_skills(resume_skills, job_skills):
    """List the job skills that the resume does not have.

    Args:
        resume_skills: Collection of skills found in the resume.
        job_skills: Iterable of skills wanted for the job.

    Returns:
        A new list with every entry of `job_skills` that is not contained in
        `resume_skills` (exact, case-sensitive comparison), in `job_skills` order.
    """
    missing_skills = []
    for wanted in job_skills:
        if wanted in resume_skills:
            continue  # already covered by the resume
        missing_skills.append(wanted)
    return missing_skills

# Example usage: compare one sample resume sentence against the notebook-level
# `job_skills` list defined above
resume_text = "Experienced in Python and SQL for data analysis."
resume_skills = extract_skills_from_resume(resume_text)
# Job skills that are not present in resume_skills
missing_skills = suggest_skills(resume_skills, job_skills)

print("Missing Skills:", missing_skills)


Missing Skills: ['Machine Learning', 'Deep Learning', 'NLP']


In [23]:
# Define skills for each job category
skills_dict = {
    'Data Scientist': ['Python', 'R', 'Machine Learning', 'Data Analysis'],
    'Software Engineer': ['Java', 'C++', 'Algorithms', 'Data Structures'],
    # Add more job categories and skills
}

# Suggest missing skills
def suggest_missing_skills(job_category, resume_text):
    """Return the skills required for `job_category` that the resume does not mention.

    Skills are matched as whole words or phrases, ignoring case, so 'R' is not
    considered present because of "React", 'Java' is not satisfied by
    "JavaScript", and "python" counts as 'Python'. Uses the `re` module
    imported earlier in the notebook.

    Args:
        job_category: Key into `skills_dict`. Unknown categories have no
            required skills.
        resume_text: The resume as a string. None is treated as empty text.

    Returns:
        The required skills not found in `resume_text`, in `skills_dict` order.
    """
    required_skills = skills_dict.get(job_category, [])
    text = resume_text or ''

    def _mentions(skill):
        # Lookarounds instead of \b so skills ending in symbols (e.g. "C++") still match
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        return re.search(pattern, text, flags=re.IGNORECASE) is not None

    return [skill for skill in required_skills if not _mentions(skill)]


In [24]:
# Function to predict job category, accuracy, and suggest skills
def screen_resume(resume_text, job_desc):
    """Predict a job category for a resume, score it against a job description, and list missing skills.

    Relies on notebook-level objects created in earlier cells: `clean_text`,
    `tfidf_vectorizer`, `rf`, `label_encoder`, `calculate_resume_accuracy`
    and `suggest_missing_skills`.

    Args:
        resume_text: Raw resume text.
        job_desc: Raw job description text.

    Returns:
        A tuple `(predicted_job_category, resume_accuracy, suggested_skills)`:
        the category name decoded from the model's numeric prediction, the
        resume/job-description cosine similarity as a percentage, and the
        skills listed for that category in `skills_dict` that the resume does
        not mention.
    """
    # Apply the same cleaning used on the training text so tokens line up with
    # the fitted TF-IDF vocabulary
    resume_clean = clean_text(resume_text)
    job_clean = clean_text(job_desc)

    # Combine resume text with job description, as was done for training
    text_combined = resume_clean + ' ' + job_clean

    # Convert text to numerical form using the fitted TF-IDF vectorizer
    text_vector = tfidf_vectorizer.transform([text_combined])

    # The model predicts an encoded integer label; decode it back to the
    # category name so it can be used as a key into skills_dict
    predicted_label = rf.predict(text_vector)[0]
    predicted_job_category = label_encoder.inverse_transform([predicted_label])[0]

    # Resume-vs-job-description similarity (named to avoid shadowing
    # sklearn.metrics.accuracy_score)
    resume_accuracy = calculate_resume_accuracy(resume_clean, job_clean, tfidf_vectorizer)

    # Suggest missing skills based on the original (uncleaned) resume text
    suggested_skills = suggest_missing_skills(predicted_job_category, resume_text)

    return predicted_job_category, resume_accuracy, suggested_skills
