In [None]:
##################################################
### Importing necessary libraries ################
##################################################
import numpy as np
import pandas as pd
import nltk
import nltk
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Colab-only setup: mount Google Drive so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Location of the labelled resume CSV on the mounted drive; read by the dataset-loading cell.
file_path = '/content/drive/MyDrive/resume_sorting/gpt_dataset.csv'

# Download NLTK data (required for tokenization, stop words, and lemmatization)
# The last call is a bare expression, so its return value is what the cell echoes.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
#########################
#### Load the dataset ###
#########################
# Read the labelled resume CSV (path defined in the setup cell) as UTF-8
df = pd.read_csv(filepath_or_buffer=file_path, encoding='utf-8')

# Print the leading five rows as a quick check of the column layout
print(df.head(n=5))

             Category                                             Resume
0  Frontend Developer  As a seasoned Frontend Developer, I have a pro...
1   Backend Developer  With a solid background in Backend Development...
2    Python Developer  As a Python Developer, I leverage my expertise...
3      Data Scientist  With a background in Data Science, I possess a...
4  Frontend Developer  Experienced Frontend Developer with a passion ...


In [None]:
###########################################
### Performing text preprocessing #########
###########################################
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared helpers for preprocess_text: one lemmatizer instance and the
# English stop-word list converted to a set
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalise one resume string for TF-IDF vectorisation.

    Steps: delete punctuation/special characters, lowercase, tokenize,
    drop English stop words, lemmatize the remaining tokens and join them
    back with single spaces.

    Args:
        text: Raw resume text. Non-string values (for example the float NaN
            pandas uses for an empty cell, or None) are treated as empty.

    Returns:
        The cleaned text as a single space-separated string; '' when `text`
        is not a string.
    """
    # re.sub raises TypeError on non-strings, which would abort a whole
    # DataFrame.apply on the first missing resume; map those rows to ''.
    if not isinstance(text, str):
        return ''
    # Remove punctuation and special characters (deleted, not replaced by a
    # space, so e.g. "Node.js" becomes the single token "nodejs")
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join back into string
    return ' '.join(tokens)

# Store the normalised text of every resume in a new 'cleaned_resume' column.
# The raw text is read from the 'Resume' column; change the key if the CSV names it differently.
df['cleaned_resume'] = df['Resume'].map(preprocess_text)

In [None]:
###################################################
### Using TF-IDF for words vectorization ##########
###################################################
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Labels are the job categories, taken unchanged from the dataset
y = df['Category']

# Learn the vocabulary and IDF weights from the cleaned resumes and vectorise them
X = tfidf_vectorizer.fit_transform(df['cleaned_resume'])

# Sanity check of the resulting matrix dimensions
print(f"Shape of TF-IDF matrix: {X.shape}")

Shape of TF-IDF matrix: (400, 5000)


In [None]:
##################################################
### Splitting the data and training the model ####
##################################################
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Split data into training and testing sets (80% / 20%).
# random_state ensures reproducibility: the same split is produced on every run.
# stratify=y keeps each category's proportion the same in both subsets, so no
# class ends up over- or under-represented in the held-out rows.
# NOTE(review): X comes from a TF-IDF vectorizer that was fitted on every row
# before this split, so the test rows already influenced the vocabulary and IDF
# weights; fit the vectorizer on the training text only for an unbiased score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)
# X_train: The subset of the input features (X) used to train the model.
# X_test: The subset of the input features (X) used to evaluate the model's performance.
# y_train: The subset of the target labels (y) corresponding to X_train.
# y_test: The subset of the target labels (y) corresponding to X_test.

# Encode labels for XGBoost: convert the text categories into integer codes.
# The encoder is fitted on the training labels and reused for the test labels
# so both share one mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Initialize and train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
print("Logistic Regression model trained.")

# Initialize and train Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
print("Random Forest model trained.")

# Initialize and train XGBoost model (trained on the integer-encoded labels)
xgb_clf = XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss')
xgb_clf.fit(X_train, y_train_encoded)
print("XGBoost model trained.")

Logistic Regression model trained.
Random Forest model trained.
XGBoost model trained.


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the held-out rows with each trained model
y_pred_log_reg = log_reg.predict(X_test)
y_pred_rf = rf_clf.predict(X_test)
# XGBoost was trained on encoded labels, so map its predictions back to category names
y_pred_xgb = label_encoder.inverse_transform(xgb_clf.predict(X_test))

# (display name, predictions) pairs, in the order they are reported
predictions_by_model = (
    ("Logistic Regression", y_pred_log_reg),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
)

# Accuracy summary first ...
for model_label, predicted in predictions_by_model:
    print(f"{model_label} Accuracy:", accuracy_score(y_test, predicted))

# ... followed by the per-class precision/recall/F1 breakdown
for model_label, predicted in predictions_by_model:
    print(f"\n{model_label} Classification Report:")
    print(classification_report(y_test, predicted))

Logistic Regression Accuracy: 1.0
Random Forest Accuracy: 1.0
XGBoost Accuracy: 1.0

Logistic Regression Classification Report:
                                    precision    recall  f1-score   support

                 Backend Developer       1.00      1.00      1.00        10
                    Cloud Engineer       1.00      1.00      1.00        17
                    Data Scientist       1.00      1.00      1.00        12
                Frontend Developer       1.00      1.00      1.00         9
              Full Stack Developer       1.00      1.00      1.00         7
         Machine Learning Engineer       1.00      1.00      1.00         7
Mobile App Developer (iOS/Android)       1.00      1.00      1.00        11
                  Python Developer       1.00      1.00      1.00         7

                          accuracy                           1.00        80
                         macro avg       1.00      1.00      1.00        80
                      weighted avg

In [None]:
import joblib
import pandas as pd

# Logistic Regression is the model carried forward (chosen for simplicity)
best_model = log_reg

# Persist the fitted classifier together with the TF-IDF vectorizer it was trained with
for artifact, target_path in (
    (best_model, 'resume_classifier_model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, target_path)
print("Model and vectorizer saved.")

# Function to classify a new resume
def classify_resume(resume_text):
    """Predict the job category of a single raw resume.

    The text is cleaned with preprocess_text, vectorised with the fitted
    module-level tfidf_vectorizer and passed to best_model.

    Args:
        resume_text: Raw resume text.

    Returns:
        The first (only) element of the model's prediction for this resume.
    """
    features = tfidf_vectorizer.transform([preprocess_text(resume_text)])
    predicted_labels = best_model.predict(features)
    return predicted_labels[0]

# Smoke-test the classifier on a short hand-written resume snippet
sample_resume = """
Skilled in Python, machine learning, and data analysis. Experienced in building predictive models using scikit-learn and TensorFlow. Proficient in SQL and big data tools like Hadoop.
"""
predicted_category = classify_resume(sample_resume)
print(f"Predicted job category for sample resume: {predicted_category}")



Model and vectorizer saved.
Predicted job category for sample resume: Data Scientist


In [None]:
# Sample resume text for a Cloud Engineer
cloud_engineer = """
John Doe has over 4 years of experience designing, implementing, and maintaining cloud infrastructure across AWS, Azure, and Google Cloud. He is skilled in Terraform, AWS CloudFormation, Docker, Kubernetes, Jenkins, and GitLab CI/CD, with strong programming skills in Python, Bash, and Go. John specializes in building scalable, secure cloud environments, setting up CI/CD pipelines, managing Kubernetes clusters (EKS), and implementing cloud security best practices like IAM policies and VPC configurations. He has successfully led cloud migration projects, automated deployments, and developed monitoring solutions using Prometheus, Grafana, and the ELK Stack. John holds a Bachelor’s degree in Computer Science from the University of Washington and is certified as an AWS Solutions Architect, Azure Administrator, and Kubernetes Administrator.
"""

# Classify the sample resume
predicted_category1 = classify_resume(cloud_engineer)
# The label names the role the sample was written for (it previously said "Data Engineer")
print("Predicted job category for Cloud Engineer resume:", predicted_category1)

Predicted job category for Data Engineer resume: Cloud Engineer


In [None]:
# Sample resume text for a Python Developer
python_dev = """
John Doe has over 4 years of experience building and maintaining software applications using Python. He is skilled in frameworks like Django, Flask, and FastAPI, and has hands-on experience developing RESTful APIs, backend systems, and automation scripts. John is also proficient in working with SQL and NoSQL databases such as PostgreSQL, MySQL, and MongoDB. He has a strong background in writing efficient, clean, and scalable code, along with experience in deploying applications using Docker and AWS services. Additionally, he is familiar with Agile methodologies, Git version control, and testing tools like PyTest and Unittest. John holds a Bachelor's degree in Computer Science from the University of Washington and is passionate about developing high-performance software solutions that solve real-world problems.
"""
# Classify the sample resume
predicted_category2 = classify_resume(python_dev)
# The label names the role the sample was written for (it previously said "Data Engineer")
print("Predicted job category for Python Developer resume:", predicted_category2)


Predicted job category for Data Engineer resume: Python Developer


In addition to classifying each resume, I want to rank the candidates (by ID or name) who are suitable for a given role, scoring them on the top 5 skills for that role. The result should be a shortlist of the top 10 candidates in that category — or of ‘x’ candidates, where ‘x’ is supplied by the user for a particular role — sorted for further interviews or screening.

In [None]:
# Step 1: Download the resumes_sample.zip file
!wget https://github.com/florex/resume_corpus/raw/master/resume_samples.zip

# Step 2: Unzip the downloaded file
!unzip resume_samples.zip


--2025-04-26 05:31:45--  https://github.com/florex/resume_corpus/raw/master/resume_samples.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/florex/resume_corpus/master/resume_samples.zip [following]
--2025-04-26 05:31:46--  https://raw.githubusercontent.com/florex/resume_corpus/master/resume_samples.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63818300 (61M) [application/zip]
Saving to: ‘resume_samples.zip’


2025-04-26 05:31:48 (185 MB/s) - ‘resume_samples.zip’ saved [63818300/63818300]

Archive:  resume_samples.zip
  inflating: resume_samples.txt      


In [None]:
import pandas as pd

# Step 3: Read the extracted file
# Every usable line has three ':::'-separated fields: id, occupations, resume text.
data = []
with open('resume_samples.txt', 'r', encoding='utf-8', errors='replace') as file:
    for line in file:
        # maxsplit=2 keeps any ':::' that occurs inside the resume text as part of
        # that text; without it such a record splits into more than three parts
        # and is silently discarded.
        parts = line.strip().split(':::', 2)
        # Lines with fewer than three fields are malformed and skipped
        if len(parts) == 3:
            id_, occupations, resume_text = parts
            data.append((id_, occupations, resume_text))

# Step 4: Create a DataFrame
new_df = pd.DataFrame(data, columns=['ID', 'Occupations', 'resume_text'])

# Step 5: View the DataFrame
new_df.head()


Unnamed: 0,Name,Occupations,Resume
0,C:\Workspace\java\scrape_indeed\dba_part_1\1.h...,Database Administrator;Database Administrator;...,"Database Administrator <span class=""hl"">Databa..."
1,C:\Workspace\java\scrape_indeed\dba_part_1\10....,"Database Administrator;SQL, Microsoft PowerPoi...","Database Administrator <span class=""hl"">Databa..."
2,C:\Workspace\java\scrape_indeed\dba_part_1\100...,Oracle Database Administrator;Oracle Database ...,Oracle Database Administrator Oracle <span cla...
3,C:\Workspace\java\scrape_indeed\dba_part_1\100...,Amazon Redshift Administrator and ETL Develope...,Amazon Redshift Administrator and ETL Develope...
4,C:\Workspace\java\scrape_indeed\dba_part_1\100...,Scrum Master;Oracle Database Administrator/ Sc...,Scrum Master Scrum Master Scrum Master Richmon...


In [None]:
import pandas as pd
import joblib
from collections import defaultdict
import numpy as np

# Load saved model and vectorizer
# NOTE: joblib.load unpickles its input, which can execute arbitrary code; only
# load artefacts that this notebook's own training section wrote.
model = joblib.load('resume_classifier_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Load new dataset (assumed to have 'Name' and 'Resume' columns)
#new_df = pd.read_csv('new_resume_dataset.csv')  # Update path as needed

# How many candidates to shortlist per role
TOP_N = 10


def _first_present_column(frame, candidates):
    """Return the first name in `candidates` that is a column of `frame`.

    Raises:
        KeyError: if none of the candidate names is a column of `frame`.
    """
    for column in candidates:
        if column in frame.columns:
            return column
    raise KeyError(
        f"Expected one of {list(candidates)} in columns {list(frame.columns)}"
    )


# new_df built from resume_samples.txt names its columns 'ID' / 'resume_text',
# while a CSV in the layout assumed above uses 'Name' / 'Resume'. Accept either
# pair so this cell does not fail with KeyError on a fresh run.
name_col = _first_present_column(new_df, ('Name', 'ID'))
text_col = _first_present_column(new_df, ('Resume', 'resume_text'))

# Preprocess resumes
new_df['cleaned_resume'] = new_df[text_col].apply(preprocess_text)

# Transform resumes to TF-IDF vectors
X_new = vectorizer.transform(new_df['cleaned_resume'])

# Predict job roles
new_df['Predicted_Role'] = model.predict(X_new)

# Count applicants per role
role_counts = new_df['Predicted_Role'].value_counts()
print("\nNumber of Applicants by Job Role:")
for role, count in role_counts.items():
    print(f"{role}: {count}")

# Define key skills for each role (based on your dataset)
key_skills = {
    'Data Scientist': ['python', 'machine learning', 'statistics', 'sql', 'tensorflow'],
    'Cloud Engineer': ['aws', 'azure', 'docker', 'kubernetes', 'cloud'],
    'Backend Developer': ['java', 'node.js', 'rest api', 'mongodb', 'spring'],
    'Frontend Developer': ['javascript', 'react', 'css', 'html', 'typescript'],
    'Full Stack Developer': ['javascript', 'python', 'django', 'react', 'sql'],
    'Machine Learning Engineer': ['python', 'deep learning', 'pytorch', 'tensorflow', 'sklearn'],
    'Mobile App Developer (iOS/Android)': ['swift', 'kotlin', 'flutter', 'android', 'ios'],
    'Python Developer': ['python', 'django', 'flask', 'pandas', 'numpy']
    # Add 'Data Engineer' if in dataset, else map to similar role
}

# Rank candidates by skills for each role
top_candidates = defaultdict(list)
# Retained in case later cells rely on it; lookups below use the vocabulary dict
feature_names = vectorizer.get_feature_names_out()
# Maps each TF-IDF term to its column index (a dict lookup instead of scanning
# the feature-name array once per skill)
vocabulary = vectorizer.vocabulary_
predicted_roles = new_df['Predicted_Role'].to_numpy()

for role, skills in key_skills.items():
    # Filter candidates for the role (positions are shared by new_df and X_new)
    row_positions = np.flatnonzero(predicted_roles == role)
    role_df = new_df.iloc[row_positions]
    if role_df.empty:
        print(f"\nNo applicants for {role}")
        continue

    # The vocabulary holds preprocessed terms, so run each skill through the
    # same preprocessing before the lookup (e.g. 'node.js' is stored as 'nodejs').
    # Skills that are absent from the vocabulary contribute nothing to the score.
    normalized_skills = [preprocess_text(skill) for skill in skills]
    skill_indices = [vocabulary[term] for term in normalized_skills if term in vocabulary]

    # Calculate skill scores (sum of TF-IDF values for key skills), reusing the
    # rows of X_new instead of vectorising the same resumes again
    role_X = X_new[row_positions]
    skill_scores = np.sum(role_X[:, skill_indices].toarray(), axis=1)

    # Rank candidates: highest score first, keep the TOP_N best
    ranked_indices = np.argsort(skill_scores)[::-1][:TOP_N]
    for idx in ranked_indices:
        name = role_df.iloc[idx][name_col]
        score = skill_scores[idx]
        top_candidates[role].append((name, score))

    # Print the shortlist
    print(f"\nTop {TOP_N} Candidates for {role}:")
    for name, score in top_candidates[role]:
        print(f"{name}: Skill Score = {score:.4f}")

KeyboardInterrupt: 