In [1]:
# 1. Import libraries
import pandas as pd

# 2. Read the resume dataset (path is relative to this notebook's folder)
df = pd.read_csv('../data/Resume.csv')  # Adjust path if needed

# 3. Report the frame's dimensions and its column names
frame_shape = df.shape
column_names = df.columns.tolist()
print("Shape:", frame_shape)
print("\nColumns:", column_names)

# 4. Preview the first rows (rich display as the cell's output)
df.head()

Shape: (2484, 4)

Columns: ['ID', 'Resume_str', 'Resume_html', 'Category']


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [2]:
# Summarise the target column: distinct label count, then rows per label
category_col = df['Category']
print("\nNumber of categories:", category_col.nunique())
print("\nCategory counts:\n", category_col.value_counts())


Number of categories: 24

Category counts:
 Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# Fetch the NLTK resources used by the cleaning step.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the final expression so its return value is the cell's output.
nltk.download('omw-1.4')  # for lemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tushar04master/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tushar04master/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tushar04master/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
# English stopwords held in a set for fast membership tests inside clean_resume
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
# Single shared WordNet lemmatizer instance reused for every resume
lemmatizer = WordNetLemmatizer()

def clean_resume(text):
    """Normalise one resume into space-separated, lemmatised tokens.

    Steps: lowercase the text, replace every character that is not an
    ASCII letter or whitespace with a space, split on whitespace, drop
    tokens found in ``stop_words``, lemmatise the rest with ``lemmatizer``
    and join them back with single spaces.

    Parameters
    ----------
    text : Any
        Raw resume text. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is missing or yields no tokens.
    """
    # Missing cells arrive as None/NaN. Passing them through str() would
    # inject the literal words "none"/"nan" into the corpus, so bail out early.
    # (`text != text` is only true for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to string if not already, then lowercase
    text = str(text).lower()

    # Remove numbers, punctuation, special chars
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Drop stopwords and lemmatize the survivors in a single pass
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return ' '.join(tokens)

In [6]:
# Run the cleaning function over every raw resume and store the result
# next to the original text.
cleaned_series = df['Resume_str'].apply(clean_resume)
df['cleaned_resume'] = cleaned_series

# Show the label alongside the cleaned text for a quick sanity check
df.loc[:, ['Category', 'cleaned_resume']].head()

Unnamed: 0,Category,cleaned_resume
0,HR,hr administrator marketing associate hr admini...
1,HR,hr specialist u hr operation summary versatile...
2,HR,hr director summary year experience recruiting...
3,HR,hr specialist summary dedicated driven dynamic...
4,HR,hr manager skill highlight hr skill hr departm...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


# Feature extraction and label encoding

In [8]:
# Turn the cleaned resumes into a TF-IDF matrix, capped at 3000 features.
# NOTE(review): the vectorizer is fitted on every row of df here, before the
# train/test split done later in the notebook, so test documents contribute
# to the learned vocabulary/IDF weights. Consider fitting on the training
# rows only.
cleaned_corpus = df['cleaned_resume']
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(cleaned_corpus)
print("tfidf shape: ", X.shape)

tfidf shape:  (2484, 3000)


In [9]:
# Encoding target labels: map each category string to an integer id
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Category'])

# Saving LabelEncoder for future decoding.
# The target folder is created first: joblib.dump does not create missing
# parent directories and would otherwise fail on a fresh checkout.
import os
import joblib
os.makedirs('../models', exist_ok=True)
joblib.dump(label_encoder, '../models/label_encoder.pkl')

# View mapping (position in the list is the encoded integer)
print("Encoded categories:", list(label_encoder.classes_))

Encoded categories: ['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']


# Training the resume classifier with a Naive Bayes model

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score , confusion_matrix

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable. stratify=y keeps each class's proportion the same in the
# train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
# Fit a multinomial Naive Bayes classifier (default settings) on the
# training split; fit() returns the estimator itself.
model = MultinomialNB().fit(X_train, y_train)

# Display the fitted estimator as the cell's output
model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [13]:
# Predict on the held-out split, then compute all metrics before printing
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
# target_names maps encoded ids back to category strings;
# zero_division=0 reports 0 for classes that were never predicted
per_class_report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
    zero_division=0,
)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(per_class_report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5593561368209256

Classification Report:

                        precision    recall  f1-score   support

            ACCOUNTANT       0.51      0.88      0.65        24
              ADVOCATE       0.37      0.42      0.39        24
           AGRICULTURE       1.00      0.08      0.14        13
               APPAREL       0.50      0.05      0.10        19
                  ARTS       0.75      0.14      0.24        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.83      0.62      0.71        24
               BANKING       0.81      0.57      0.67        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.38      0.79      0.51        24
                  CHEF       0.81      0.71      0.76        24
          CONSTRUCTION       0.68      0.77      0.72        22
            CONSULTANT       1.00      0.09      0.16        23
              DESIGNER       0.71      0.57      

# Saving the model

In [55]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [14]:
import joblib

# Create the '../models' folder if it does not exist. This must be the same
# directory the artifacts are written to below (previously a local 'models'
# folder was created instead, which the dumps never used).
import os
os.makedirs('../models', exist_ok=True)

# Save TF-IDF vectorizer and model; the last dump's return value (the list of
# written file names) is shown as the cell output.
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')
joblib.dump(model, '../models/nb_model.pkl')

['../models/nb_model.pkl']

# Testing the model 

In [32]:
import joblib

# Load vectorizer, model and label encoder saved earlier in this notebook.
# NOTE: joblib.load unpickles the files — only load artifacts you produced
# yourself or otherwise trust.
vectorizer = joblib.load('../models/tfidf_vectorizer.pkl')
model = joblib.load('../models/nb_model.pkl')
label_encoder = joblib.load('../models/label_encoder.pkl')

# Predict
resume_text = "Strategic consultant offering analytical insights and solutions across diverse challenges."

# The vectorizer and model were fitted on clean_resume() output, so the
# inference text must go through the same cleaning before vectorising;
# otherwise its tokens may not match the learned vocabulary.
cleaned_text = clean_resume(resume_text)
vector = vectorizer.transform([cleaned_text])
prediction = model.predict(vector)

# Decode the predicted integer label back to its category name
decoded_category = label_encoder.inverse_transform(prediction)[0]
print("Predicted Category (decoded):", decoded_category)

Predicted Category (decoded): CONSULTANT
