In [175]:
import pandas as pd 
import numpy as np
import spacy 

In [176]:
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text 
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue 
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [177]:
resumes = pd.read_csv("Resume.csv")

In [178]:
resumes.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [179]:
resumes.drop(columns=["ID","Resume_html"])

Unnamed: 0,Resume_str,Category
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR
...,...,...
2479,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,AVIATION
2480,"GOVERNMENT RELATIONS, COMMUNICATIONS ...",AVIATION
2481,GEEK SQUAD AGENT Professional...,AVIATION
2482,PROGRAM DIRECTOR / OFFICE MANAGER ...,AVIATION


1) Cleaning using spacy

In [180]:
# resumes['resume_cleaned'] = resumes['Resume_str'].apply(lambda x : preprocess(x))
# resumes.head()  

In [181]:
# resumes['resume_cleaned'] = resumes['Resume_str'].str.lower()

In [182]:
resumes.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [183]:
#the value doesnt change 
# print(len(resumes['resume_cleaned'][755]))
# print(len(resumes['Resume_str'][755])) 

In [184]:
one_hot_encoded_category = pd.get_dummies(resumes['Category'])
one_hot_encoded_category  

Unnamed: 0,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,BANKING,BPO,BUSINESS-DEVELOPMENT,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2480,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2481,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2482,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [185]:
resumes = pd.merge(resumes, one_hot_encoded_category, left_index=True, right_index=True)
resumes.head()    

Unnamed: 0,ID,Resume_str,Resume_html,Category,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


2) Cleaning using NLTK

In [186]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [187]:
def basic_preprocessing(text):
  text=text.lower()
  text=re.sub(r'[^\w\s]','',text)
  text = re.sub(r'@\w+', '', text)
  return text

In [188]:
resumes['resume_cleaned2'] = resumes['Resume_str'].apply(basic_preprocessing)
resumes['resume_cleaned2'] = resumes['resume_cleaned2'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [189]:
# from nltk.tokenize import word_tokenize
# resumes['resume_cleaned2'] = resumes['resume_cleaned2'].apply(lambda x: word_tokenize(x))

In [190]:
#there is change
print(len(resumes["Resume_str"][1]))
print(len(resumes["resume_cleaned2"][0]))

5572
4353


In [191]:
resumes.Category.value_counts()  

Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

Model training


In [192]:
resumes.columns

Index(['ID', 'Resume_str', 'Resume_html', 'Category', 'ACCOUNTANT', 'ADVOCATE',
       'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING',
       'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT',
       'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS',
       'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS',
       'SALES', 'TEACHER', 'resume_cleaned2'],
      dtype='object')

In [193]:
# from sklearn.model_selection import train_test_split

# X = resumes.resume_cleaned
# y = resumes.drop(columns=['ID', 'Resume_str', 'Resume_html', 'Category', 'resume_cleaned',
#        'resume_cleaned2'])

# X_train, X_test, y_train, y_test = train_test_split(
#     X,  
#     y,
#     test_size=0.2, # 20% samples will go to test dataset 
#     random_state=2022, 
#     stratify=resumes.Category
# )

Model training using RandomForest

In [194]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline 
# from sklearn.metrics import classification_report 

# #1. create a pipeline object
# clf = Pipeline([ 
#      ('vectorizer_tfidf',TfidfVectorizer()),        #using the ngram_range parameter 
#      ('Random Forest', RandomForestClassifier())         
# ])

# #2. fit with X_train and y_train
# clf.fit(X_train, y_train)

# #3. get the predictions for X_test and store it in y_pred
# y_pred = clf.predict(X_test)


# #4. print the classfication report
# print(classification_report(y_test, y_pred))

Using resume_cleaned2

In [195]:
tfidf_params = {
    'sublinear_tf': True,  # Use a logarithmic form for frequency
    'min_df': 5,  # Minimum numbers of documents a word must be present in to be kept
    'norm': 'l2',  # Ensure all feature vectors have a Euclidean norm of 1
    'ngram_range': (1, 2),  # Consider both uni-grams and bigrams
    'stop_words': 'english'  # Remove common English stop words
}

In [196]:
from sklearn.model_selection import train_test_split

X = resumes.resume_cleaned2
y = resumes.drop(columns=['ID', 'Resume_str', 'Resume_html', 'Category',
       'resume_cleaned2'])

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, 
    y,
    test_size=0.2, # 20% samples will go to test dataset 
    random_state=2022, 
    stratify=y
)

In [197]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline 
from sklearn.metrics import classification_report 

#1. create a pipeline object
clf = Pipeline([ 
     ('vectorizer_tfidf',TfidfVectorizer(**tfidf_params)),        #using the ngram_range parameter 
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train2, y_train2)

#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test2)

#4. print the classfication report
print(classification_report(y_test2, y_pred)) 

              precision    recall  f1-score   support

           0       0.83      0.42      0.56        24
           1       1.00      0.04      0.08        24
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        21
           5       0.00      0.00      0.00         7
           6       1.00      0.26      0.41        23
           7       0.00      0.00      0.00        23
           8       0.00      0.00      0.00         4
           9       1.00      0.04      0.08        24
          10       1.00      0.75      0.86        24
          11       1.00      0.32      0.48        22
          12       0.00      0.00      0.00        23
          13       0.83      0.24      0.37        21
          14       0.00      0.00      0.00        19
          15       1.00      0.21      0.34        24
          16       0.00      0.00      0.00        24
          17       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [198]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# Create a pipeline with TF-IDF vectorizer and Logistic Regression
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer(**tfidf_params)),
    ('Logistic Regression', MultiOutputClassifier(LogisticRegression()))
])

# Fit the pipeline with X_train2 and y_train2
clf.fit(X_train2, y_train2)

# Get the predictions for X_test2 and store them in y_pred
y_pred = clf.predict(X_test2)

# Print the classification report
print(classification_report(y_test2, y_pred))


              precision    recall  f1-score   support

           0       0.80      0.17      0.28        24
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        21
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00        23
           7       0.00      0.00      0.00        23
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00        24
          10       1.00      0.46      0.63        24
          11       0.00      0.00      0.00        22
          12       0.00      0.00      0.00        23
          13       1.00      0.05      0.09        21
          14       0.00      0.00      0.00        19
          15       0.00      0.00      0.00        24
          16       0.00      0.00      0.00        24
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [199]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# Create a pipeline with TF-IDF vectorizer and Linear SVM classifier
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer(**tfidf_params)),
    ('Linear SVM', MultiOutputClassifier(SVC(kernel='linear')))
])

# Fit the pipeline with X_train2 and y_train2
clf.fit(X_train2, y_train2)

# Get the predictions for X_test2
y_pred_svm = clf.predict(X_test2)

# Generate the classification report
report = classification_report(y_test2, y_pred_svm)
print(report)


              precision    recall  f1-score   support

           0       0.77      0.71      0.74        24
           1       1.00      0.08      0.15        24
           2       0.00      0.00      0.00        13
           3       1.00      0.05      0.10        19
           4       0.00      0.00      0.00        21
           5       0.00      0.00      0.00         7
           6       1.00      0.48      0.65        23
           7       0.60      0.13      0.21        23
           8       0.00      0.00      0.00         4
           9       0.80      0.33      0.47        24
          10       1.00      0.83      0.91        24
          11       0.93      0.59      0.72        22
          12       0.00      0.00      0.00        23
          13       0.81      0.62      0.70        21
          14       1.00      0.16      0.27        19
          15       0.77      0.42      0.54        24
          16       0.80      0.17      0.28        24
          17       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [200]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test2, y_pred_svm)
accuracy 

0.32796780684104626