In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('../data/Resume.csv')
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [3]:
df.shape

(2484, 4)

In [4]:
df['Category'].value_counts()

Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

This downloads the WordNet dictionary.

WordNet is required for:

*Lemmatization

*Understanding base word forms

Without this, lemmatizer will not work properly.

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords') # is ,am 
nltk.download('wordnet')   

lemmatizer=WordNetLemmatizer()  # eg studies -> study
stopwords=set(stopwords.words('english'))

def clean_text(text):
    text=text.lower()
    text=re.sub(r'http\S+','',text)  # remove urls
    text = re.sub(r'<.*?>', ' ', text)   # remove html tags
    text=re.sub(r'www\S+','',text)   # remove urls
    text=re.sub(r'[^a-zA-Z]',' ',text)  # remove special characters and numbers
    words=text.split()
    words=[lemmatizer.lemmatize(w) for w in words if w not in stopwords and len(w)>2]
    return ' '.join(words)

df['Clean_Resume']=df['Resume_str'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\hp\OneDrive\Desktop\mega
[nltk_data]     projects\venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     c:\Users\hp\OneDrive\Desktop\mega
[nltk_data]     projects\venv\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=10000,ngram_range=(1,3),min_df=5,max_df=0.8,sublinear_tf=True)
X = vectorizer.fit_transform(df['Clean_Resume'])
y = df['Category']

In [7]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(X_train, y_train)



In [8]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))   # accuracy increased from 63.6 to 69.8 with svm instead of using logistic regression then to 71 by tuning tfidf parameters

Accuracy: 0.7142857142857143
                        precision    recall  f1-score   support

            ACCOUNTANT       0.84      0.93      0.89        29
              ADVOCATE       0.69      0.73      0.71        30
           AGRICULTURE       0.40      0.25      0.31         8
               APPAREL       0.53      0.45      0.49        20
                  ARTS       0.50      0.22      0.31        18
            AUTOMOBILE       0.67      0.33      0.44         6
              AVIATION       0.78      0.86      0.82        21
               BANKING       0.71      0.74      0.72        23
                   BPO       1.00      0.50      0.67         2
  BUSINESS-DEVELOPMENT       0.92      0.81      0.86        27
                  CHEF       0.85      0.71      0.77        24
          CONSTRUCTION       0.85      0.85      0.85        34
            CONSULTANT       0.60      0.45      0.51        20
              DESIGNER       0.86      0.95      0.90        19
         D

In [9]:
import joblib

joblib.dump(model, "../model/resume_model.pkl")
joblib.dump(vectorizer, "../model/tfidf.pkl")

['../model/tfidf.pkl']