In [1]:
# Importing Libraries
import pandas as pd
import os
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
kfold = KFold(5)
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Preparing Data

In [2]:
Datadirectory = 'Resumes2'
classes = ['Peoplesoft resumes','React resumes','SQL Developer Lightning insight','workday resumes']

In [3]:
label = []
key = []
def create_training_data():
    for category in classes:
        path = os.path.join(Datadirectory,category)
        class_num = classes.index(category)
        for resume in os.listdir(path):
            label.append(category)
            key.append(resume)            

create_training_data()        

In [4]:
print('Labels: \n{}\nLength of Labels: {}\nKeys: \n{}\nLength of Keys: {}'.format(label[:5], len(label), key[:5], len(key)))

Labels: 
['Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes', 'Peoplesoft resumes']
Length of Labels: 78
Keys: 
['Peoplesoft Admin_AnubhavSingh.docx', 'Peoplesoft Admin_G Ananda Rayudu.doc', 'Peoplesoft Admin_Gangareddy.doc', 'Peoplesoft Admin_Murali.docx', 'Peoplesoft Admin_Priyanka Ramadoss.doc']
Length of Keys: 78


In [5]:
# Creating a Dictionary of Key and Label
labelDict = dict(zip(key, label))

# Sorting it in alphabetical order of keys
finalDict = {}
for i in sorted(labelDict.keys()):
    finalDict[i] = labelDict[i]

In [6]:
# Importing Details dataframe
df = pd.read_csv('FinalDF.csv')
df = df.iloc[:,1:]
df.head()

Unnamed: 0,Name,Number,Email ID,Links,Education,Skills,University Name,Years of Experience
0,ANIL KUMAR,['+911234567890'],ijayawadaabc@xyz.com,"['https://www.linkedin.com/fake', 'https://www...","[('MS', '2016'), 'BTech']","['Sql', 'Debugging', 'Technical', 'Windows', '...",['Velagapudi siddhartha engineering college'],2.4
1,Aradhana Tripathi,['+911234567890'],niversity.abc@xyz.com,"['https://www.linkedin.com/fake', 'https://www...","['MS', 'MCA']","['Sql', 'Acquisition', 'Stakeholder management...","['AKS University', ' IIIT ']",4
2,Database Engineer,['+911234567890'],knowledge.abc@xyz.com,"['https://www.linkedin.com/fake', 'https://www...","['MS', ('SSC', '2011')]","['Sql', 'Servers', 'Technical', 'Programming',...",['Create ETL Jobs And Monitoring The Jobs ACAD...,
3,Chinna Subbarayudu,['+911234567890'],2abc@xyz.com,"['https://www.linkedin.com/fake', 'https://www...",['MS'],"['Policies', 'Sql', 'Web services', 'Prototypi...",['Yogi Vemana University'],5.1
4,Gopi Krishna,['+911234567890'],annada.abc@xyz.com,"['https://www.linkedin.com/fake', 'https://www...",['Bachelor of Degree from JNTU - K University ...,"['Specifications', 'Technical', 'Communication...",['JNTU'],3+


In [7]:
df.isnull().sum()

Name                    0
Number                  0
Email ID                0
Links                   0
Education              11
Skills                  0
University Name         9
Years of Experience     9
dtype: int64

In [8]:
import pickle

In [10]:
pickle.dump(df, open('model.pkl','wb'))

In [11]:
# Loading model to compare the results
model = pickle.load(open('model.pkl','rb'))

In [8]:
# Creating a Dataframe for Modelling
model_df = pd.DataFrame(df['Skills'])

In [9]:
model_df.head()

Unnamed: 0,Skills
0,"['Sql', 'Debugging', 'Technical', 'Windows', '..."
1,"['Sql', 'Acquisition', 'Stakeholder management..."
2,"['Sql', 'Servers', 'Technical', 'Programming',..."
3,"['Policies', 'Sql', 'Web services', 'Prototypi..."
4,"['Specifications', 'Technical', 'Communication..."


In [None]:
# Creating a Dataframe for Modelling
model_df = pd.DataFrame(df['Name'])
model_df['Skills'] = df['Skills']
model_df['Label'] = labelDict.values()
model_df.head(10)

In [None]:
model_df[50:60]

In [None]:
# Storing it in a csv file
# model_df.to_csv('ResumeClassificationDF.csv')

In [None]:
model_df = pd.read_csv('ResumeClassificationDF.csv')
model_df = model_df.iloc[:,1:]

In [None]:
model_df.isnull().sum()

# Data Pre-Processing

## Data Cleaning

In [None]:
lemmetizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

cleaned_data = []
def clean_data(text):
    text_clean = []
    text_tokens = word_tokenize(text)
    for word in text_tokens:
        if (word not in stop_words and # remove stopwords
            word not in string.punctuation): # remove punctuation
            stem_word = lemmetizer.lemmatize(word) # stemming word
            text_clean.append(stem_word)
    list_to_str = ' '.join([str(ele) for ele in text_clean])
    list_to_str = re.sub("'",'',list_to_str)   
    return list_to_str.lower() 

 # Calling Function
for text in model_df['Skills']:
    cleaned_data.append(clean_data(text))

In [None]:
model_df['Cleaned Skills'] = cleaned_data
model_df.head()

## Vectorization and Label Encoding

In [None]:
CV = CountVectorizer(stop_words = 'english')
le = LabelEncoder()

In [None]:
# Label Encoder
model_df['Label'] = le.fit_transform(model_df['Label'])


# Bag of Words Vectorization
# model_df['Vectorized Skills'] = CV.fit_transform(model_df['Cleaned Skills'])

In [None]:
model_df.head(5)

## Splitting into Training and Testing

In [None]:
X = model_df['Cleaned Skills'].values
y = model_df['Label'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.20, random_state= 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Model Training

In [None]:
mnb = MultinomialNB()
X_train_CV = CV.fit_transform(X_train)
mnb.fit(X_train_CV, y_train)
X_test_CV = CV.transform(X_test)
y_pred = mnb.predict(X_test_CV)

In [None]:
accuracy_score(y_pred, y_test)*100

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train_CV, y_train)
y_pred = clf_rf.predict(X_test_CV)
accuracy_score(y_pred, y_test)*100

# Model Evaluation

In [None]:
# Define a function to compute Precision, Recall and F1 score
from sklearn.metrics import roc_auc_score, precision_score, recall_score
precision, recall, f1, models, accuracy = [], [],[], [], []
def get_pre_rec_f1(model_name, model,X_test,y_test):
    models.append(model_name)
    y_pred = model.predict(X_test)
    precision_Score = precision_score(y_test, y_pred, average = 'weighted')
    recall_Score = recall_score(y_test, y_pred, average = 'weighted')
    F1 = 2 * (precision_Score * recall_Score) / (precision_Score + recall_Score)
    accuracy_Score = accuracy_score(y_test, y_pred)
    precision.append(precision_Score)
    recall.append(recall_Score)
    f1.append(F1)
    accuracy.append(accuracy_Score)
    df = pd.DataFrame(models,columns=['Model'] )
    df['Accuracy'] = accuracy
    df['Precision'] = precision
    df['Recall'] = recall
    df['F1 Score'] = f1
    return df
    # print(f'Precision:{precision:.3f}\nRecall:{recall:.3f}\nF1 score:{F1:.3f}')

In [None]:
eval_df = get_pre_rec_f1('Multinomial Naive Bayes', mnb, X_test_CV, y_test)
eval_df = get_pre_rec_f1('Random Forest Classifier', clf_rf, X_test_CV, y_test)
eval_df

In [13]:
import pickle

In [17]:
pickle.dump(model_df, open('model.pkl_n','wb'))

In [19]:
# Loading model to compare the results
model = pickle.load(open('model_n.pkl','rb'))

FileNotFoundError: [Errno 2] No such file or directory: 'model_n.pkl'