### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import re
%matplotlib inline

### Import Dataset

In [2]:
# Load the resume CSV; its first column becomes the DataFrame index
resumeData = pd.read_csv(
    r'./ResumeData/Resume.csv',
    index_col=0,
)

# Quick look at what was loaded: column names, then (rows, columns)
print(resumeData.columns)
resumeData.shape

FileNotFoundError: [Errno 2] File ./Resume.csv does not exist: './Resume.csv'

### Data visualization

In [None]:
#Bar chart of the number of resumes per category
# plt.hist() on a string column spreads the category positions over its default
# 10 equal-width bins, so with more than 10 categories several of them end up
# merged into a single bar. Counting each category explicitly gives one bar per
# category and needs no hard-coded x-limits.
fig, ax = plt.subplots()
resumeData['Category'].value_counts().plot.bar(ax=ax, rot=90)
ax.set_ylabel('Frequency count')
ax.set_xlabel('Data')
ax.set_title('My histogram')
plt.show()

### Dropping Rows/Columns

In [None]:
#Drop null rows, then the columns that are not needed downstream
# NOTE: dropna() runs while 'Resume_html' is still present, so rows that are
# only missing the HTML are removed as well (same as before this change).
columnDrop = ['Resume_html']

# No inplace=True and errors='ignore': re-running the cell after the column is
# already gone no longer raises KeyError.
resumeData = resumeData.dropna().drop(columns=columnDrop, errors='ignore')

# Preview only; avoids dumping the whole frame into the notebook output
resumeData.head()


### Combining Job Categories

In [None]:
# Normalise the labels: upper-case, then remove every non-word character,
# whitespace, underscore and hyphen (e.g. 'Digital-Media' -> 'DIGITALMEDIA')
resumeData['Category'] = (
    resumeData['Category']
    .str.upper()
    .str.replace(r'[\W\s_-]', '', regex=True)
)

print(resumeData.Category.value_counts())

#Combining Categories to reduce number of classes
# merged label -> original labels folded into it
categoryGroups = {
    'FINANCE/BANKING/ACCOUNTING': ['FINANCE','BANKING', 'ACCOUNTING','ACCOUNTANT'],
    'ARTS/FITNESS': ['ARTS','DESIGNER', 'APPAREL','FITNESS', 'CHEF'],
    'BUSINESS/HR': ['BUSINESSDEVELOPMENT','HR', 'BPO'],
    'ENGINEERING': ['AUTOMOBILE','ENGINEERING', 'CONSTRUCTION', 'AVIATION', 'AGRICULTURE'],
    'PUBLICRELATIONS/DIGITALMEDIA/SALES': ['PUBLICRELATIONS','DIGITALMEDIA', 'SALES'],
    'CONSULTANT/ADVOCATE': ['CONSULTANT','ADVOCATE'],
}
# Flatten to original -> merged so a single replace() does the whole job;
# labels that appear in no group are left untouched.
categoryMap = {
    original: merged
    for merged, originals in categoryGroups.items()
    for original in originals
}
resumeData['Category'] = resumeData['Category'].replace(categoryMap)


#New bar chart of the merged categories
# One bar per category from explicit counts: plt.hist() would bin the category
# positions into 10 equal-width bins and needed a hand-picked xlim.
fig, ax = plt.subplots()
resumeData['Category'].value_counts().plot.bar(ax=ax, rot=90, width=0.5)
ax.set_ylabel('Frequency count')
ax.set_xlabel('Data')
ax.set_title('My histogram')
plt.show()

resumeData.Category.value_counts()

### Stemmer

In [None]:
# Download the NLTK stop-word corpus that stopwords.words() reads from
nltk.download('stopwords')

#shared objects used by stemStopResume
stop = stopwords.words('english')
stemmer = SnowballStemmer(language="english")

def stemStopResume(df, text_col='Resume_str', stop_words=None, stem_fn=None):
    """Strip symbols, drop stop words and stem the text of every resume.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text in ``text_col``. It is not modified; the
        cleaned frame is returned.
    text_col : str, default 'Resume_str'
        Name of the column containing the text to process.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``stop`` list.
    stem_fn : callable(str) -> str, optional
        Function applied to every remaining word. Defaults to
        ``stemmer.stem`` (the module-level stemmer).

    Returns
    -------
    pandas.DataFrame
        A new frame in which symbols have been removed from every string cell
        and a ``'stopColumn'`` column holds the stop-word-free, stemmed text
        as a single space-joined string per row.
    """
    if stop_words is None:
        stop_words = stop
    if stem_fn is None:
        stem_fn = stemmer.stem

    symbols = r'[^\w\s]|_'

    # The text loses its punctuation before it is filtered, so the stop words
    # get the same treatment ("don't" -> "dont"); otherwise they could never
    # match. A set also makes each membership test O(1).
    stop_set = {re.sub(symbols, '', word).lower() for word in stop_words}

    #remove excess symbols (applies to every string cell of the frame,
    #not only to text_col); replace() returns a new frame
    df = df.replace(symbols, '', regex=True)

    def _clean(text):
        # Compare lower-cased so capitalised stop words ("The", "AND") are
        # dropped as well.
        kept = (word for word in text.split() if word.lower() not in stop_set)
        return ' '.join(stem_fn(word) for word in kept)

    #remove stopwords, then stem what is left
    df['stopColumn'] = df[text_col].apply(_clean)

    #return modified df
    return df

# Clean the resume frame and add the stemmed 'stopColumn'
resumeData = resumeData.pipe(stemStopResume)
resumeData

### Vectorization

In [None]:
import pickle

def resume_vectorizer(data,labels ,m = 1.0 , n = 2) :
    """Encode the labels as integers and turn the texts into word-count features.

    Parameters
    ----------
    data : iterable of str
        One document per entry; passed to ``CountVectorizer.fit_transform``.
    labels : array-like
        Class label of each document; passed to ``LabelEncoder.fit_transform``.
    m : float or int, default 1.0
        Forwarded to ``CountVectorizer`` as ``max_df``.
    n : float or int, default 2
        Forwarded to ``CountVectorizer`` as ``min_df``.

    Returns
    -------
    tuple
        ``(dataframe, labels)``: a DataFrame of counts with one column per
        vocabulary term (default RangeIndex), and the encoded labels as
        returned by ``LabelEncoder.fit_transform``.

    Side effects
    ------------
    Prints the encoder's classes and the feature count, and pickles the fitted
    vectorizer to a file named ``vectorizer`` in the working directory.
    """
    # vectorizing the labels i.e. to numerical values using label_encoder
    label_encoder = preprocessing.LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    # classes_ is an attribute of the fitted encoder; the array returned by
    # fit_transform does not have it.
    print(label_encoder.classes_)

    # Now vectorizing the data i.e. creating features based on data
    cv = CountVectorizer(max_df = m , min_df = n )
    data_cv_array = cv.fit_transform(data).toarray()

    feature_names = cv.get_feature_names_out()
    print('Feature Count: ', len(feature_names))

    dataframe = pd.DataFrame(data_cv_array, columns = feature_names)

    # Context manager so the file handle is flushed and closed deterministically
    with open('vectorizer', 'wb') as vectorizer_file:
        pickle.dump(cv, vectorizer_file)

    return dataframe , encoded_labels

# Vectorize the stemmed text; the Category column is overwritten with the
# encoded labels returned by resume_vectorizer
vectorDataFrame, resumeData['Category'] = resume_vectorizer(
    data=resumeData['stopColumn'],
    labels=resumeData['Category'],
)

print(vectorDataFrame.columns)
vectorDataFrame

# Models

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Features (word counts) and target (Category column)
x, y = vectorDataFrame, resumeData['Category']

# 80/20 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=5,
    test_size=0.2,
)

# XGBoost classifier with its default hyper-parameters
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)



In [None]:
# Predict a class for every row of the held-out test split
predictions = xgb_classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted label matches the true label
print("Accuracy of Model::",accuracy_score(y_test,predictions))

In [None]:
# Persist the trained classifier with XGBoost's own save_model() rather than
# pickle.
# NOTE(review): XGBoost appears to pick the on-disk format from the file
# extension - confirm that ".txt" yields the format expected by whatever
# loads "model.txt".

xgb_classifier.save_model("model.txt")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Distinct Category values (in order of first appearance) fix the row and
# column order of the matrix
labels = resumeData['Category'].unique()

# Rows are the true labels, columns the predicted ones; normalize='pred'
# scales the counts over the predicted-label axis
confusionMatrix = confusion_matrix(
    y_true=y_test,
    y_pred=predictions,
    labels=labels,
    normalize='pred',
)
heatMapData = pd.DataFrame(data=confusionMatrix, columns=labels, index=labels)

sns.heatmap(data=heatMapData, fmt='.2f', annot=True)
plt.ylabel('REAL')
plt.xlabel('PRED')
plt.title('Confusion matrix')
plt.show()