In [23]:
import os
import numpy as np
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import copy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [24]:
stemmer = WordNetLemmatizer()
CURRENT_DIR = os.getcwd()
RESUME_DIR = os.path.join(CURRENT_DIR, 'Resume-DataSet')
MODELS = os.path.join(CURRENT_DIR, 'Models')
JD_DIR = os.path.join(CURRENT_DIR, 'JobDescriptions')

# making Stop Word lists
file_name = CURRENT_DIR + "\Stopword-List.txt"
f = open(file_name, "r", encoding='UTF8')
stop_words_list = f.read()
stop_words_list = ' '.join(stop_words_list.split()).split(' ')
stop_words_list.append(' ')
f.close()

### Reading DataSet to Create A Model

In [25]:
filePath = RESUME_DIR + '\\UpdatedResumeDataSet.csv'
df = pd.read_csv(filePath)

### Preprocessing the Resumes

In [26]:
def preprocessing(X):
    list_of_preprocessed_resumes = None
    list_of_preprocessed_resumes = []
    for index in range(0,len(X)):
        # Remove all the special characters
        resume = re.sub(r'\W', ' ', str(X[index]))

        # remove all single characters
        resume = re.sub(r'\s+[a-zA-Z]\s+', ' ', resume)

        # Remove single characters from the start
        resume = re.sub(r'\^[a-zA-Z]\s+', ' ', resume) 

        # Substituting multiple spaces with single space
        resume = re.sub(r'\s+', ' ', resume, flags=re.I)

        # Removing prefixed 'b'
        resume = re.sub(r'^b\s+', '', resume)

        # Converting to Lowercase
        resume = resume.lower()

        # Lemmatization
        resume = resume.split()

        resume = [stemmer.lemmatize(word) for word in resume]
        resume = ' '.join(resume)

        list_of_preprocessed_resumes.append(resume)
    return copy.deepcopy(list_of_preprocessed_resumes)

In [27]:
X,y = df['Resume'], df['Category']
list_of_preprocessed_resumes = preprocessing(X)

### Making TF-IDF Vectors of Resumes

In [28]:
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=10, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(list_of_preprocessed_resumes).toarray()

### Train-Test Split

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=0)

### Resume Classification Model Using SVC

In [30]:
model = LinearSVC()
model.fit(X_train,y_train)

LinearSVC()

In [31]:
y_pred = model.predict(X_test)

print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00        12
                     Arts       1.00      1.00      1.00        23
       Automation Testing       0.95      1.00      0.98        21
               Blockchain       1.00      1.00      1.00        36
         Business Analyst       1.00      1.00      1.00        23
           Civil Engineer       1.00      1.00      1.00        20
             Data Science       1.00      1.00      1.00        31
                 Database       1.00      1.00      1.00        23
          DevOps Engineer       1.00      0.90      0.95        42
         DotNet Developer       0.95      1.00      0.97        19
            ETL Developer       1.00      1.00      1.00        29
   Electrical Engineering       1.00      1.00      1.00        26
                       HR       1.00      1.00      1.00        32
                   Hadoop       1.00      1.00      1.00     

### Resume Classification Model Using Random Forest

In [32]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [33]:
y_pred2 = classifier.predict(X_test)
print(classification_report(y_test,y_pred2))
print(accuracy_score(y_test, y_pred2))

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00        12
                     Arts       1.00      1.00      1.00        23
       Automation Testing       1.00      0.71      0.83        21
               Blockchain       1.00      0.78      0.88        36
         Business Analyst       1.00      0.91      0.95        23
           Civil Engineer       1.00      1.00      1.00        20
             Data Science       0.79      1.00      0.89        31
                 Database       0.92      1.00      0.96        23
          DevOps Engineer       1.00      0.90      0.95        42
         DotNet Developer       0.95      1.00      0.97        19
            ETL Developer       1.00      1.00      1.00        29
   Electrical Engineering       1.00      0.77      0.87        26
                       HR       0.84      1.00      0.91        32
                   Hadoop       1.00      1.00      1.00     

### Resume Classification Model Using Logistic Regression

In [34]:
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
y_pred3 = logmodel.predict(X_test)

In [35]:
print(classification_report(y_test,y_pred3))
print(accuracy_score(y_test, y_pred3))

                           precision    recall  f1-score   support

                 Advocate       1.00      0.67      0.80        12
                     Arts       0.85      1.00      0.92        23
       Automation Testing       1.00      0.38      0.55        21
               Blockchain       1.00      0.58      0.74        36
         Business Analyst       1.00      0.39      0.56        23
           Civil Engineer       1.00      0.65      0.79        20
             Data Science       0.79      1.00      0.89        31
                 Database       1.00      1.00      1.00        23
          DevOps Engineer       1.00      0.90      0.95        42
         DotNet Developer       0.95      1.00      0.97        19
            ETL Developer       1.00      1.00      1.00        29
   Electrical Engineering       1.00      1.00      1.00        26
                       HR       1.00      1.00      1.00        32
                   Hadoop       1.00      1.00      1.00     

## Performance Report
### Linear SVM gives 96% Accuracy while Random Forest gives 94% Accuracy and Logistic Regression gives 90% Accuracy.

#### Saving the Models

In [36]:
with open(MODELS + '\\randomForest', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)
    
with open(MODELS + '\\svm', 'wb') as picklefile:
    pickle.dump(model,picklefile)

with open(MODELS + '\\logisticRegression', 'wb') as picklefile:
    pickle.dump(logmodel,picklefile)   

### Loading the Linear SVM Model, since it was giving the best Accuracy.

In [37]:
with open(MODELS + '\\svm', 'rb') as model:
    svm = pickle.load(model)


### Classifying a New Resume

In [38]:
filePath = RESUME_DIR + '\\resume_check.csv'
df2 = pd.read_csv(filePath)

resume = df2['Resume']
preprocessed_resume = preprocessing(resume)

new_resume = tfidfconverter.transform(preprocessed_resume)

svm.predict(new_resume)

array(['DotNet Developer'], dtype=object)

### PREPROCESSING JOB DESCRIPTIONS
##### Predicting the category of the job description

In [39]:
jd = []
f2 = open(JD_DIR + '\job-description-SoftwareEngineer.txt',"r", encoding='UTF8', errors='ignore')
jd.append(f2.read())
preprocessed_jd = preprocessing(jd)

preprocessed_jd = tfidfconverter.transform(preprocessed_jd)
jd_category_prediction = svm.predict(preprocessed_jd)

print(jd_category_prediction[0])

Web Designing


### Content Based Recommendation System Using Cosine Similarity

In [67]:
filePath = RESUME_DIR + '\\UpdatedResumeDataSet.csv'
new_df = pd.read_csv(filePath)
resumes_of_predicted_jd_category = df[df['Category'] == jd_category_prediction[0]]['Resume']
resumes_of_predicted_jd_category = pd.DataFrame({'Resume': resumes_of_predicted_jd_category})
indexes = [index for index in range(len(resumes_of_predicted_jd_category))]

resumes_of_predicted_jd_category['Index'] = indexes
resumes_of_predicted_jd_category = resumes_of_predicted_jd_category.set_index('Index')

list_of_resumes_of_predicted_jd_category = preprocessing(resumes_of_predicted_jd_category['Resume'])

length = len(list_of_resumes_of_predicted_jd_category)

In [70]:
list_of_resumes_of_predicted_jd_category.append(preprocessed_jd)
list_of_resumes_of_predicted_jd_category = [str(resume) for resume in list_of_resumes_of_predicted_jd_category]
list_of_resumes_of_predicted_jd_category = list_of_resumes_of_predicted_jd_category[0:length]

In [78]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
resume_of_jd_category = vectorizer.fit_transform(list_of_resumes_of_predicted_jd_category)

cosine_similarities = linear_kernel(resume_of_jd_category[-1], resume_of_jd_category).flatten()
cosine_similarities = np.delete(cosine_similarities,-1)
len(cosine_similarities)

44

#### Recommending Top 10 Resume with respect to provided Job Description

In [80]:
list_of_tuples = []

for index, row in resumes_of_predicted_jd_category.iterrows():
    if index == len(cosine_similarities):
        break
    data = (round(cosine_similarities[index]*100,2), index, resume)
    list_of_tuples.append(data)


list_of_tuples = sorted(list_of_tuples, reverse = True)
for data in list_of_tuples[:11]:
    print(data[2], end = '\n\n\n=========================================================================================================\n\n\n')

IT SKILLS Languages: C (Basic), JAVA (Basic) Web Technologies: HTML5, CSS3, Bootstrap, JavaScript, jQuery, Corel Draw, Photoshop, Illustrator Databases: MySQL5.0 IDE & Tools: Sublime Text, Notepad Operating Systems: Windows XP, Windows 7Education Details 
September 2015 Bachelor of Engineer Information technology Nagpur, Maharashtra Nagpur University
May 2011 HSC Secondary & Higher Secondary  State Board of Secondary
June 2009 SSC Secondary & Higher Secondary  Maharashtra State Board of Secondary
Web and Graphics Designer 

Web and Graphics Designer - Virtuous Media Point, Pune
Skill Details 
BOOTSTRAP- Exprience - 24 months
HTML5- Exprience - 24 months
JAVASCRIPT- Exprience - 24 months
jQuery- Exprience - 24 months
COREL DRAW- Exprience - 24 months
Adobe Photoshop- Exprience - 24 months
Adobe Illustrator- Exprience - 12 months
CSS3- Exprience - 24 monthsCompany Details 
company - Virtuous Media Point
description - 
company - CNC Web World
description - Internship Pr