In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json

## Data Pre-processing - Load Data, Clean Data

### Read Resume Dataset

In [2]:
df_resume = pd.read_csv('./dataset/resume_dataset.csv' ,encoding='utf-8')
df_resume.head(3)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."


### Rename Column Names

In [3]:
df_resume = df_resume.rename(columns={'Category': 'category', 'Resume': 'resume'})
df_resume.head(3)

Unnamed: 0,category,resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."


### Functions to Clean resume and job description - 2 steps (2 functions)

In [4]:
import re

# clean step 1
def cleanResume(resumeText):
    # Step 1
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace

    # Step 2
    resumeText = re.sub('[^A-Za-z]', ' ', resumeText)
    resumeText = " ".join(resumeText.split()).lower() #remove extra and trailing space + map to lower case


    # Step 3 - Clean Stopwords - Using NLTK Lib
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(resumeText)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    #with no lower case conversion
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)

    final_resumeText = ' '.join(filtered_sentence)
    
    return final_resumeText



### Apply Clean Function on Dataset

In [5]:
# clean step 1
df_resume['final_cleaned_resume'] = df_resume.resume.apply(lambda x: cleanResume(x))

# # clean step 2
# df_resume = df_resume.assign(final_cleaned_resume = df_resume.step1_cleaned_resume.apply(clean_sentence))
df_resume.head(5)

Unnamed: 0,category,resume,final_cleaned_resume
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may may b e uit rgpv data sc...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...


In [6]:
# df_resume['final_cleaned_resume'][0]

## Using SBERT.net - Link: https://www.sbert.net/

### Install SBERT

In [7]:
# pip install -U sentence-transformers

In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

sentence_1 = ['This is a sentence.']
sentence_2 = ['This is a sentence!']

#Sentences are encoded by calling model.encode()
embedding_1 = model.encode(sentence_1)
embedding_2 = model.encode(sentence_2)

# print(embedding_1)
# print(embedding_2)

cosine_similarity(embedding_1, embedding_2)

array([[0.8843454]], dtype=float32)

### Read Job Description Dataset

In [9]:
df_jd = pd.read_excel('./dataset/job_description.xlsx')

df_jd_it_job_only = df_jd[df_jd['Job_Category'] == 'IT'] # dataframe job related to IT

### Rename Column Names

In [10]:
df_jd = df_jd[['Position', 'Job_Description']]
df_jd = df_jd.rename(columns={'Position': 'position', 'Job_Description': 'job_description'})
df_jd.head(3)

Unnamed: 0,position,job_description
0,Poster Designer,Japnese language understanding. Video editing ...
1,BOA-Registered Architect/ Senior Architect,2 positions for cambodia boa registered archit...
2,Interior Architect/ Intermediate Interior Desi...,The company is seeking an experienced INTERIOR...


### Clean Job Description Dataset

In [11]:
# clean job description
df_jd['final_cleaned_jd'] = df_jd.job_description.apply(lambda x: cleanResume(x))
df_jd.head(3)

Unnamed: 0,position,job_description,final_cleaned_jd
0,Poster Designer,Japnese language understanding. Video editing ...,japnese language understanding video editing a...
1,BOA-Registered Architect/ Senior Architect,2 positions for cambodia boa registered archit...,positions cambodia boa registered architect se...
2,Interior Architect/ Intermediate Interior Desi...,The company is seeking an experienced INTERIOR...,company seeking experienced interior architect...


In [12]:
df_jd = df_jd.head(1000)

### 1 Test Case - Compare 1 Resume and 1 Job Description

In [13]:
resume_description = df_resume.head(1)['final_cleaned_resume']
sentence_1 = [resume_description[0]]
print("Resume 1:", sentence_1)

job_description = df_jd.head(1)['final_cleaned_jd']
sentence_2 = [job_description[0]]
print("Job Description:", sentence_2)

embedding_1 = model.encode(sentence_1)
embedding_2 = model.encode(sentence_2)

Resume 1: ['skills programming languages python pandas numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm na bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch js dc js plotly kibana matplotlib ggplot tableau others regular expression html css angular logstash kafka python flask git docker computer vision open cv understanding deep learning education details data science assurance associate data science assurance associate ernst young llp skill details javascript exprience months jquery exprience months python exprience monthscompany details company ernst young llp description fraud investigations dispute services assurance technology assisted review tar technology assisted review assists elerating review process run analyti

In [14]:
similarity_rate = cosine_similarity(embedding_1, embedding_2)
print("==============================================================+")
print("Resume-JobDescription Similarity (CosineSimilarity):", similarity_rate[0][0])
print("==============================================================+")

Resume-JobDescription Similarity (CosineSimilarity): 0.10676937


### 2 Test Case - Compare 5 Resume to 1 Job Description - Random

In [15]:
resume_5 = df_resume.sample(n=5)
resume_5

Unnamed: 0,category,resume,final_cleaned_resume
337,Java Developer,Education Details \r\nJanuary 2013 Master of E...,education details january master engineering i...
336,Java Developer,TECHNICAL STRENGTHS Computer Language Java/J2E...,technical strengths computer language java j e...
311,Civil Engineer,COMPUTER KNOWLEDGE â¢ Drafting tools: AutoCAD...,computer knowledge drafting tools autocad pack...
57,HR,Education Details \r\n MBA ACN College of en...,education details mba acn college engineering ...
898,Testing,Skill Set OS Windows XP/7/8/8.1/10 Database MY...,skill set os windows xp database mysql sql ser...


In [16]:
# job_description = df_jd.head(1)['final_cleaned_jd']
job_description = df_jd.sample(n=1)
job_description

jd = ''
for index, row in job_description.iterrows():
    jd = row['final_cleaned_jd']

In [17]:
ranking_dict = {}

for index, row in resume_5.iterrows():
    print("Resume           :", row['final_cleaned_resume'])
    print("Job Description  :", jd)
    resume = row['final_cleaned_resume']
    
    embedding_1 = model.encode([resume])
    embedding_2 = model.encode([jd])

    similarity_rate = cosine_similarity(embedding_1, embedding_2)
    print("Similarity Rate  :", format(similarity_rate[0][0]*100, '.2f')+'%')
    print()
    ranking_dict[index] = float(format(similarity_rate[0][0]*100, '.2f'))

print(ranking_dict)

Resume           : education details january master engineering information technology pune maharashtra january bachelor engineering information technology pusad maharashtra amravati university january pusad maharashtra p n junior college january c pusad maharashtra k high school java developer java developer maxgen technologies skill details company details company maxgen technologies description currently working infrasoft technologies andheri java developer company mis generation tata sky tata power description courses done android mobile app development technologies java core java advance java jsf hibernate spring niit android project location detector computing mobile devices android project data deduplication projects works reduce redundant data system free memory stores unique copy data location data help pointers ess data java subjects taught c language core java object oriented programming oot c database pps programming problem solving ad advance database ias information assur

In [18]:
sorted_ranking_resume = dict(sorted(ranking_dict.items(), key=lambda item: item[1], reverse=True))
sorted_ranking_resume

{57: 44.06, 898: 41.18, 311: 38.46, 336: 35.05, 337: 31.81}

In [19]:
for key in sorted_ranking_resume:
    print('Index:', key, '-' , str(sorted_ranking_resume[key]) + '%')

Index: 57 - 44.06%
Index: 898 - 41.18%
Index: 311 - 38.46%
Index: 336 - 35.05%
Index: 337 - 31.81%


### 3 Test Case - Compare 5 Resume to 1 Job Description - Static

In [20]:
first_5_resume = df_resume.head(5)

first_jd = df_jd.head(1)
jd = first_jd['final_cleaned_jd'][0]

In [21]:
ranking_dict = {}

for index, row in first_5_resume.iterrows():
    print("Resume           :", row['final_cleaned_resume'])
    print("Job Description  :", first_jd['final_cleaned_jd'][0])
    resume = row['final_cleaned_resume']
    
    embedding_1 = model.encode([resume])
    embedding_2 = model.encode([jd])

    similarity_rate = cosine_similarity(embedding_1, embedding_2)
    print("Similarity Rate  :", format(similarity_rate[0][0]*100, '.2f')+'%')
    print()
    ranking_dict[index] = float(format(similarity_rate[0][0]*100, '.2f'))

print(ranking_dict)

Resume           : skills programming languages python pandas numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm na bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch js dc js plotly kibana matplotlib ggplot tableau others regular expression html css angular logstash kafka python flask git docker computer vision open cv understanding deep learning education details data science assurance associate data science assurance associate ernst young llp skill details javascript exprience months jquery exprience months python exprience monthscompany details company ernst young llp description fraud investigations dispute services assurance technology assisted review tar technology assisted review assists elerating review process run 

In [22]:
sorted_ranking_resume = dict(sorted(ranking_dict.items(), key=lambda item: item[1], reverse=True))
sorted_ranking_resume

{0: 10.68, 3: 8.52, 4: 8.47, 2: 6.13, 1: 3.94}

In [23]:
for key in sorted_ranking_resume:
    print('Index:', key, '-' , str(sorted_ranking_resume[key]) + '%')

Index: 0 - 10.68%
Index: 3 - 8.52%
Index: 4 - 8.47%
Index: 2 - 6.13%
Index: 1 - 3.94%


### 4 Test Case - Compare Resume and Job Description only job related to "IT"

In [24]:
df_jd_it_job_only = df_jd_it_job_only[['Position', 'Job_Description']]
df_jd_it_job_only = df_jd_it_job_only.rename(columns={'Position': 'position', 'Job_Description': 'job_description'})
df_jd_it_job_only.head(3)

Unnamed: 0,position,job_description
95,Accelerated Skills & Knowledge,Introduction. Amret has developed a dedicated ...
675,Accounting & System Coordinator,Daily support on system issue error from users...
991,Acquiring & Ecommerce Officer,Key responsibilities. Onsite offsite training ...


In [25]:
# clean step 1
df_jd_it_job_only['final_cleaned_jd'] = df_jd_it_job_only.job_description.apply(lambda x: cleanResume(x))

# clean step 2
# df_jd_it_job_only = df_jd_it_job_only.assign(final_cleaned_jd = df_jd_it_job_only.step1_cleaned_jd.apply(clean_sentence))
# df_jd_it_job_only.head(3)

In [26]:
first_5_resume = df_resume.head(5)
first_5_resume

Unnamed: 0,category,resume,final_cleaned_resume
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may may b e uit rgpv data sc...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...


In [27]:
first_jd = df_jd_it_job_only.head(1)

jd = ''
for index, row in first_jd.iterrows():
    jd = row['final_cleaned_jd']

jd

'introduction amret developed dedicated program students studying year fresh graduate field information technology get good opportunity skills practice learning classroom well get coaching directly professionals many year experience join elerated skills knowledge ask program practical skill program gain professional working experience directly coached professionals attractive allowance monthly allowance per month plus national social security fund nssf job opportunity amret qualified competent interns prioritized available job amret'

In [28]:
ranking_dict = {}

for index, row in first_5_resume.iterrows():
    print("Resume           :", row['final_cleaned_resume'])
    print("Job Description  :", jd)
    resume = row['final_cleaned_resume']
    
    embedding_1 = model.encode([resume])
    embedding_2 = model.encode([jd])

    similarity_rate = cosine_similarity(embedding_1, embedding_2)
    print("Similarity Rate  :", format(similarity_rate[0][0]*100, '.2f')+'%')
    print()
    ranking_dict[index] = float(format(similarity_rate[0][0]*100, '.2f'))

print(ranking_dict)

Resume           : skills programming languages python pandas numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm na bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch js dc js plotly kibana matplotlib ggplot tableau others regular expression html css angular logstash kafka python flask git docker computer vision open cv understanding deep learning education details data science assurance associate data science assurance associate ernst young llp skill details javascript exprience months jquery exprience months python exprience monthscompany details company ernst young llp description fraud investigations dispute services assurance technology assisted review tar technology assisted review assists elerating review process run 

In [29]:
sorted_ranking_resume = dict(sorted(ranking_dict.items(), key=lambda item: item[1], reverse=True))
sorted_ranking_resume

{4: 49.66, 2: 46.38, 1: 40.23, 3: 38.09, 0: 37.62}

In [30]:
for key in sorted_ranking_resume:
    print('Index:', key, '-' , str(sorted_ranking_resume[key]) + '%')

Index: 4 - 49.66%
Index: 2 - 46.38%
Index: 1 - 40.23%
Index: 3 - 38.09%
Index: 0 - 37.62%


# Prepare Input from Web and give response - JSON format

### Dump Json Input

In [31]:
data = {
    "job_description" : "This is job description from test CADT.",
    "resume" : [
        {
            "name" : "Ronaldo",
            "resume" : "This is resume 1. Python"
        },
        {
            "name" : "Messi",
            "resume" : "This is resume 2. Java"
        },
        {
            "name" : "Neymar",
            "resume" : "This is resume 3. C++"
        },
        {
            "name" : "Sophal",
            "resume" : "This is resume 3. Laravel, Python"
        }
    ]
}

data

{'job_description': 'This is job description from test CADT.',
 'resume': [{'name': 'Ronaldo', 'resume': 'This is resume 1. Python'},
  {'name': 'Messi', 'resume': 'This is resume 2. Java'},
  {'name': 'Neymar', 'resume': 'This is resume 3. C++'},
  {'name': 'Sophal', 'resume': 'This is resume 3. Laravel, Python'}]}

In [32]:
jd = data['job_description']

In [33]:
data['resume']

[{'name': 'Ronaldo', 'resume': 'This is resume 1. Python'},
 {'name': 'Messi', 'resume': 'This is resume 2. Java'},
 {'name': 'Neymar', 'resume': 'This is resume 3. C++'},
 {'name': 'Sophal', 'resume': 'This is resume 3. Laravel, Python'}]

In [34]:
df_input_resume = pd.DataFrame(data['resume'])
df_input_resume

Unnamed: 0,name,resume
0,Ronaldo,This is resume 1. Python
1,Messi,This is resume 2. Java
2,Neymar,This is resume 3. C++
3,Sophal,"This is resume 3. Laravel, Python"


#### Clean Input Resume Dataset and JD - And Apply Similarity Calculation and Ranking in separated functions

In [35]:
jd = cleanResume(jd)
jd

'job description test cadt'

In [36]:
# clean step 1
df_input_resume['final_cleaned_resume'] = df_input_resume.resume.apply(lambda x: cleanResume(x))
df_input_resume.head(5)

Unnamed: 0,name,resume,final_cleaned_resume
0,Ronaldo,This is resume 1. Python,resume python
1,Messi,This is resume 2. Java,resume java
2,Neymar,This is resume 3. C++,resume c
3,Sophal,"This is resume 3. Laravel, Python",resume laravel python


In [37]:
def calculate_similarity(resume, jd):
    embedding_1 = model.encode([resume])
    embedding_2 = model.encode([jd])

    similarity_rate = cosine_similarity(embedding_1, embedding_2)
    similarity_rate_percentage = format(similarity_rate[0][0]*100, '.2f')
    # print("Similarity Rate  :", format(similarity_rate[0][0]*100, '.2f')+'%')
    
    return [float(similarity_rate), float(similarity_rate_percentage)]


response = []

for index, row in df_input_resume.iterrows():
    resume = row['final_cleaned_resume']

    print(row['name'], " = ", calculate_similarity(resume, jd)[1], "%")

    tmp = []
    tmp.append(['name', row['name']])
    tmp.append(['rate', calculate_similarity(resume, jd)[0]])
    tmp.append(['percentage', calculate_similarity(resume, jd)[1]])
    response.append(tmp)

response

Ronaldo  =  20.89 %


Messi  =  19.75 %
Neymar  =  32.02 %
Sophal  =  15.37 %


[[['name', 'Ronaldo'], ['rate', 0.20889776945114136], ['percentage', 20.89]],
 [['name', 'Messi'], ['rate', 0.19745372235774994], ['percentage', 19.75]],
 [['name', 'Neymar'], ['rate', 0.32023486495018005], ['percentage', 32.02]],
 [['name', 'Sophal'], ['rate', 0.153700053691864], ['percentage', 15.37]]]

In [38]:
sorted_response = sorted(response, key=lambda x: x[2], reverse=True)
response = sorted_response

response

[[['name', 'Neymar'], ['rate', 0.32023486495018005], ['percentage', 32.02]],
 [['name', 'Ronaldo'], ['rate', 0.20889776945114136], ['percentage', 20.89]],
 [['name', 'Messi'], ['rate', 0.19745372235774994], ['percentage', 19.75]],
 [['name', 'Sophal'], ['rate', 0.153700053691864], ['percentage', 15.37]]]

### Resume Ranking System

In [39]:
def resume_ranking_system(df_input_resume, jd):
    response = []
    for index, row in df_input_resume.iterrows():

        resume = row['final_cleaned_resume']

        embedding_1 = model.encode([resume])
        embedding_2 = model.encode([jd])

        similarity_rate = cosine_similarity(embedding_1, embedding_2)
        similarity_rate_percentage = format(similarity_rate[0][0]*100, '.2f')

        tmp = []
        tmp.append(['name', row['name']])
        tmp.append(['rate', float(similarity_rate)])
        tmp.append(['percentage', float(similarity_rate_percentage)])
        response.append(tmp)
    
    sorted_response = sorted(response, key=lambda x: x[2], reverse=True)
    response = sorted_response

    return response
    

In [40]:
resume_ranking_system(df_input_resume, jd)

[[['name', 'Neymar'], ['rate', 0.32023486495018005], ['percentage', 32.02]],
 [['name', 'Ronaldo'], ['rate', 0.20889776945114136], ['percentage', 20.89]],
 [['name', 'Messi'], ['rate', 0.19745372235774994], ['percentage', 19.75]],
 [['name', 'Sophal'], ['rate', 0.153700053691864], ['percentage', 15.37]]]