In [336]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [337]:
resume_data = pd.read_csv('/Users/prabalsingh/Downloads/UpdatedResumeDataSet.csv')

In [338]:
resume_data

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


# Data Cleaning

In [339]:
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-/.:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

In [340]:
resume_data['Cleaned_Resume'] = resume_data['Resume'].apply(lambda x:cleanResume(x))

In [341]:
#category_unique = list(resume_data['Category'].unique())

#category_unique

# Based on relevant roles closer to required skills of ML/NLP/DL/Python

In [342]:
categories_to_filter = ['Data Science','Business Analyst','Python Developer', 'Database', 'DevOps Engineer', 'Hadoop']
resume_data = resume_data[resume_data['Category'].isin(categories_to_filter)]

In [343]:
cat_groups = resume_data.groupby(['Category']) 

<b>developing a raw corpus</b>

In [344]:
raw_corpus = []
for i in categories_to_filter:
    raw_corpus.append(
        ''.join([indvidual_resumes for indvidual_resumes in list(cat_groups.get_group(i)['Cleaned_Resume'])]))

<b>final clean corpus</b>

In [345]:
def tagged_docs(raw_corpus, categories_to_filter):
    for cnt,i in enumerate(categories_to_filter):
        doc = raw_corpus[cnt].lower()
        yield(TaggedDocument(doc.split(), tags=[i]))
        
corpus = list(tagged_docs(raw_corpus,categories_to_filter))

# Model Development (Doc2Vec)
<b> results are exceedingly satisfactory, performance could be further improved with more data and more fine tuning</b>

In [346]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=150, min_count=7, epochs=80)

model.build_vocab(corpus)

model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [347]:
# job resumes most similar to Data Science Roles
model.docvecs.most_similar('Data Science')

  model.docvecs.most_similar('Data Science')


[('Python Developer', 0.6030455827713013),
 ('Business Analyst', 0.5907436609268188),
 ('Hadoop', 0.5851280689239502),
 ('Database', 0.5682947039604187),
 ('DevOps Engineer', 0.5440165400505066)]

# Example

In [348]:
def preprocess(unseen):
    unseen = cleanResume(unseen).lower().split()
    return unseen

In [349]:
# Other skills in the company provided dataset
unseen_doc = 'Bootstrap, C Programming, C++ Programming, CSS, Computer Networks, Data Analytics, Data Science, Data Structures, Database Management System (DBMS), Deep Learning, Django, English Proficiency (Spoken), English Proficiency (Written), Git, Git Bash, HTML, Hindi Proficiency (Spoken), Java, Machine Learning, Microsoft Azure, Python, SQL, Computer Vision, Image Processing, JSP, JavaScript, MATLAB, MySQL, PHP, Servlets, Visual Basic (VB), XML'

In [350]:
preprocessed_doc = preprocess(unseen_doc)
inferred_vector = model.infer_vector(preprocessed_doc)
model.docvecs.most_similar([inferred_vector])

  model.docvecs.most_similar([inferred_vector])


[('Data Science', 0.2650614082813263),
 ('Python Developer', 0.20749686658382416),
 ('DevOps Engineer', 0.18713609874248505),
 ('Hadoop', 0.1814700961112976),
 ('Business Analyst', 0.13544459640979767),
 ('Database', 0.036313220858573914)]

# Model Save

In [351]:
import pickle

trained_model = model  # trained Doc2Vec model object

pickle_filepath = 'Digipplus Assignment/Doc2Vec_SkillScreen.pkl'

# Save the model using pickle
with open(pickle_filepath, 'wb') as f:
    pickle.dump(trained_model, f)

print("Doc2Vec model saved as pickle successfully.")

Doc2Vec model saved as pickle successfully.
