In [1]:
# import necessary libs
import pandas as pd
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import joblib
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv("data/job_title_n_description.csv",index_col=0)
df.rename(columns={"Title":"title","FullDescription":"full_description"},inplace=True)
df.head()

Unnamed: 0,title,full_description
0,Senior PHP Developer,A skilled Senior PHP Developer is required by ...
1,Business Development Manager,"The Company: Our client are a full service, cr..."
2,QA Engineer,An expanding software and consultancy services...
3,Web Developer,A leading ecommerce agency is looking to hire ...
4,Software Engineer,"Software Engineer C, C++, Java, UML, XAMP, Agi..."


In [3]:
def clean_job_title(title,char_lim=50):
    
    title = re.sub("[^a-zA-Z]"," ",title.lower()) # remove non alpha charac
    title = ' '.join(title.split()) # this will ensure no extra spaces
    title = title.replace("junior","") # remove junior/senior/lead titles
    title = title.replace("senior","")
    title = title.replace("lead","")
    title = title.replace("internal","")
    title = title.replace("external","")
    title = title.replace("graduate","")
    title = title.replace("entry","")
    title = title.split()
    title = [x for x in title if len(x)>2]
    title = ' '.join(title)
    
    return title[:char_lim] # return upto first 50 characters

In [4]:
# cleaning the job title
df['title'] = df.title.progress_apply(clean_job_title)

  0%|          | 0/4392 [00:00<?, ?it/s]

In [5]:
def job_description_preprocess(text):
    
    text = re.sub("[^a-zA-Z]"," ",text.lower()) # remove non alpha charac
    return text

In [6]:
# clean the job description
df['full_description'] = df.full_description.progress_apply(job_description_preprocess)

  0%|          | 0/4392 [00:00<?, ?it/s]

In [7]:
# defining the stop words list and lemmatizer
stop_words_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [8]:
def text_preprocessing(text):
    
    word_list = text.split()
    word_list = [lemmatizer.lemmatize(x,'v') for x in word_list] # lemmatize words
    word_list = [x for x in word_list if x not in stop_words_list] # remove stop words
    
    return word_list

In [9]:
# preprocessing the job description
df['processed_job_description'] = df['title']+" "+df['full_description']
df['processed_job_description'] = df.processed_job_description.progress_apply(text_preprocessing)

  0%|          | 0/4392 [00:00<?, ?it/s]

In [10]:
df.head()

Unnamed: 0,title,full_description,processed_job_description
0,php developer,a skilled senior php developer is required by ...,"[php, developer, skilled, senior, php, develop..."
1,business development manager,the company our client are a full service cr...,"[business, development, manager, company, clie..."
2,engineer,an expanding software and consultancy services...,"[engineer, expand, software, consultancy, serv..."
3,web developer,a leading ecommerce agency is looking to hire ...,"[web, developer, lead, ecommerce, agency, look..."
4,software engineer,software engineer c c java uml xamp agi...,"[software, engineer, software, engineer, c, c,..."


In [11]:
# building the custom word2vec model on the cleaned job description with skipgram technique
candidateTagWord2VecModel = Word2Vec(df.processed_job_description, min_count=2,sg=1)

In [12]:
candidateTagWord2VecModel.wv.most_similar('javascript',topn=5)

[('css', 0.8835114240646362),
 ('jquery', 0.8671587109565735),
 ('html', 0.8350889682769775),
 ('ajax', 0.8101151585578918),
 ('xhtml', 0.8021975755691528)]

In [13]:
candidateTagWord2VecModel.wv.n_similarity("machine learning","java")

0.6967604

In [14]:
candidateTagWord2VecModel.wv.n_similarity("machine learning","python")

0.79195184

In [15]:
skill_tag_space = np.unique(df.title.values).tolist()

In [16]:
def recommendTag(tag_score_dict,custom_tag):
    
    res = [k for k,v in tag_score_dict.items() if v==max(tag_score_dict.values()) and \
                                         all(x in custom_tag for x in k.split()) and\
                                         len(k.split())>=int(len(custom_tag.split())/2)]
    # recommending tag which has highest similarity score, ensuring the recommended key has
    # length of half of the tag entered
    if len(res)>0:
        return res[0]
    else:
        return max(tag_score_dict,key=tag_score_dict.get)

def CandidateTagging(input_tag):

    input_tag = re.sub('[^a-zA-Z]',' ',input_tag.lower())
    # execute custom word2vec model
    tagScoreDict = {}
    for tag in skill_tag_space:# java develop
        full_tag_score = []
        for tag_i in tag.split():# java
            tag_score = []
            for word in input_tag.split():# java
                tag_score.append(candidateTagWord2VecModel.wv.n_similarity(word,tag_i))
                # score of JAVA wrt java and develop (existing keys) is evaluated

            full_tag_score.append(max(tag_score))
            # max score is appended for each existing tag wrt tag entered
        tagScoreDict[tag] = max(full_tag_score)

    return recommendTag(tagScoreDict,input_tag)

In [17]:
job_descrip = "proficient in JavaScript, HTML, and CSS"

In [18]:
CandidateTagging(input_tag=job_descrip)

'javascript developer'

In [20]:
job_descrip = "experienced in data analysis"
CandidateTagging(input_tag=job_descrip)

'analyst'

In [34]:
job_descrip = "experienced in deep learning"
CandidateTagging(input_tag=job_descrip)

'deep learning engineer'