In [None]:
# Author of this NoteBook - Shaurya Uppal
## LinkedIn Profile Link: https://www.linkedin.com/in/shaurya-uppal/
## For Consultation Connect (1:1 call): topmate.io/shaurya

### Library Installation

In [None]:
# !pip3 install -U sentence-transformers

In [None]:
# import nltk
# nltk.download('stopwords')

### Import Library

In [None]:
import pandas as pd
import numpy as np
import re
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import heapq
from sklearn.metrics.pairwise import cosine_similarity


toktok = ToktokTokenizer()
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


In [None]:
jobs_df = pd.read_csv('JobsData.csv')
jobs_df

Unnamed: 0,JobId,Job Title,Job Description,Company Name,Type of ownership
0,0,"Data Analyst, Center on Immigration and Justic...",Are you eager to roll up your sleeves and harn...,Vera Institute of Justice\n3.2,Nonprofit Organization
1,1,Quality Data Analyst,Overview\n\nProvides analytical and technical ...,Visiting Nurse Service of New York\n3.8,Nonprofit Organization
2,2,"Senior Data Analyst, Insights & Analytics Team...",We’re looking for a Senior Data Analyst who ha...,Squarespace\n3.4,Company - Private
3,3,Data Analyst,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,Celerity\n4.1,Subsidiary or Business Segment
4,4,Reporting Data Analyst,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,FanDuel\n3.9,Company - Private
...,...,...,...,...,...
2247,2247,RQS - IHHA - 201900004460 -1q Data Security An...,Maintains systems to protect data from unautho...,"Avacend, Inc.\n2.5",Company - Private
2248,2248,Senior Data Analyst (Corporate Audit),Position:\nSenior Data Analyst (Corporate Audi...,Arrow Electronics\n2.9,Company - Public
2249,2249,"Technical Business Analyst (SQL, Data analytic...","Title: Technical Business Analyst (SQL, Data a...",Spiceorb,-1
2250,2250,"Data Analyst 3, Customer Experience",Summary\n\nResponsible for working cross-funct...,Contingent Network Services\n3.1,Company - Private


In [None]:
jobs_df['full_text'] = jobs_df['Job Title'] + ' ' + jobs_df['Job Description'] + ' ' + jobs_df['Company Name'] + ' ' + jobs_df['Type of ownership']
jobs_df['raw_full_text'] = jobs_df['Job Title'] + ' ' + jobs_df['Job Description'] + ' ' + jobs_df['Company Name'] + ' ' + jobs_df['Type of ownership']

In [None]:
df = jobs_df[['JobId', 'raw_full_text', 'full_text']].copy()

### Text Preprocessing

In [None]:
# products preprocessing

def cleanhtml(raw_html):
    '''
    Input: HTML Text 
    Output: Cleaned HTML Text
    <tag> </tag> removed and other cleaning
    '''
    cleantext = ''
    try:
        raw_html = raw_html.replace(" br ", " ")
        raw_html = raw_html.replace("nbsp"," ")
        raw_html = raw_html.replace("ndash"," ")
        raw_html = raw_html.replace("&rsquo;", ' ')
        raw_html = raw_html.replace("&trade;", ' ')
        raw_html = raw_html.replace("&amp",' ')
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, ' ', raw_html)
        cleantext = ' '.join(cleantext.split())
    except:
        print(raw_html)
    return cleantext

def text_preprocessing(text):
    text = text.lower()
    text = text.translate(str.maketrans(' ', ' ', string.punctuation))
    text = ' '.join(text.split())
    return text

df['full_text'] = np.vectorize(cleanhtml)(df['full_text'])
df['full_text'] = np.vectorize(text_preprocessing)(df['full_text'])

In [None]:
df['full_text'][2]

'senior data analyst insights analytics team customer operations we’re looking for a senior data analyst who has a love of mentorship data visualization and generating actionable insights from raw data in this role you’ll have the opportunity to be an organizational influencer who will generate insights with a good degree of autonomy and partner with data science to grow deeper analytical skills you will be joining the insights analytics team a team tasked with developing insights and reporting to support our customers and advisors’ needs this team sits within the customer operations team but is also connected to the product organization in this role you will work mainly with customer operations stakeholders to set kpis and evaluate the effectiveness of current strategies and workflows you will be involved in many aspects of data operations from data auditing to building dashboards and analytical insights for example you will review the code of more junior analysts and organize coding 

In [None]:
def _lemma(text):
    try:
        doc = toktok.tokenize(text)
        text = [x for x in doc if x not in stop_words]
        text = [lemma.lemmatize(x) for x in doc]
        text = ' '.join(text)
    except:
        print(text)
    return text


df['full_text'] = np.vectorize(_lemma)(df['full_text'])

In [None]:
df

Unnamed: 0,JobId,raw_full_text,full_text
0,0,"Data Analyst, Center on Immigration and Justic...",data analyst center on immigration and justice...
1,1,Quality Data Analyst Overview\n\nProvides anal...,quality data analyst overview provides analyti...
2,2,"Senior Data Analyst, Insights & Analytics Team...",senior data analyst insight analytics team cus...
3,3,Data Analyst Requisition NumberRR-0001939\nRem...,data analyst requisition numberrr0001939 remot...
4,4,Reporting Data Analyst ABOUT FANDUEL GROUP\n\n...,reporting data analyst about fanduel group fan...
...,...,...,...
2247,2247,RQS - IHHA - 201900004460 -1q Data Security An...,rqs ihha 201900004460 1q data security analyst...
2248,2248,Senior Data Analyst (Corporate Audit) Position...,senior data analyst corporate audit position s...
2249,2249,"Technical Business Analyst (SQL, Data analytic...",technical business analyst sql data analytics ...
2250,2250,"Data Analyst 3, Customer Experience Summary\n\...",data analyst 3 customer experience summary res...


## Iteration1: TFIDF
### Lexical Similarity Model 
### Based on Syntactic Similarity 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1,2),stop_words='english', analyzer = 'word')
features = tfidf.fit_transform(df['full_text'].values.tolist())


In [None]:
df_tfidf = pd.DataFrame(features.todense(), columns = tfidf.get_feature_names())

In [None]:
similarity_matrix = cosine_similarity(df_tfidf, df_tfidf)
similarity_matrix.shape

(2252, 2252)

In [None]:
# top 10 similar items to an given item
heapq.nlargest(10, enumerate(similarity_matrix[0]), key=lambda x: x[1])

[(0, 1.0000000000000027),
 (57, 0.09480311836450478),
 (1132, 0.08352838894459505),
 (209, 0.08024223213056117),
 (542, 0.07973942104832707),
 (242, 0.07880632729834099),
 (415, 0.06781506048285647),
 (193, 0.06778610639333396),
 (87, 0.06577036603393609),
 (395, 0.0657389046981423)]

In [None]:
df['recommendation_list_tfidf'] = None
topK = 10
for indx in df.index:
    df.at[indx, 'recommendation_list_tfidf'] = [a for a,b in heapq.nlargest(topK+1, enumerate(similarity_matrix[df[df['JobId']==df.at[indx, 'JobId']].index[0]]), key=lambda x: x[1])][1:]

In [None]:
df

Unnamed: 0,JobId,raw_full_text,full_text,recommendation_list_tfidf
0,0,"Data Analyst, Center on Immigration and Justic...",data analyst center on immigration and justice...,"[57, 1132, 209, 542, 242, 415, 193, 87, 395, 408]"
1,1,Quality Data Analyst Overview\n\nProvides anal...,quality data analyst overview provides analyti...,"[281, 1649, 339, 829, 875, 668, 618, 1345, 277..."
2,2,"Senior Data Analyst, Insights & Analytics Team...",senior data analyst insight analytics team cus...,"[1523, 797, 1608, 775, 552, 974, 1033, 1915, 6..."
3,3,Data Analyst Requisition NumberRR-0001939\nRem...,data analyst requisition numberrr0001939 remot...,"[1883, 165, 631, 234, 305, 2230, 741, 2224, 64..."
4,4,Reporting Data Analyst ABOUT FANDUEL GROUP\n\n...,reporting data analyst about fanduel group fan...,"[1588, 1178, 1890, 653, 385, 1983, 681, 252, 1..."
...,...,...,...,...
2247,2247,RQS - IHHA - 201900004460 -1q Data Security An...,rqs ihha 201900004460 1q data security analyst...,"[2223, 2239, 2204, 2240, 2243, 2221, 1539, 177..."
2248,2248,Senior Data Analyst (Corporate Audit) Position...,senior data analyst corporate audit position s...,"[1466, 1580, 1835, 2086, 567, 1454, 1882, 2072..."
2249,2249,"Technical Business Analyst (SQL, Data analytic...",technical business analyst sql data analytics ...,"[2245, 2216, 2185, 1197, 1194, 2062, 1975, 952..."
2250,2250,"Data Analyst 3, Customer Experience Summary\n\...",data analyst 3 customer experience summary res...,"[2233, 2212, 679, 1518, 1624, 741, 2072, 544, ..."


## Iteration 2: Text Similarity based on Semantics

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df.full_text.values.tolist())]

In [None]:
model = Doc2Vec(documents, vector_size=40, window=3, min_count=2, workers=4, epochs=50, negative= 5, seed = 0)

In [None]:
# model.save("d2v.model")

In [None]:
model.dv.most_similar(0, topn=10)

[(1049, 0.8217266201972961),
 (973, 0.7813266515731812),
 (1560, 0.7508060932159424),
 (1446, 0.7015277743339539),
 (1930, 0.6908974647521973),
 (559, 0.6859457492828369),
 (571, 0.683448076248169),
 (1044, 0.6799439191818237),
 (484, 0.6780128479003906),
 (987, 0.6734771132469177)]

In [None]:
print(documents[0])
print(documents[1049])

TaggedDocument(data analyst center on immigration and justice cij are you eager to roll up your sleeve and harness data to drive policy change do you enjoy sifting through complex datasets to illuminate trend and insight do you see yourself working for a valuesdriven organization with a vision to tackle the most pressing injustice of our day we are looking to hire a bright hardworking and creative individual with strong data management skill and a demonstrated commitment to immigrant right the data analyst will assist with analysis and reporting need for veras center on immigration and justice cij working across it current project and future vera initiative who we are founded in 1961 the vera institute is an independent nonpartisan nonprofit organization that combine expertise in research technical assistance and demonstration project to assist leader in government and civil society examine justice policy and practice and improve the system people rely on for justice and safety we stud

In [None]:
df['recommendation_list_Doc2Vec'] = None
for indx in df.index:
    df.at[indx, 'recommendation_list_Doc2Vec'] = [item for item, score in model.dv.most_similar(df.at[indx, 'JobId'], topn=10)]

df

Unnamed: 0,JobId,raw_full_text,full_text,recommendation_list_tfidf,recommendation_list_Doc2Vec
0,0,"Data Analyst, Center on Immigration and Justic...",data analyst center on immigration and justice...,"[57, 1132, 209, 542, 242, 415, 193, 87, 395, 408]","[1049, 973, 1560, 1446, 1930, 559, 571, 1044, ..."
1,1,Quality Data Analyst Overview\n\nProvides anal...,quality data analyst overview provides analyti...,"[281, 1649, 339, 829, 875, 668, 618, 1345, 277...","[250, 668, 1618, 467, 1029, 2187, 613, 1528, 1..."
2,2,"Senior Data Analyst, Insights & Analytics Team...",senior data analyst insight analytics team cus...,"[1523, 797, 1608, 775, 552, 974, 1033, 1915, 6...","[514, 1538, 1323, 1261, 1085, 1906, 565, 222, ..."
3,3,Data Analyst Requisition NumberRR-0001939\nRem...,data analyst requisition numberrr0001939 remot...,"[1883, 165, 631, 234, 305, 2230, 741, 2224, 64...","[234, 95, 1328, 292, 2183, 1356, 654, 327, 115..."
4,4,Reporting Data Analyst ABOUT FANDUEL GROUP\n\n...,reporting data analyst about fanduel group fan...,"[1588, 1178, 1890, 653, 385, 1983, 681, 252, 1...","[1845, 1047, 513, 627, 1932, 2159, 844, 2157, ..."
...,...,...,...,...,...
2247,2247,RQS - IHHA - 201900004460 -1q Data Security An...,rqs ihha 201900004460 1q data security analyst...,"[2223, 2239, 2204, 2240, 2243, 2221, 1539, 177...","[1352, 284, 701, 2239, 1515, 1317, 1803, 593, ..."
2248,2248,Senior Data Analyst (Corporate Audit) Position...,senior data analyst corporate audit position s...,"[1466, 1580, 1835, 2086, 567, 1454, 1882, 2072...","[773, 750, 1894, 501, 19, 155, 2019, 1209, 208..."
2249,2249,"Technical Business Analyst (SQL, Data analytic...",technical business analyst sql data analytics ...,"[2245, 2216, 2185, 1197, 1194, 2062, 1975, 952...","[2245, 1914, 2011, 741, 174, 479, 187, 1952, 1..."
2250,2250,"Data Analyst 3, Customer Experience Summary\n\...",data analyst 3 customer experience summary res...,"[2233, 2212, 679, 1518, 1624, 741, 2072, 544, ...","[2233, 2212, 794, 1503, 1994, 1190, 397, 2065,..."


## Iteration3: Text similarity with Contextual Embedding (BERT)

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('bert-base-nli-mean-tokens')

## Bert is a contextual algorithm it doesnot require Text Preprocessing and Lemma
corpus = df.raw_full_text.values.tolist()

In [None]:
# encode corpus

sentence_embeddings = model.encode(corpus)

In [None]:
sentence_embeddings.shape

(2252, 768)

In [None]:
similarity_matrix_bert = cosine_similarity(sentence_embeddings, sentence_embeddings)

In [None]:
df['recommendation_list_bert'] = None
topK = 10
for indx in df.index:
    df.at[indx, 'recommendation_list_bert'] = [a for a,b in heapq.nlargest(topK+1, enumerate(similarity_matrix[df[df['JobId']==df.at[indx, 'JobId']].index[0]]), key=lambda x: x[1])][1:]


In [None]:
df

Unnamed: 0,JobId,raw_full_text,full_text,recommendation_list_tfidf,recommendation_list_Doc2Vec,recommendation_list_bert
0,0,"Data Analyst, Center on Immigration and Justic...",data analyst center on immigration and justice...,"[57, 1132, 209, 542, 242, 415, 193, 87, 395, 408]","[1049, 973, 1560, 1446, 1930, 559, 571, 1044, ...","[57, 1132, 209, 542, 242, 415, 193, 87, 395, 408]"
1,1,Quality Data Analyst Overview\n\nProvides anal...,quality data analyst overview provides analyti...,"[281, 1649, 339, 829, 875, 668, 618, 1345, 277...","[250, 668, 1618, 467, 1029, 2187, 613, 1528, 1...","[281, 1649, 339, 829, 875, 668, 618, 1345, 277..."
2,2,"Senior Data Analyst, Insights & Analytics Team...",senior data analyst insight analytics team cus...,"[1523, 797, 1608, 775, 552, 974, 1033, 1915, 6...","[514, 1538, 1323, 1261, 1085, 1906, 565, 222, ...","[1523, 797, 1608, 775, 552, 974, 1033, 1915, 6..."
3,3,Data Analyst Requisition NumberRR-0001939\nRem...,data analyst requisition numberrr0001939 remot...,"[1883, 165, 631, 234, 305, 2230, 741, 2224, 64...","[234, 95, 1328, 292, 2183, 1356, 654, 327, 115...","[1883, 165, 631, 234, 305, 2230, 741, 2224, 64..."
4,4,Reporting Data Analyst ABOUT FANDUEL GROUP\n\n...,reporting data analyst about fanduel group fan...,"[1588, 1178, 1890, 653, 385, 1983, 681, 252, 1...","[1845, 1047, 513, 627, 1932, 2159, 844, 2157, ...","[1588, 1178, 1890, 653, 385, 1983, 681, 252, 1..."
...,...,...,...,...,...,...
2247,2247,RQS - IHHA - 201900004460 -1q Data Security An...,rqs ihha 201900004460 1q data security analyst...,"[2223, 2239, 2204, 2240, 2243, 2221, 1539, 177...","[1352, 284, 701, 2239, 1515, 1317, 1803, 593, ...","[2223, 2239, 2204, 2240, 2243, 2221, 1539, 177..."
2248,2248,Senior Data Analyst (Corporate Audit) Position...,senior data analyst corporate audit position s...,"[1466, 1580, 1835, 2086, 567, 1454, 1882, 2072...","[773, 750, 1894, 501, 19, 155, 2019, 1209, 208...","[1466, 1580, 1835, 2086, 567, 1454, 1882, 2072..."
2249,2249,"Technical Business Analyst (SQL, Data analytic...",technical business analyst sql data analytics ...,"[2245, 2216, 2185, 1197, 1194, 2062, 1975, 952...","[2245, 1914, 2011, 741, 174, 479, 187, 1952, 1...","[2245, 2216, 2185, 1197, 1194, 2062, 1975, 952..."
2250,2250,"Data Analyst 3, Customer Experience Summary\n\...",data analyst 3 customer experience summary res...,"[2233, 2212, 679, 1518, 1624, 741, 2072, 544, ...","[2233, 2212, 794, 1503, 1994, 1190, 397, 2065,...","[2233, 2212, 679, 1518, 1624, 741, 2072, 544, ..."


#### To evaluate these methods in production, we can perform A-B Testing and check business metrics ClickThroughRate.
#### If DataOps manual team support is possible, we can have ranking metrics of recommendation method: Mean Reciprocal Rank, Mean Average Precision and nDCG for each of the method. 

##### Evaluation Proxy: 
    Score = (Summation(Number of Items within top10 have product category = reference product_category / Number of Items (in our case 10) ))