In [90]:
import pandas as pd
import numpy as np

from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.models.phrases import Phraser, Phrases

In [3]:
# Load the job-offer dataset into a DataFrame (the path is relative to the working directory).
df = pd.read_csv('job_ofer.csv')

In [6]:
# Preview the first five rows to check which columns were loaded.
df.head(5)

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


In [12]:
# Tokenize every job title with gensim's simple_preprocess; the result is a
# Series holding one list of tokens per row of df.
title_preproc = df['title'].map(simple_preprocess)
title_preproc

0                            [machine, learning, engineer]
1           [deep, learning, applied, researcher, chicago]
2                            [machine, learning, engineer]
3                     [machine, learning, data, scientist]
4                                       [cloud, architect]
5                                        [data, scientist]
6                                     [store, room, clerk]
7                              [director, of, product, us]
8               [recruiting, manager, ad, census, ext, gb]
9        [bilingual, engineer, german, germany, or, swi...
10                                             [sommelier]
11       [entry, level, project, manager, shelton, ct, ...
12       [finance, manager, firestone, industrial, prod...
13        [us, lcra, cardiovascular, remote, anywhere, in]
14          [gallagher, bassett, corporate, intern, legal]
15                               [us, head, of, marketing]
16                                  [visual, merchandise

In [51]:
# Wrap each tokenized title in a TaggedDocument whose single tag is the
# title's position in title_preproc.
title_tagged = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(title_preproc)
]

In [52]:
# Display the tagged titles to verify the words/tags pairing.
title_tagged

[TaggedDocument(words=['machine', 'learning', 'engineer'], tags=[0]),
 TaggedDocument(words=['deep', 'learning', 'applied', 'researcher', 'chicago'], tags=[1]),
 TaggedDocument(words=['machine', 'learning', 'engineer'], tags=[2]),
 TaggedDocument(words=['machine', 'learning', 'data', 'scientist'], tags=[3]),
 TaggedDocument(words=['cloud', 'architect'], tags=[4]),
 TaggedDocument(words=['data', 'scientist'], tags=[5]),
 TaggedDocument(words=['store', 'room', 'clerk'], tags=[6]),
 TaggedDocument(words=['director', 'of', 'product', 'us'], tags=[7]),
 TaggedDocument(words=['recruiting', 'manager', 'ad', 'census', 'ext', 'gb'], tags=[8]),
 TaggedDocument(words=['bilingual', 'engineer', 'german', 'germany', 'or', 'switzerland'], tags=[9]),
 TaggedDocument(words=['sommelier'], tags=[10]),
 TaggedDocument(words=['entry', 'level', 'project', 'manager', 'shelton', 'ct', 'based'], tags=[11]),
 TaggedDocument(words=['finance', 'manager', 'firestone', 'industrial', 'products'], tags=[12]),
 Tagged

In [53]:
# Doc2Vec's context-window keyword is `window`; `window_size` is not a
# Doc2Vec parameter (the gensim version that produced the outputs in this
# notebook evidently accepted it without error; newer releases reject
# unknown keywords).
# vector_size=300 gives 300-dimensional vectors; min_count=1 keeps every token.
title_model = Doc2Vec(vector_size=300, window=5, min_count=1)
# Build the vocabulary from the tagged titles before training.
title_model.build_vocab(title_tagged)

In [54]:
# Train for 10 epochs over the tagged titles; total_examples is taken from
# the model's own corpus_count.
title_model.train(
    title_tagged,
    epochs=10,
    total_examples=title_model.corpus_count,
)

In [55]:
# Documents whose learned vectors are most similar to document 0,
# returned as (tag, similarity) pairs.
title_model.docvecs.most_similar(0)

[(27487, 0.8619314432144165),
 (9158, 0.8268736600875854),
 (2102, 0.8198314905166626),
 (317, 0.7936148047447205),
 (7600, 0.7785379886627197),
 (30442, 0.7713813781738281),
 (32165, 0.7659890651702881),
 (35076, 0.7473205327987671),
 (7139, 0.7421998977661133),
 (31391, 0.7399638295173645)]

In [65]:
# Title of document 27487, the top match returned by most_similar(0) above.
df.loc[df.index == 27487, 'title'].values

array(['Assemblers and Machine Operators'], dtype=object)

In [66]:
# Title of document 2102, the third match returned by most_similar(0) above.
df.loc[df.index == 2102, 'title'].values

array(['Industrial Mechanic - Machine Installation'], dtype=object)

In [67]:
# Title of document 36105.
# NOTE(review): 36105 is not among the neighbours listed in the most_similar(0)
# output above -- this cell may be left over from an earlier run; confirm.
df.loc[df.index == 36105, 'title'].values

array(['Research Scientist, Human-Computer Interaction'], dtype=object)

In [68]:
# Title of document 30442, the sixth match returned by most_similar(0) above.
df.loc[df.index == 30442, 'title'].values

array(['Machine Operator'], dtype=object)

## Phrases

In [69]:
def prepare_corpus(corpus, bigram):
    """Lazily extend every token list with its phrase-merged version.

    Args:
        corpus: Iterable of token lists.
        bigram: Object indexable by a token list that returns a list
            (in this notebook, a gensim Phraser).

    Yields:
        ``bigram[tokens] + tokens`` for each token list, i.e. the
        phrase-merged tokens followed by the original tokens.
    """
    yield from (bigram[tokens] + tokens for tokens in corpus)

In [70]:
# Fit a Phrases model on the tokenized titles (min_count=1, threshold=1) and
# wrap it in a Phraser. NOTE(review): "birgam" looks like a typo for "bigram";
# the name is kept because other cells may reference it.
title_birgam = Phraser( Phrases(title_preproc, min_count=1, threshold=1) )
# Materialize the extended corpus: phrase-merged tokens followed by the original tokens.
ext_title_corp = list(prepare_corpus(title_preproc, title_birgam))

In [71]:
# Wrap each extended token list in a TaggedDocument whose single tag is its
# position in ext_title_corp.
ext_title_tagged = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(ext_title_corp)
]

In [72]:
# Display the extended tagged titles (phrase tokens followed by the original tokens).
ext_title_tagged

[TaggedDocument(words=['machine_learning', 'engineer', 'machine', 'learning', 'engineer'], tags=[0]),
 TaggedDocument(words=['deep_learning', 'applied', 'researcher', 'chicago', 'deep', 'learning', 'applied', 'researcher', 'chicago'], tags=[1]),
 TaggedDocument(words=['machine_learning', 'engineer', 'machine', 'learning', 'engineer'], tags=[2]),
 TaggedDocument(words=['machine_learning', 'data_scientist', 'machine', 'learning', 'data', 'scientist'], tags=[3]),
 TaggedDocument(words=['cloud_architect', 'cloud', 'architect'], tags=[4]),
 TaggedDocument(words=['data_scientist', 'data', 'scientist'], tags=[5]),
 TaggedDocument(words=['store', 'room_clerk', 'store', 'room', 'clerk'], tags=[6]),
 TaggedDocument(words=['director_of', 'product', 'us', 'director', 'of', 'product', 'us'], tags=[7]),
 TaggedDocument(words=['recruiting', 'manager_ad', 'census_ext', 'gb', 'recruiting', 'manager', 'ad', 'census', 'ext', 'gb'], tags=[8]),
 TaggedDocument(words=['bilingual', 'engineer', 'german_german

In [73]:
# Doc2Vec's context-window keyword is `window`; `window_size` is not a
# Doc2Vec parameter (the gensim version that produced the outputs in this
# notebook evidently accepted it without error; newer releases reject
# unknown keywords).
# vector_size=300 gives 300-dimensional vectors; min_count=1 keeps every token.
ext_title_model = Doc2Vec(vector_size=300, window=5, min_count=1)
# Build the vocabulary from the phrase-extended tagged titles before training.
ext_title_model.build_vocab(ext_title_tagged)

In [74]:
# total_examples must describe the corpus this model was built on, so read it
# from ext_title_model itself rather than from the separate title_model.
ext_title_model.train(
    ext_title_tagged,
    total_examples=ext_title_model.corpus_count,
    epochs=10,
)

In [96]:
# Nearest neighbours of document 0 in the phrase-extended model, as
# (tag, similarity) pairs; kept in a variable for the report loop further down.
most_similar_mle = ext_title_model.docvecs.most_similar(0)
most_similar_mle

[(24281, 0.8714897632598877),
 (6464, 0.8585706949234009),
 (25365, 0.858457088470459),
 (32029, 0.8575447201728821),
 (19929, 0.8523539900779724),
 (35640, 0.8465454578399658),
 (33091, 0.8426761627197266),
 (8750, 0.8415770530700684),
 (8135, 0.839065432548523),
 (34619, 0.835023820400238)]

In [97]:
# Title of the query document (tag 0), for comparison with its neighbours.
df.loc[df.index == 0, 'title'].values

array(['Machine Learning Engineer'], dtype=object)

In [98]:
# Print each neighbour's title together with its similarity rounded to three decimals.
for doc_tag, similarity in most_similar_mle:
    matched_titles = df[df.index == doc_tag].title.values
    score_text = str(np.round(similarity, decimals=3))
    print(matched_titles + " with score of: " + score_text)

['Senior Data Engineer - San Francisco with score of: 0.871']
['Data Scientist / Software Engineer with score of: 0.859']
['Data Platform Engineer - San Francisco with score of: 0.858']
['Research Engineer with score of: 0.858']
['Machine Learning Engineer, Infrastructure - San Francisco with score of: 0.852']
['Data Engineer/Developer with score of: 0.847']
['Machine Learning Cloud Engineer with score of: 0.843']
['Senior Software Engineer - Data Mining with score of: 0.842']
['Functional Safety Engineer with score of: 0.839']
['CNC Engineer with score of: 0.835']
