In [1]:
import pandas as pd

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

from ast import literal_eval

In [2]:
ls -lh 'Data/data'

total 165M
-rwxr--r-- 1 witek witek 128M Sep  4 22:53 [0m[01;32mjob_ofer.csv[0m*
-rwxr--r-- 1 witek witek  38M Sep  4 22:54 [01;32mjob_ofer.csv.tar.gz[0m*


In [3]:
df = pd.read_csv('Data/data/job_ofer.csv')
df.shape

(36109, 8)

In [4]:
df.head()

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


## Word2Vec

In [5]:
corpus = df['title'].map(simple_preprocess)

In [6]:
corpus.head()

0                     [machine, learning, engineer]
1    [deep, learning, applied, researcher, chicago]
2                     [machine, learning, engineer]
3              [machine, learning, data, scientist]
4                                [cloud, architect]
Name: title, dtype: object

In [7]:
model = Word2Vec(corpus, size=100, window=2, min_count=1)

In [8]:
model.wv.most_similar('machine')

[('deep', 0.9511950016021729),
 ('saving', 0.8791294097900391),
 ('captivate', 0.8485116958618164),
 ('edge', 0.8300778865814209),
 ('acceleration', 0.8287708759307861),
 ('natural', 0.8230324983596802),
 ('personalization', 0.8175762891769409),
 ('nlp', 0.8129515051841736),
 ('computer', 0.8127500414848328),
 ('scientists', 0.8125177621841431)]

## Przykład:

In [9]:
corpus = [
    ['a', 'b', 'c'],
    ['a', 'b', 'x'],
    ['y', 'a', 'b', 'w'],
    ['q', 'a', 'b', 'u', 'k'],
]

bigram = Phraser( Phrases(corpus, min_count=1, threshold=1) )

bigram[['k', 'a', 'b', 'c']]

['k', 'a_b', 'c']

### Title + Phrases:

In [10]:
title_corpus = df['title'].map(simple_preprocess)

title_bigram = Phraser( Phrases(title_corpus, min_count=1, threshold=1))

In [11]:
title_bigram[ simple_preprocess('Deep Learning Applied Researcher - Chicago') ]

['deep_learning', 'applied', 'researcher', 'chicago']

In [12]:
title_corpus_phrase = [title_bigram[sent] for sent in title_corpus]
model = Word2Vec(title_corpus_phrase, size=100, window=2, min_count=1)

In [13]:
model.wv.most_similar('machine')

[('history', 0.9702861309051514),
 ('exciting_opportunity', 0.9691668152809143),
 ('paid', 0.9683700799942017),
 ('na', 0.9683697819709778),
 ('dog', 0.968010425567627),
 ('opportunities_in', 0.9673073291778564),
 ('supply', 0.9672995805740356),
 ('engineer_test', 0.9671096205711365),
 ('officer_based', 0.9671027660369873),
 ('vermont', 0.967066764831543)]

In [16]:
def prepare_corpus(corpus, bigram):
    for sent in corpus:
        yield bigram[sent] + list(sent)

In [17]:
ext_corp = list(prepare_corpus(df['description'], title_bigram))
title_model = Word2Vec(ext_corp, size=100, window=2, min_count=1)

In [19]:
title_model.most_similar('machine')

  """Entry point for launching an IPython kernel.


KeyError: "word 'machine' not in vocabulary"

### Description

In [None]:
#simple_preprocess(df.sample()['description'].values[0])

In [None]:
descr_corpus = df['description'].map(simple_preprocess)
descr_bigram = Phraser( Phrases(descr_corpus, min_count=1, threshold=1))

ext_descr_corp = list(prepare_corpus(descr_corpus, descr_bigram))
descr_model = Word2Vec(ext_descr_corp, size=100, window=2, min_count=1)

In [None]:
descr_model.wv.most_similar('python')

In [None]:
title_model.wv.most_similar('nlp')

In [None]:
for line in df.sample()['description'].map(literal_eval).values[0]:
    print(line)
    print('')

### Visualizations:

In [67]:
import pyLDAvis
from pyLDAvis import gensim

lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(lda_vis)

NameError: name 'lda_model' is not defined

## Sections:

In [57]:
sect_descr_corpus = df['description'].map(simple_preprocess)

In [59]:
sect_descr_bigram = Phraser( Phrases(sect_descr_corpus, min_count=5, threshold=100))

In [60]:
sect_ext_descr_corp = list(prepare_corpus(sect_descr_corpus, sect_descr_bigram))

In [61]:
sect_descr_model = Word2Vec(sect_ext_descr_corp, size=100, window=2, min_count=1)

In [65]:
sect_descr_model.most_similar("requirement")

  """Entry point for launching an IPython kernel.


[('requirements', 0.6367955207824707),
 ('required', 0.6236687302589417),
 ('mandatory', 0.47373515367507935),
 ('specifications', 0.46284976601600647),
 ('preferred', 0.4421192407608032),
 ('qualifications', 0.4415714144706726),
 ('qualification', 0.4363916218280792),
 ('desired', 0.42222630977630615),
 ('necessary', 0.41833603382110596),
 ('demands', 0.4157865643501282)]