<a href="https://colab.research.google.com/github/SDS-AAU/SDS-master/blob/master/M2/notebooks/W3_job_recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Getting data from Kaggle

In [1]:
# install kaggle package
! pip install -q kaggle

Upload your `kaggle.json` API key to the working directory.

In [2]:
# make folder for api key
! mkdir ~/.kaggle

In [3]:
# copy key into folder
! cp kaggle.json ~/.kaggle/

In [4]:
# change access permissions
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# check if worked
! kaggle datasets list

To get the data, go to the dataset's Kaggle page, open the `...` menu, and copy the API command.

In [6]:
! kaggle datasets download -d kandij/job-recommendation-datasets

Downloading job-recommendation-datasets.zip to /content
 65% 34.0M/52.4M [00:00<00:00, 157MB/s]
100% 52.4M/52.4M [00:00<00:00, 174MB/s]


In [7]:
! unzip /content/job-recommendation-datasets.zip

Archive:  /content/job-recommendation-datasets.zip
  inflating: Combined_Jobs_Final.csv  
  inflating: Experience.csv          
  inflating: Job_Views.csv           
  inflating: Positions_Of_Interest.csv  
  inflating: job_data.csv            


In [8]:
import pandas as pd
import numpy as np
import spacy

# instantiate the English pipeline by its full package name; the 'en' shortcut
# only exists in spaCy v2 installs, while 'en_core_web_sm' loads in both v2 and v3
nlp = spacy.load('en_core_web_sm')

In [9]:
# Load the four extracted Kaggle CSV files, one DataFrame per file.
df_jobs, df_views, df_poi, df_exp = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Combined_Jobs_Final.csv',
        '/content/Job_Views.csv',
        '/content/Positions_Of_Interest.csv',
        '/content/Experience.csv',
    )
)


In [12]:
df_jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84090 entries, 0 to 84089
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Job.ID              84090 non-null  int64  
 1   Provider            84090 non-null  int64  
 2   Status              84090 non-null  object 
 3   Slug                84090 non-null  object 
 4   Title               84090 non-null  object 
 5   Position            84090 non-null  object 
 6   Company             81819 non-null  object 
 7   City                83955 non-null  object 
 8   State.Name          83919 non-null  object 
 9   State.Code          83919 non-null  object 
 10  Address             36 non-null     object 
 11  Latitude            84090 non-null  float64
 12  Longitude           84090 non-null  float64
 13  Industry            267 non-null    object 
 14  Job.Description     84034 non-null  object 
 15  Requirements        0 non-null      float64
 16  Sala

In [15]:
# concatenate several columns into one text
# Missing values are replaced by '' first: a bare astype(str) would turn NaN into
# the literal word 'nan', which would then end up in the cleaned tokens.
text_parts = [
    df_jobs[column].fillna('').astype(str)
    for column in ('Position', 'Company', 'Job.Description')
]
df_jobs['text'] = df_jobs['Title'].fillna('').astype(str).str.cat(text_parts, sep=' ')

In [17]:
# progress bar
import tqdm

In [21]:
# how long would it take to run it with plain spacy?

%%time
nlp(df_jobs['text'][0])

CPU times: user 25.4 ms, sys: 844 µs, total: 26.2 ms
Wall time: 30.8 ms


Server @ Tacolicious Server Tacolicious Tacolicious' first Palo Alto store just opened recently, and we are hiring! If you love tacos, you will love working at our restaurant! 

 ● Serve food/drinks to customers in a professional manner 
 ● Act as a cashier when needed 
 ● Clean up the dining space 
 ● Train the new staff 

In [22]:
# run that for all? Extrapolate the ~30.8 ms wall time of one document to ~84,000 postings:
# ms * documents / 1000 (-> seconds) / 3600 (-> hours). That is about 0.72 h - too long
30.8*84000/1000/3600

0.7186666666666666

In [23]:
# run progress bare and clean up using spacy but without some heavy parts of the pipeline

%%time
clean_text = []


pbar = tqdm.tqdm(total=len(df_jobs['text']),position=0, leave=True)

for text in nlp.pipe(df_jobs['text'], disable=["tagger", "parser", "ner"]):

  txt = [token.lemma_.lower() for token in text 
         if token.is_alpha 
         and not token.is_stop 
         and not token.is_punct]

  clean_text.append(txt)

  pbar.update(1)

100%|█████████▉| 83992/84090 [02:23<00:00, 562.27it/s]

CPU times: user 2min 21s, sys: 1.74 s, total: 2min 23s
Wall time: 2min 23s


In [24]:
df_jobs['clean_text'] = clean_text

In [None]:
df_jobs['clean_text'].isnull().sum()

In [27]:
# upgrade gensim to the latest release; the cells below use the newer API
# (e.g. `key_to_index`, which older gensim versions called `vocab`)
!pip install --upgrade gensim -q

[K     |████████████████████████████████| 24.1 MB 1.4 MB/s 
[?25h

In [28]:
# get tooling for Word2Vec model
from gensim.models import Word2Vec

In [29]:
# enable logging
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# train word2vec model on the cleaned token lists (one list of lemmas per job posting)
#   vector_size=300 -> 300-dimensional word vectors (later cells reshape to (1, 300))
#   window=5        -> context window size
#   min_count=2     -> words seen fewer than 2 times are dropped from the vocabulary
#   workers=2       -> number of training threads
#   epochs=5        -> passes over the corpus
# NOTE(review): no seed is passed and workers > 1, so the learned vectors (and the
# similarity scores shown below) will presumably differ between runs - confirm
w2v_model = Word2Vec(sentences=df_jobs['clean_text'], vector_size=300, window=5, min_count=2, workers=2, epochs=5)

In [32]:
w2v_model.wv.similar_by_word('bartender')

[('busser', 0.7291507124900818),
 ('waitress', 0.707244336605072),
 ('hostess', 0.6812504529953003),
 ('waiter', 0.6639885902404785),
 ('finisher', 0.6277411580085754),
 ('bussers', 0.6256389617919922),
 ('bartenders', 0.6182125210762024),
 ('fry', 0.612985372543335),
 ('waitstaff', 0.6093533039093018),
 ('dishwasher', 0.5950363278388977)]

In [37]:
w2v_model.wv.similar_by_word('java')

[('javascript', 0.8004930019378662),
 ('xml', 0.7801821231842041),
 ('jquery', 0.7539994120597839),
 ('python', 0.7529684901237488),
 ('ajax', 0.7412889003753662),
 ('mvc', 0.7264665365219116),
 ('css', 0.7240338325500488),
 ('html', 0.7137460112571716),
 ('unix', 0.7129760384559631),
 ('developer', 0.7123264074325562)]

In [38]:
# check out cosine similarity for some word-vectors
from sklearn.metrics.pairwise import cosine_similarity

In [45]:
# Look up single word vectors as (1, n_dims) row matrices, the 2-D shape that
# cosine_similarity expects. reshape(1, -1) infers the width from the model
# instead of repeating the vector_size used at training time.
pizza = w2v_model.wv['pizza'].reshape(1, -1)
pasta = w2v_model.wv['pasta'].reshape(1, -1)
sushi = w2v_model.wv['sushi'].reshape(1, -1)
uber = w2v_model.wv['uber'].reshape(1, -1)

In [48]:
cosine_similarity(pizza, uber)

array([[0.14577591]], dtype=float32)

In [49]:
w2v_model.save('w2v_model')

2021-10-25 09:07:23,786 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'w2v_model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-10-25T09:07:23.786697', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-10-25 09:07:23,788 : INFO : not storing attribute cum_table
2021-10-25 09:07:23,922 : INFO : saved w2v_model


In [59]:
# define function for avg-embeddings 
# (in older versions of gensim it's not key_to_index but vocab)

def get_mean_vector(word2vec_model, words):
    """Average the word vectors of ``words`` under ``word2vec_model``.

    Parameters
    ----------
    word2vec_model : object with a ``.wv`` attribute (e.g. a trained gensim Word2Vec)
        ``.wv.key_to_index`` is used for the vocabulary check and ``.wv[...]``
        for the vector lookup.
    words : iterable of str
        Tokens to average; out-of-vocabulary tokens are ignored.

    Returns
    -------
    numpy.ndarray or list
        The element-wise mean of the in-vocabulary word vectors, or an empty
        list when none of ``words`` is in the vocabulary. Callers that stack or
        reshape the result have to handle the empty case themselves.
    """
    keyed_vectors = word2vec_model.wv
    # Filter against the model that was passed in (not a notebook-level global),
    # so the vocabulary check and the vector lookup always use the same model.
    known_words = [word for word in words if word in keyed_vectors.key_to_index]
    if not known_words:
        return []
    return np.mean(keyed_vectors[known_words], axis=0)

In [None]:
get_mean_vector(w2v_model, df_jobs['clean_text'][0])

In [62]:
# transform all texts into avg-vec-repre
avg_job_vecs = df_jobs['clean_text'].map(lambda t: get_mean_vector(w2v_model, t))

In [64]:
avg_job_vecs = np.vstack(avg_job_vecs)

In [66]:
avg_job_vecs.shape

(84090, 300)

In [67]:
df_jobs['text'][0]

"Server @ Tacolicious Server Tacolicious Tacolicious' first Palo Alto store just opened recently, and we are hiring! If you love tacos, you will love working at our restaurant! \r\n\r\n ● Serve food/drinks to customers in a professional manner \r\n ● Act as a cashier when needed \r\n ● Clean up the dining space \r\n ● Train the new staff \r\n"

In [68]:
# similarity of the first job posting to every posting; reshape(1, -1) yields the
# 2-D single-row input cosine_similarity needs without hard-coding the vector width
sims = cosine_similarity(avg_job_vecs[0].reshape(1, -1), avg_job_vecs)

In [83]:
# rank the postings by similarity: argsort orders ascending, np.flip reverses that to
# descending; sims has a single row here, so [0] picks its ranking and [:10] keeps
# the positions of the ten most similar postings
ix = np.flip(np.argsort(sims)).tolist()[0][:10]

In [84]:
df_jobs['text'][ix]

0        Server @ Tacolicious Server Tacolicious Tacoli...
84057    Server @ Kabuto Restaurant Server Kabuto Resta...
13448    Server @ BALEENkitchen Server BALEENkitchen  ●...
24471    Server @ Exotic Thai Cafe Server Exotic Thai C...
84081    Server @ Pizza Antica Server Pizza Antica  ● S...
13453    Server @ Gonpachi Server Gonpachi  ● Serve foo...
84044    Server @ Yuzu Server Yuzu  Yuzu is one of the ...
10783    Server @ La Fontaine Restaurant Server La Font...
10778    Server @ Far Niente Ristorante Server Far Nien...
84082    Server @ Giardino Server Giardino  ● Serve foo...
Name: text, dtype: object

In [93]:
query = get_mean_vector(w2v_model,['sushi','chef','palo','alto'])

In [94]:
# similarity of the free-text query vector to every posting; reshape(1, -1) yields the
# 2-D single-row input cosine_similarity needs without hard-coding the vector width
sims = cosine_similarity(query.reshape(1, -1), avg_job_vecs)
# positions of the ten most similar postings, best match first
ix = np.flip(np.argsort(sims)).tolist()[0][:10]
df_jobs['text'][ix]

84037    Sushi Chef @ Haku Sushi Sushi Chef Haku Sushi ...
84026    Sushi Chef @ Yoshi's Catering Sushi Chef Yoshi...
84025    Sushi Chef @ Chin's Sushi Bar & Restaurant Sus...
76235    Prep Cook/Dishwasher @ Haku Sushi Prep Cook/Di...
23744    Banquet Servers and Bartenders @ All Team Staf...
4        Kitchen Staff/Chef @ Rosa Mexicano - Sunset Ki...
82777    Server, Bartender, Cook, Dishwasher, Hostess -...
84024    Sushi Chef @ Sushi Taka Sushi Chef Sushi Taka ...
50182    Cook / Sous Chef / Chef @ Classic Staffing Ser...
54391    Chef Instructors @ Star Career Academy Chef In...
Name: text, dtype: object

In [95]:
df_jobs['text'][84037]

"Sushi Chef @ Haku Sushi Sushi Chef Haku Sushi Haku Sushi is Santa Rosa's newest sushi bar. We have 100+ seats and business is great!\r\nWe need a head sushi chef to lead our current team of 3 sushi chefs.\r\nIf you currently are NOT a head sushi chef, this is a great opportunity to move into a head position very very quickly.\r\nWe can talk about job expectations and skills required during the phone interview. Here is more info on Haku:\r\nhttp://707.pressdemocrat.com/2013-05-31/featured/cox-haku-sushi\r\nhttp://www.bohemian.com/northbay/rollin-deep/Content?oid=2419001\r\n\r\nAlso check us out on facebook and Yelp where you can see Haku is the area's best sushi restaurant...full stop."