In [60]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

The **Combined_Jobs_Final.csv** file contains the main jobs data (**title, description, company, etc.**).

The **Job_Views.csv** file contains the jobs seen by the user.

The **Experience.csv** file contains the user's experience.

The **Positions_Of_Interest.csv** file contains the positions the user has previously expressed interest in.

In [61]:
# Load the main jobs table (title, description, company, etc.) and preview its first rows.
jobs = pd.read_csv('Combined_Jobs_Final.csv')
jobs.head()

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,...,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,...,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,...,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC
2,117,1,open,san-francisco-ca-machka-restaurants-corp-barte...,Bartender @ Machka Restaurants Corp.,Bartender,Machka Restaurants Corp.,San Francisco,California,CA,...,Food and Beverages,We are a popular Mediterranean wine bar and re...,,11.0,,,Part-Time,,2013-07-16 09:34:10 UTC,2014-08-16 15:35:37 UTC
3,121,1,open,brisbane-ca-teriyaki-house-server,Server @ Teriyaki House,Server,Teriyaki House,Brisbane,California,CA,...,Food and Beverages,● Serve food/drinks to customers in a profess...,,10.55,,,Part-Time,,2013-09-04 15:40:30 UTC,2014-08-16 15:35:38 UTC
4,127,1,open,los-angeles-ca-rosa-mexicano-sunset-kitchen-st...,Kitchen Staff/Chef @ Rosa Mexicano - Sunset,Kitchen Staff/Chef,Rosa Mexicano - Sunset,Los Angeles,California,CA,...,Food and Beverages,"Located at the heart of Hollywood, we are one ...",,10.55,,,Part-Time,,2013-07-17 15:26:18 UTC,2014-08-16 15:35:40 UTC


In [62]:
# (rows, columns) of the raw jobs table.
jobs.shape

(84090, 23)

In [63]:
# Number of missing values in each raw column.
jobs.isna().sum()

Job.ID                    0
Provider                  0
Status                    0
Slug                      0
Title                     0
Position                  0
Company                2271
City                    135
State.Name              171
State.Code              171
Address               84054
Latitude                  0
Longitude                 0
Industry              83823
Job.Description          56
Requirements          84090
Salary                83861
Listing.Start           683
Listing.End             167
Employment.Type          10
Education.Required      267
Created.At                0
Updated.At                0
dtype: int64

In [64]:
# Keep only the columns used downstream and rename them to dot-free, more readable names.
column_names = {
    'Job.ID': 'JobID',
    'Slug': 'Slug',
    'Title': 'Title',
    'Position': 'Position',
    'Company': 'Company',
    'City': 'City',
    'Employment.Type': 'EmploymentType',
    'Job.Description': 'JobDescription',
}
jobs = jobs.reindex(columns=list(column_names)).rename(columns=column_names)
jobs.head()

Unnamed: 0,JobID,Slug,Title,Position,Company,City,EmploymentType,JobDescription
0,111,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,Part-Time,Tacolicious' first Palo Alto store just opened...
1,113,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,Part-Time,\r\n\r\nNew French Brasserie in S.F. Financia...
2,117,san-francisco-ca-machka-restaurants-corp-barte...,Bartender @ Machka Restaurants Corp.,Bartender,Machka Restaurants Corp.,San Francisco,Part-Time,We are a popular Mediterranean wine bar and re...
3,121,brisbane-ca-teriyaki-house-server,Server @ Teriyaki House,Server,Teriyaki House,Brisbane,Part-Time,● Serve food/drinks to customers in a profess...
4,127,los-angeles-ca-rosa-mexicano-sunset-kitchen-st...,Kitchen Staff/Chef @ Rosa Mexicano - Sunset,Kitchen Staff/Chef,Rosa Mexicano - Sunset,Los Angeles,Part-Time,"Located at the heart of Hollywood, we are one ..."


In [65]:
# Number of missing values per column after the column selection.
jobs.isna().sum()

JobID                0
Slug                 0
Title                0
Position             0
Company           2271
City               135
EmploymentType      10
JobDescription      56
dtype: int64

In [66]:
# Postings with no city on record, and the distinct companies they belong to.
city_missing = jobs['City'].isna()
jobs_na = jobs.loc[city_missing]
jobs_na['Company'].unique()

array(['St. Francis Hospital', 'CHI Payment Systems',
       'Genesis Health Systems', 'Driveline Retail', 'Volvo Group',
       'Home Instead Senior Care', 'Genesis Health System',
       'Academic Year In America', 'Educational Testing Services',
       'CBS Healthcare Services and Staffing'], dtype=object)

In [67]:
# Headquarters city of every company that has postings with no city on record
# (looked up manually).
headquarters = {
    'St. Francis Hospital': 'New York',
    'CHI Payment Systems': 'Edmond',
    'Genesis Health Systems': 'Davenport',
    'Genesis Health System': 'Davenport',
    'Driveline Retail': 'Coppell',
    'Volvo Group': 'Washington',
    # Previously 'Nebraska', which is a state; the headquarters city is Omaha.
    'Home Instead Senior Care': 'Omaha',
    'Academic Year In America': 'Stamford',
    'Educational Testing Services': 'Princeton',
    'CBS Healthcare Services and Staffing': 'Urbandale',
}

# Fill only the rows whose city is missing. Assigning by company alone would
# also overwrite postings of the same company that already carry a valid city.
city_missing = jobs['City'].isna()
jobs.loc[city_missing, 'City'] = jobs.loc[city_missing, 'Company'].map(headquarters)

# Mistake in the company name: normalise the misspelled variant.
jobs['Company'] = jobs['Company'].replace(['Genesis Health Systems'], 'Genesis Health System')

In [68]:
# Re-check missing values after filling in the cities.
jobs.isnull().sum()

JobID                0
Slug                 0
Title                0
Position             0
Company           2271
City                 0
EmploymentType      10
JobDescription      56
dtype: int64

In [69]:
# Postings that have no employment type recorded.
employment_type_missing = jobs['EmploymentType'].isna()
jobs_na = jobs[employment_type_missing]
jobs_na.head(10)

Unnamed: 0,JobID,Slug,Title,Position,Company,City,EmploymentType,JobDescription
10768,153197,san-francisco-ca-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,San Francisco,,Uber is changing the way the world moves. From...
10769,153198,los-angeles-ca-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Los Angeles,,Uber is changing the way the world moves. From...
10770,153199,chicago-il-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Chicago,,Uber is changing the way the world moves. From...
10771,153200,boston-ma-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Boston,,Uber is changing the way the world moves. From...
10772,153201,ann-arbor-mi-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Ann Arbor,,Uber is changing the way the world moves. From...
10773,153202,oklahoma-ok-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Oklahoma,,Uber is changing the way the world moves. From...
10774,153203,omaha-ne-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Omaha,,Uber is changing the way the world moves. From...
10775,153204,lincoln-ne-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Lincoln,,Uber is changing the way the world moves. From...
10776,153205,minneapolis-mn-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Minneapolis,,Uber is changing the way the world moves. From...
10777,153206,st-paul-mn-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,St. Paul,,Uber is changing the way the world moves. From...


In [72]:
# Postings without an employment type are labelled as open to both kinds.
jobs['EmploymentType'] = jobs['EmploymentType'].fillna('Full-Time/Part-Time')
jobs.isna().sum()

JobID                0
Slug                 0
Title                0
Position             0
Company           2271
City                 0
EmploymentType       0
JobDescription      56
dtype: int64

In [74]:
# Save the job id / title pairs on their own, without the dataframe index.
selected_columns = ['JobID', 'Title']
jobs[selected_columns].to_csv('jobs_title.csv', index=False)

---
#### Corpus
I create a corpus by concatenating the following columns:
['Position',
 'Slug',
 'Company',
 'City',
 'EmploymentType',
 'JobDescription']

In [40]:
# Column names currently in the jobs table.
list(jobs)

['JobID',
 'Slug',
 'Title',
 'Position',
 'Company',
 'City',
 'EmploymentType',
 'JobDescription']

In [41]:
# Build one text field per job by joining the descriptive columns with spaces.
# Missing values are treated as empty strings (na_rep=''): with plain `+`,
# str + NaN is NaN, so a single missing Company or JobDescription would wipe
# out the whole corpus of that job.
other_text_columns = ['Slug', 'Company', 'City', 'EmploymentType', 'JobDescription']
jobs['Corpus'] = jobs['Position'].map(str).str.cat(jobs[other_text_columns], sep=' ', na_rep='')
jobs.Corpus.head()

0    Server palo-alto-ca-tacolicious-server Tacolic...
1    Kitchen Staff/Chef san-francisco-ca-claude-lan...
2    Bartender san-francisco-ca-machka-restaurants-...
3    Server brisbane-ca-teriyaki-house-server Teriy...
4    Kitchen Staff/Chef los-angeles-ca-rosa-mexican...
Name: Corpus, dtype: object

In [42]:
# The source text columns are now folded into Corpus; drop them and blank out any remaining NaN.
merged_columns = ['Title', 'Slug', 'Position', 'Company', 'City', 'EmploymentType', 'JobDescription']
jobs = jobs.drop(columns=merged_columns).fillna(' ')
jobs.head()

Unnamed: 0,JobID,Corpus
0,111,Server palo-alto-ca-tacolicious-server Tacolic...
1,113,Kitchen Staff/Chef san-francisco-ca-claude-lan...
2,117,Bartender san-francisco-ca-machka-restaurants-...
3,121,Server brisbane-ca-teriyaki-house-server Teriy...
4,127,Kitchen Staff/Chef los-angeles-ca-rosa-mexican...


In [43]:
# Column dtypes and non-null counts of the reduced table.
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84090 entries, 0 to 84089
Data columns (total 2 columns):
JobID     84090 non-null int64
Corpus    84090 non-null object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [44]:
# Confirm that no missing values remain.
jobs.isna().sum()

JobID     0
Corpus    0
dtype: int64

In [45]:
# Replace every character that is not an ASCII letter or a space with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text unchanged.
jobs.Corpus = jobs.Corpus.str.replace('[^a-zA-Z ]', ' ', regex=True)
jobs.Corpus.head()

0    Server palo alto ca tacolicious server Tacolic...
1    Kitchen Staff Chef san francisco ca claude lan...
2    Bartender san francisco ca machka restaurants ...
3    Server brisbane ca teriyaki house server Teriy...
4    Kitchen Staff Chef los angeles ca rosa mexican...
Name: Corpus, dtype: object

In [46]:
# Lower-case the corpus text.
jobs['Corpus'] = jobs['Corpus'].str.lower()
jobs['Corpus'].head()

0    server palo alto ca tacolicious server tacolic...
1    kitchen staff chef san francisco ca claude lan...
2    bartender san francisco ca machka restaurants ...
3    server brisbane ca teriyaki house server teriy...
4    kitchen staff chef los angeles ca rosa mexican...
Name: Corpus, dtype: object

In [47]:
# Tokenizing
def tokenizer(row):
    '''
    Split a text into word tokens using NLTK.
    input: row => the text held in one dataframe row
    returns: the tokens produced by nltk.word_tokenize for that text
    '''
    tokens = nltk.word_tokenize(row)
    return tokens

In [48]:
# Turn each corpus string into a list of tokens.
jobs['Corpus'] = jobs['Corpus'].apply(tokenizer)
jobs.head()

Unnamed: 0,JobID,Corpus
0,111,"[server, palo, alto, ca, tacolicious, server, ..."
1,113,"[kitchen, staff, chef, san, francisco, ca, cla..."
2,117,"[bartender, san, francisco, ca, machka, restau..."
3,121,"[server, brisbane, ca, teriyaki, house, server..."
4,127,"[kitchen, staff, chef, los, angeles, ca, rosa,..."


In [49]:
# Stop-word sets already loaded, keyed by language name.
_STOP_WORDS_BY_LANGUAGE = {}


def drop_stopwords(row, language):
    '''
    removes all the stop words in the given row
    input: row => iterable of words | language => name passed to
           stopwords.words (english for now)
    returns: a list of the words of row that are not stop words,
             in their original order
    '''
    # This function is applied once per dataframe row, so the stop-word set is
    # built only the first time a language is seen and reused afterwards.
    stop_words = _STOP_WORDS_BY_LANGUAGE.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        _STOP_WORDS_BY_LANGUAGE[language] = stop_words
    return [word for word in row if word not in stop_words]

In [50]:
# Remove the English stop words from every token list.
jobs['Corpus'] = jobs['Corpus'].apply(drop_stopwords, args=('english',))
jobs.head()

Unnamed: 0,JobID,Corpus
0,111,"[server, palo, alto, ca, tacolicious, server, ..."
1,113,"[kitchen, staff, chef, san, francisco, ca, cla..."
2,117,"[bartender, san, francisco, ca, machka, restau..."
3,121,"[server, brisbane, ca, teriyaki, house, server..."
4,127,"[kitchen, staff, chef, los, angeles, ca, rosa,..."


In [51]:
def stemming(row):
    '''
    Reduce every word of the given row to its Porter stem.
    input: row => iterable of words
    returns: a list with the stem of each word, in the same order
    '''
    stem = PorterStemmer().stem
    stems = []
    for word in row:
        stems.append(stem(word))
    return stems

In [52]:
# Stem every token of every job.
jobs['Corpus'] = jobs['Corpus'].apply(stemming)
jobs.head()

Unnamed: 0,JobID,Corpus
0,111,"[server, palo, alto, ca, tacolici, server, tac..."
1,113,"[kitchen, staff, chef, san, francisco, ca, cla..."
2,117,"[bartend, san, francisco, ca, machka, restaur,..."
3,121,"[server, brisban, ca, teriyaki, hous, server, ..."
4,127,"[kitchen, staff, chef, lo, angel, ca, rosa, me..."


In [53]:
def rejoin_words(row):
    '''
    Glue the words of a row back together, separated by single spaces.
    input: row => list of words
    returns: one string containing all the words of the row
    '''
    separator = ' '
    return separator.join(row)

In [54]:
# Collapse each token list back into a single space-separated string.
jobs['Corpus'] = jobs['Corpus'].apply(rejoin_words)

In [57]:
# Order the jobs by ascending job id.
jobs = jobs.sort_values('JobID')
jobs.head()

Unnamed: 0,JobID,Corpus
66565,3,custom servic cincinnati oh bayer healthcar cu...
50456,28,kitchen staff chef san francisco ca pacif catc...
66566,30,bartend olney md dave american bistro bartend ...
83985,33,server oakland ca server oakland part time loc...
83992,35,kitchen staff san francisco ca skool kitchen s...


In [56]:
# Write the JobID / Corpus table to disk without the dataframe index.
jobs.to_csv('jobs_df.csv', index=False)