In [1]:
import re
import string

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

The **Combined_Jobs_Final.csv** file contains the main job data (**title, description, company, etc.**).

The **Job_Views.csv** file lists the jobs viewed by the user.

The **Experience.csv** file contains the user's work experience.

The **Positions_Of_Interest.csv** file contains the positions in which the user has previously shown interest.

In [22]:
def drop_stopwords(row, language):
    '''
    removes all the stop words and single-character punctuation tokens in the given row
    input: row => iterable of word tokens | language => NLTK stop-word list name (english for now)
    returns: a list of the tokens that are neither stop words nor punctuation characters
    '''
    # Build one exclusion set per call: set membership is O(1), and merging the
    # punctuation characters in avoids re-creating a list for every token.
    excluded = set(stopwords.words(language))
    excluded.update(string.punctuation)
    return [word for word in row if word not in excluded]

def lemmitize(row):
    '''
    reduces every token of the given row to its lemma, treating each token as a verb
    input: row => iterable of word tokens
    returns: a list holding one lemmatized word per input token, in the same order
    '''
    to_lemma = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in row:
        # pos='v' asks WordNet for the verb lemma of the token
        lemmas.append(to_lemma(token, pos='v'))
    return lemmas

def _data_cleaning(row):
    '''
    returns a clean/lemmitize string
    input: dataframe row
    returns: returns a string
    '''
    row = row.lower().replace('[^a-z \n\.]', ' ')
    row = nltk.word_tokenize(row)
    row = drop_stopwords(row, 'english')
    row = lemmitize(row)
    return ' '.join(row)

In [23]:
# Load the main jobs table and preview its first rows.
jobs = pd.read_csv('Combined_Jobs_Final.csv')
jobs.head()

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,...,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,...,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,...,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC
2,117,1,open,san-francisco-ca-machka-restaurants-corp-barte...,Bartender @ Machka Restaurants Corp.,Bartender,Machka Restaurants Corp.,San Francisco,California,CA,...,Food and Beverages,We are a popular Mediterranean wine bar and re...,,11.0,,,Part-Time,,2013-07-16 09:34:10 UTC,2014-08-16 15:35:37 UTC
3,121,1,open,brisbane-ca-teriyaki-house-server,Server @ Teriyaki House,Server,Teriyaki House,Brisbane,California,CA,...,Food and Beverages,● Serve food/drinks to customers in a profess...,,10.55,,,Part-Time,,2013-09-04 15:40:30 UTC,2014-08-16 15:35:38 UTC
4,127,1,open,los-angeles-ca-rosa-mexicano-sunset-kitchen-st...,Kitchen Staff/Chef @ Rosa Mexicano - Sunset,Kitchen Staff/Chef,Rosa Mexicano - Sunset,Los Angeles,California,CA,...,Food and Beverages,"Located at the heart of Hollywood, we are one ...",,10.55,,,Part-Time,,2013-07-17 15:26:18 UTC,2014-08-16 15:35:40 UTC


In [4]:
# (number of rows, number of columns) of the raw jobs table
jobs.shape

(84090, 23)

In [5]:
# Count of missing values per column
jobs.isna().sum()

Job.ID                    0
Provider                  0
Status                    0
Slug                      0
Title                     0
Position                  0
Company                2271
City                    135
State.Name              171
State.Code              171
Address               84054
Latitude                  0
Longitude                 0
Industry              83823
Job.Description          56
Requirements          84090
Salary                83861
Listing.Start           683
Listing.End             167
Employment.Type          10
Education.Required      267
Created.At                0
Updated.At                0
dtype: int64

In [24]:
# Keep only the columns needed downstream and give them dot-free names
# (original name -> new name).
column_renames = {
    'Job.ID': 'JobID',
    'Title': 'Title',
    'Position': 'Position',
    'Company': 'Company',
    'City': 'City',
    'Employment.Type': 'EmploymentType',
    'Job.Description': 'JobDescription',
}
jobs = pd.DataFrame(jobs, columns=list(column_renames.keys()))
jobs.columns = list(column_renames.values())

jobs.head()

Unnamed: 0,JobID,Title,Position,Company,City,EmploymentType,JobDescription
0,111,Server @ Tacolicious,Server,Tacolicious,Palo Alto,Part-Time,Tacolicious' first Palo Alto store just opened...
1,113,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,Part-Time,\r\n\r\nNew French Brasserie in S.F. Financia...
2,117,Bartender @ Machka Restaurants Corp.,Bartender,Machka Restaurants Corp.,San Francisco,Part-Time,We are a popular Mediterranean wine bar and re...
3,121,Server @ Teriyaki House,Server,Teriyaki House,Brisbane,Part-Time,● Serve food/drinks to customers in a profess...
4,127,Kitchen Staff/Chef @ Rosa Mexicano - Sunset,Kitchen Staff/Chef,Rosa Mexicano - Sunset,Los Angeles,Part-Time,"Located at the heart of Hollywood, we are one ..."


In [7]:
jobs.isna().sum()

JobID                0
Slug                 0
Title                0
Position             0
Company           2271
City               135
EmploymentType      10
JobDescription      56
dtype: int64

In [8]:
# Which companies have at least one listing without a city?
jobs_na = jobs[jobs['City'].isna()]
jobs_na.Company.unique()

array(['St. Francis Hospital', 'CHI Payment Systems',
       'Genesis Health Systems', 'Driveline Retail', 'Volvo Group',
       'Home Instead Senior Care', 'Genesis Health System',
       'Academic Year In America', 'Educational Testing Services',
       'CBS Healthcare Services and Staffing'], dtype=object)

In [9]:
# Fill in missing cities with each company's headquarters (looked up manually).
# Only rows whose City is missing are updated, so listings that already have a
# city are not overwritten with the headquarters location.
headquarter_city = {
    'St. Francis Hospital': 'New York',
    'CHI Payment Systems': 'Edmond',
    'Genesis Health Systems': 'Davenport',
    'Genesis Health System': 'Davenport',
    'Driveline Retail': 'Coppell',
    'Volvo Group': 'Washington',
    # NOTE(review): 'Nebraska' is a state, not a city - confirm the intended headquarters city.
    'Home Instead Senior Care': 'Nebraska',
    'Academic Year In America': 'Stamford',
    'Educational Testing Services': 'Princeton',
    'CBS Healthcare Services and Staffing': 'Urbandale',
}
city_missing = jobs['City'].isna()
# Companies absent from the mapping map to NaN, i.e. their City simply stays missing.
jobs.loc[city_missing, 'City'] = jobs.loc[city_missing, 'Company'].map(headquarter_city)

# Mistake in the company name
jobs['Company'] = jobs['Company'].replace(['Genesis Health Systems'], 'Genesis Health System')

In [10]:
# Re-check the count of missing values per column
jobs.isnull().sum()

JobID                0
Slug                 0
Title                0
Position             0
Company           2271
City                 0
EmploymentType      10
JobDescription      56
dtype: int64

In [11]:
# Inspect the listings that have no employment type recorded.
jobs_na = jobs.loc[jobs['EmploymentType'].isna()]
jobs_na.head(10)

Unnamed: 0,JobID,Slug,Title,Position,Company,City,EmploymentType,JobDescription
10768,153197,san-francisco-ca-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,San Francisco,,Uber is changing the way the world moves. From...
10769,153198,los-angeles-ca-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Los Angeles,,Uber is changing the way the world moves. From...
10770,153199,chicago-il-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Chicago,,Uber is changing the way the world moves. From...
10771,153200,boston-ma-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Boston,,Uber is changing the way the world moves. From...
10772,153201,ann-arbor-mi-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Ann Arbor,,Uber is changing the way the world moves. From...
10773,153202,oklahoma-ok-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Oklahoma,,Uber is changing the way the world moves. From...
10774,153203,omaha-ne-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Omaha,,Uber is changing the way the world moves. From...
10775,153204,lincoln-ne-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Lincoln,,Uber is changing the way the world moves. From...
10776,153205,minneapolis-mn-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,Minneapolis,,Uber is changing the way the world moves. From...
10777,153206,st-paul-mn-uber-driving-partner,Driving Partner @ Uber,Driving Partner,Uber,St. Paul,,Uber is changing the way the world moves. From...


In [25]:
# Listings without an employment type get the combined 'Full-Time/Part-Time' label.
jobs['EmploymentType'] = jobs['EmploymentType'].fillna('Full-Time/Part-Time')
jobs.isna().sum()

JobID                0
Title                0
Position             0
Company           2271
City               135
EmploymentType       0
JobDescription      56
dtype: int64

In [26]:
# Export the JobID / Title pairs to their own CSV file (no index column).
selected_columns = ['JobID', 'Title']
jobs[selected_columns].to_csv('jobs_title.csv', index=False)

---
#### Corpus
The corpus is built by concatenating the following text columns for each job:
['Position',
 'Company',
 'City',
 'EmploymentType',
 'JobDescription',
 'Title']

In [27]:
# Column names currently present in the jobs table
jobs.columns.tolist()

['JobID',
 'Title',
 'Position',
 'Company',
 'City',
 'EmploymentType',
 'JobDescription']

In [28]:
# Build one text corpus per job by joining its descriptive columns with spaces.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN, which would wipe out the whole corpus of any job that lacks a
# single field (e.g. Company).
corpus_columns = ['Position', 'Company', 'City', 'EmploymentType', 'JobDescription', 'Title']
corpus_parts = jobs[corpus_columns].fillna('').astype(str)
jobs['Corpus'] = corpus_parts[corpus_columns[0]].str.cat(corpus_parts[corpus_columns[1:]], sep=' ')
jobs.Corpus.head()

0    Server Tacolicious Palo Alto Part-Time Tacolic...
1    Kitchen Staff/Chef Claude Lane San Francisco P...
2    Bartender Machka Restaurants Corp. San Francis...
3    Server Teriyaki House Brisbane Part-Time  ● Se...
4    Kitchen Staff/Chef Rosa Mexicano - Sunset Los ...
Name: Corpus, dtype: object

In [30]:
# Drop the columns that were merged into Corpus, then replace any remaining
# missing value with a single space.
jobs = jobs.drop(
    columns=['Title', 'Position', 'Company', 'City', 'EmploymentType', 'JobDescription']
).fillna(' ')
jobs.head()

Unnamed: 0,JobID,Corpus
0,111,Server Tacolicious Palo Alto Part-Time Tacolic...
1,113,Kitchen Staff/Chef Claude Lane San Francisco P...
2,117,Bartender Machka Restaurants Corp. San Francis...
3,121,Server Teriyaki House Brisbane Part-Time ● Se...
4,127,Kitchen Staff/Chef Rosa Mexicano - Sunset Los ...


In [31]:
# Summary of the remaining columns, their dtypes and non-null counts
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84090 entries, 0 to 84089
Data columns (total 2 columns):
JobID     84090 non-null int64
Corpus    84090 non-null object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [32]:
# Count of missing values per column after the fill
jobs.isna().sum()

JobID     0
Corpus    0
dtype: int64

In [33]:
# Run every corpus entry through _data_cleaning (values are cast to str first).
jobs['Corpus'] = jobs['Corpus'].map(str).map(_data_cleaning)
jobs['Corpus'].head()

0    server tacolicious palo alto part-time tacolic...
1    kitchen staff/chef claude lane san francisco p...
2    bartender machka restaurants corp. san francis...
3    server teriyaki house brisbane part-time ● ser...
4    kitchen staff/chef rosa mexicano sunset los an...
Name: Corpus, dtype: object

In [34]:
# Order the rows by ascending JobID.
jobs = jobs.sort_values('JobID')
jobs.head()

Unnamed: 0,JobID,Corpus
66565,3,customer service bayer healthcare cincinnati p...
50456,28,kitchen staff/chef pacific catch san francisco...
66566,30,bartender dave 's american bistro olney part-t...
83985,33,server oakland part-time locate oaklandâ€™s ja...
83992,35,kitchen staff skool san francisco part-time fe...


In [35]:
# Write the JobID / cleaned Corpus table to disk without the index column.
jobs.to_csv('jobs_df.csv', index=False)