In [26]:
import numpy as np
import pandas as pd
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

The **Combined_Jobs_Final.csv** file: has the main jobs data(**title, description, company, etc.**)

The **Job_Views.csv** file: the file with the jobs seeing for the user.

The **Experience.csv**: the file containing the experience from the user.

The **Positions_Of_Interest.csv**: contains the interest the user previously has manifested.

In [27]:
def drop_stopwords(row, language):
    '''
    removes all the stop words in the given row
    input: row | language => english for now
    returns: a list of words without stop words
    '''
    stop_words = set(stopwords.words(language))
    return [word for word in row if word not in stop_words and word not in list(string.punctuation)]

def lemmitize(row):
    '''
    returns the root of each word in the given row
    input: dataframe row
    returns: returns a list of lemmatized words
    '''
    _lemmitize = WordNetLemmatizer()
    return [_lemmitize.lemmatize(word, pos='v') for word in row]

def _data_cleaning(row):
    '''
    returns a clean/lemmitize string
    input: dataframe row
    returns: returns a string
    '''
    row = row.lower().replace('[^a-z \n\.]', ' ')
    row = nltk.word_tokenize(row)
    row = drop_stopwords(row, 'english')
    row = lemmitize(row)
    return ' '.join(row)

---
## Job Views data

In [28]:
jViews = pd.read_csv('Job_Views.csv')
print(jViews.shape)
jViews.head()

(12370, 14)


Unnamed: 0,Applicant.ID,Job.ID,Title,Position,Company,City,State.Name,State.Code,Industry,View.Start,View.End,View.Duration,Created.At,Updated.At
0,10000,73666,Cashiers & Valets Needed! @ WallyPark,Cashiers & Valets Needed!,WallyPark,Newark,New Jersey,NJ,,2014-12-12 20:12:35 UTC,2014-12-12 20:31:24 UTC,1129.0,2014-12-12 20:12:35 UTC,2014-12-12 20:12:35 UTC
1,10000,96655,Macy's Seasonal Retail Fragrance Cashier - Ga...,Macy's Seasonal Retail Fragrance Cashier - Ga...,Macy's,Garden City,New York,NY,,2014-12-12 20:08:50 UTC,2014-12-12 20:10:15 UTC,84.0,2014-12-12 20:08:50 UTC,2014-12-12 20:08:50 UTC
2,10001,84141,Part Time Showroom Sales / Cashier @ Grizzly I...,Part Time Showroom Sales / Cashier,Grizzly Industrial Inc.,Bellingham,Washington,WA,,2014-12-12 20:12:32 UTC,2014-12-12 20:17:18 UTC,286.0,2014-12-12 20:12:32 UTC,2014-12-12 20:12:32 UTC
3,10002,77989,Event Specialist Part Time @ Advantage Sales &...,Event Specialist Part Time,Advantage Sales & Marketing,Simpsonville,South Carolina,SC,,2014-12-12 20:39:23 UTC,2014-12-12 20:42:13 UTC,170.0,2014-12-12 20:39:23 UTC,2014-12-12 20:39:23 UTC
4,10002,69568,Bonefish - Kitchen Staff @ Bonefish Grill,Bonefish - Kitchen Staff,Bonefish Grill,Greenville,South Carolina,SC,,2014-12-12 20:43:25 UTC,2014-12-12 20:43:58 UTC,33.0,2014-12-12 20:43:25 UTC,2014-12-12 20:43:25 UTC


In [29]:
jViews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12370 entries, 0 to 12369
Data columns (total 14 columns):
Applicant.ID     12370 non-null int64
Job.ID           12370 non-null int64
Title            12370 non-null object
Position         12370 non-null object
Company          11790 non-null object
City             12370 non-null object
State.Name       12348 non-null object
State.Code       12348 non-null object
Industry         2199 non-null object
View.Start       12370 non-null object
View.End         10575 non-null object
View.Duration    10575 non-null float64
Created.At       12370 non-null object
Updated.At       12370 non-null object
dtypes: float64(1), int64(2), object(11)
memory usage: 1.3+ MB


In [30]:
list(jViews)

['Applicant.ID',
 'Job.ID',
 'Title',
 'Position',
 'Company',
 'City',
 'State.Name',
 'State.Code',
 'Industry',
 'View.Start',
 'View.End',
 'View.Duration',
 'Created.At',
 'Updated.At']

In [31]:
jViews = jViews.drop(['Job.ID', 'Title', 'State.Name', 'State.Code', 'Industry', 'View.Start', 'View.End', 'View.Duration', 'Created.At', 'Updated.At'], axis=1)
jViews.head()

Unnamed: 0,Applicant.ID,Position,Company,City
0,10000,Cashiers & Valets Needed!,WallyPark,Newark
1,10000,Macy's Seasonal Retail Fragrance Cashier - Ga...,Macy's,Garden City
2,10001,Part Time Showroom Sales / Cashier,Grizzly Industrial Inc.,Bellingham
3,10002,Event Specialist Part Time,Advantage Sales & Marketing,Simpsonville
4,10002,Bonefish - Kitchen Staff,Bonefish Grill,Greenville


In [32]:
jViews.isna().sum()

Applicant.ID      0
Position          0
Company         580
City              0
dtype: int64

In [33]:
jViews['jViewCorpus'] = jViews['Position'].map(str) +' '+jViews["Company"] +"  "+ jViews["City"]
jViews = jViews.drop(['Position', 'Company', 'City'], axis=1)
jViews.jViewCorpus = jViews['jViewCorpus'].map(str).apply(_data_cleaning)
jViews.columns = ['ApplicantID', 'jViewCorpus']
jViews.head()

Unnamed: 0,ApplicantID,jViewCorpus
0,10000,cashier valet need wallypark newark
1,10000,macy 's seasonal retail fragrance cashier gard...
2,10001,part time showroom sales cashier grizzly indus...
3,10002,event specialist part time advantage sales mar...
4,10002,bonefish kitchen staff bonefish grill greenville


In [34]:
jViews.to_csv('jView_df.csv')

---
## Experience data

In [42]:
experience = pd.read_csv('experience.csv')
experience.head()

Unnamed: 0,Applicant.ID,Position.Name,Employer.Name,City,State.Name,State.Code,Start.Date,End.Date,Job.Description,Salary,Can.Contact.Employer,Created.At,Updated.At
0,10001,Account Manager / Sales Administration / Quali...,Barcode Resourcing,Bellingham,Washington,WA,2012-10-15,,,,,2014-12-12 20:10:02 UTC,2014-12-12 20:10:02 UTC
1,10001,Electronics Technician / Item Master Controller,Ryzex Group,Bellingham,Washington,WA,2001-12-01,2012-04-01,,,,2014-12-12 20:10:02 UTC,2014-12-12 20:10:02 UTC
2,10001,Machine Operator,comptec inc,Custer,Washington,WA,1997-01-01,1999-01-01,,,,2014-12-12 20:10:02 UTC,2014-12-12 20:10:02 UTC
3,10003,maintenance technician,Winn residental,washington,District of Columbia,DC,,,"Necessary maintenance for ""Make Ready"" Plumbin...",10.0,False,2014-12-12 21:27:05 UTC,2014-12-12 21:27:05 UTC
4,10003,Electrical Helper,michael and son services,alexandria,Virginia,VA,,,repair and services of electrical construction,,False,2014-12-12 21:27:05 UTC,2014-12-12 21:27:05 UTC


In [36]:
experience.isna().sum()

Applicant.ID               0
Position.Name            950
Employer.Name             33
City                    3731
State.Name              4058
State.Code              4058
Start.Date              2035
End.Date                3747
Job.Description         2961
Salary                  5855
Can.Contact.Employer    5072
Created.At                 0
Updated.At                 0
dtype: int64

In [37]:
list(experience)

['Applicant.ID',
 'Position.Name',
 'Employer.Name',
 'City',
 'State.Name',
 'State.Code',
 'Start.Date',
 'End.Date',
 'Job.Description',
 'Salary',
 'Can.Contact.Employer',
 'Created.At',
 'Updated.At']

In [43]:
experience = experience.drop(['Employer.Name', 'City', 'State.Name', 'State.Code', 'Start.Date', 'End.Date', 'Job.Description', 'Salary', 'Can.Contact.Employer', 'Created.At', 'Updated.At'], axis=1)
experience.columns = ['ApplicantID', 'Position']
experience.Position = experience.Position.fillna(' ')
experience.head()

Unnamed: 0,ApplicantID,Position
0,10001,Account Manager / Sales Administration / Quali...
1,10001,Electronics Technician / Item Master Controller
2,10001,Machine Operator
3,10003,maintenance technician
4,10003,Electrical Helper


As we see, there are sometimes more than one application for the applicant. Those will be marged to one.

In [45]:
experience = experience.groupby('ApplicantID', sort=False)['Position'].apply(' '.join).reset_index()
experience.Position = experience['Position'].map(str).apply(_data_cleaning)
experience = experience.sort_values(by='ApplicantID')
experience.head(10)

Unnamed: 0,ApplicantID,Position
0,2,writer uloop blog volunteer
1,3,prep cook server market intern
2,6,project assistant
3,8,deli clerk server cashier food prep order taker
4,11,cashier
5,12,server rec leader 1 program management intern
6,13,assistant offica assistant cashier
7,14,waitress host lifeguard
8,15,
9,18,barista/cashier server receptionist


In [15]:
experience.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3790 entries, 0 to 3789
Data columns (total 2 columns):
ApplicantID    3790 non-null int64
Position       3790 non-null object
dtypes: int64(1), object(1)
memory usage: 59.3+ KB


In [46]:
experience.to_csv('experience_df.csv')

---
## Position of Interest

In [47]:
poi = pd.read_csv('Positions_Of_Interest.csv')
poi = poi.sort_values(by='Applicant.ID')
poi.head()

Unnamed: 0,Applicant.ID,Position.Of.Interest,Created.At,Updated.At
6437,96,Server,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC
1156,153,Barista,2014-08-14 15:56:43 UTC,2015-02-18 02:35:06 UTC
1155,153,Host,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC
1154,153,Server,2014-08-14 15:56:42 UTC,2015-02-26 20:35:12 UTC
1158,153,Sales Rep,2014-08-14 15:56:47 UTC,2015-03-02 02:13:08 UTC


In [48]:
poi = poi.drop(['Created.At', 'Updated.At'], axis=1).fillna(' ')
poi.columns = ['ApplicantID', 'POI']
poi.head(10)

Unnamed: 0,ApplicantID,POI
6437,96,Server
1156,153,Barista
1155,153,Host
1154,153,Server
1158,153,Sales Rep
1157,153,Customer Service Rep
1952,256,Host
1957,256,Production Area
1956,256,Sales Rep
1955,256,Customer Service Rep


As we see, there are sometimes more than one position of intrest. Those will be marged to one.

In [49]:
poi.POI = poi['POI'].map(str).apply(_data_cleaning)
poi = poi.groupby('ApplicantID', sort=False)['POI'].apply(' '.join).reset_index()
poi.head(10)

Unnamed: 0,ApplicantID,POI
0,96,server
1,153,barista host server sales rep customer service...
2,256,host production area sales rep customer servic...
3,438,customer service rep barista host server
4,568,receptionist customer service rep book keeper
5,601,customer service rep server line cook
6,867,driver customer service rep book keeper chef
7,938,host receptionist customer service rep server
8,1210,line cook host train community relations custo...
9,1251,fine arts retail part-time travel


In [50]:
poi.to_csv('poi_df.csv')

---
# Marge DataFrames

In [65]:
user = jViews.merge(experience, how='outer', left_on='ApplicantID', right_on='ApplicantID').fillna(' ')
user = user.sort_values(by='ApplicantID')
user.head()

Unnamed: 0,ApplicantID,jViewCorpus,Position
12370,2,,writer uloop blog volunteer
12371,3,,prep cook server market intern
12372,6,,project assistant
12373,8,,deli clerk server cashier food prep order taker
12374,11,,cashier


In [66]:
user = user.merge(poi, how='outer', left_on='ApplicantID', right_on='ApplicantID').fillna('')
user = user.sort_values(by='ApplicantID')
user.head(10)

Unnamed: 0,ApplicantID,jViewCorpus,Position,POI
0,2,,writer uloop blog volunteer,
1,3,,prep cook server market intern,
2,6,,project assistant,
3,8,,deli clerk server cashier food prep order taker,
4,11,,cashier,
5,12,,server rec leader 1 program management intern,
6,13,,assistant offica assistant cashier,
7,14,,waitress host lifeguard,
8,15,,,
9,18,,barista/cashier server receptionist,


In [67]:
user['Corpus'] = user['jViewCorpus'].map(str)+' '+user.Position+' '+user.POI
user = user.drop(['jViewCorpus', 'Position', 'POI'], axis=1).fillna(' ')
user.columns = ['ApplicantID', 'Corpus']

print(user.shape)
user.head()

(15959, 2)


Unnamed: 0,ApplicantID,Corpus
0,2,writer uloop blog volunteer
1,3,prep cook server market intern
2,6,project assistant
3,8,deli clerk server cashier food prep order ta...
4,11,cashier


In [68]:
# drop all the rows with a empty corpus.
user = user.drop(user[user.Corpus == ' '].index, axis=0)
user.Corpus = user['Corpus'].apply(_data_cleaning)

print(user.shape)
user.head()

(15959, 2)


Unnamed: 0,ApplicantID,Corpus
0,2,writer uloop blog volunteer
1,3,prep cook server market intern
2,6,project assistant
3,8,deli clerk server cashier food prep order taker
4,11,cashier


In [69]:
user.to_csv('user_df.csv', index=False)