In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from collections import Counter
import time

import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Import and preview data
It came from here: https://github.com/h2oai/h2o-3/blob/master/h2o-py/demos/word2vec_craigslistjobtitles.ipynb

In [2]:
cl = pd.read_csv("https://raw.githubusercontent.com/h2oai/sparkling-water/rel-1.6/examples/smalldata/craigslistJobTitles.csv",
                encoding='ISO-8859-1')

In [3]:
cl.head(10)

Unnamed: 0,category,jobtitle
0,education,After School Supervisor
1,education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE..."
2,education,Bay Area Family Recruiter
3,education,Adult Day Programs/Community Access/Job Coaches
4,education,General Counselor - Non Tenure track
5,education,Part-Time Summer Math Teachers/Tutors
6,education,Preschool Teacher (temp-to-hire)
7,education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE..."
8,education,Private Teachers and Tutors Needed in the Sout...
9,education,Art Therapist at Esther B. Clark School


In [4]:
cl.shape

(13845, 2)

In [5]:
cl.category.value_counts()

labor              2500
administrative     2500
foodbeverage       2495
education          2438
customerservice    2319
accounting         1593
Name: category, dtype: int64

## Data cleaning / processing / langauge parsing

In [6]:
nlp = spacy.load('en')

In [7]:
## Attempt to nlp all text without cleaning

nlp_col = []
for row in cl.index:
    nlp_list = []
    nlp_text = nlp(cl.loc[row, 'jobtitle'].lower())
    for token in nlp_text:
        nlp_list.append(token)
    nlp_col.append(nlp_list)

cl['jobtitle_nlp'] = nlp_col

In [8]:
cl

Unnamed: 0,category,jobtitle,jobtitle_nlp
0,education,After School Supervisor,"[after, school, supervisor]"
1,education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE...","[*, *, *, *, *, tutors, needed, -, for, all, s..."
2,education,Bay Area Family Recruiter,"[bay, area, family, recruiter]"
3,education,Adult Day Programs/Community Access/Job Coaches,"[adult, day, programs, /, community, access, /..."
4,education,General Counselor - Non Tenure track,"[general, counselor, -, non, tenure, track]"
5,education,Part-Time Summer Math Teachers/Tutors,"[part, -, time, summer, math, teachers, /, tut..."
6,education,Preschool Teacher (temp-to-hire),"[preschool, teacher, (, temp, -, to, -, hire, )]"
7,education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE...","[*, *, *, *, *, tutors, needed, -, for, all, s..."
8,education,Private Teachers and Tutors Needed in the Sout...,"[private, teachers, and, tutors, needed, in, t..."
9,education,Art Therapist at Esther B. Clark School,"[art, therapist, at, esther, b., clark, school]"


In [9]:
## Remove all punctuation because not needed for this situation
## Remove stopwords too

no_punct = []
for row in cl.index:
    text = cl.loc[row, 'jobtitle_nlp']
    no_punct_text = []
    for token in text:
        if not token.is_punct and not token.is_stop:
            no_punct_text.append(token)
    no_punct.append(no_punct_text)
    
cl['job_clean'] = no_punct

In [10]:
cl.head(20)

Unnamed: 0,category,jobtitle,jobtitle_nlp,job_clean
0,education,After School Supervisor,"[after, school, supervisor]","[school, supervisor]"
1,education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE...","[*, *, *, *, *, tutors, needed, -, for, all, s...","[tutors, needed, subjects, ages]"
2,education,Bay Area Family Recruiter,"[bay, area, family, recruiter]","[bay, area, family, recruiter]"
3,education,Adult Day Programs/Community Access/Job Coaches,"[adult, day, programs, /, community, access, /...","[adult, day, programs, community, access, job,..."
4,education,General Counselor - Non Tenure track,"[general, counselor, -, non, tenure, track]","[general, counselor, non, tenure, track]"
5,education,Part-Time Summer Math Teachers/Tutors,"[part, -, time, summer, math, teachers, /, tut...","[time, summer, math, teachers, tutors]"
6,education,Preschool Teacher (temp-to-hire),"[preschool, teacher, (, temp, -, to, -, hire, )]","[preschool, teacher, temp, hire]"
7,education,"*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE...","[*, *, *, *, *, tutors, needed, -, for, all, s...","[tutors, needed, subjects, ages]"
8,education,Private Teachers and Tutors Needed in the Sout...,"[private, teachers, and, tutors, needed, in, t...","[private, teachers, tutors, needed, south, bay]"
9,education,Art Therapist at Esther B. Clark School,"[art, therapist, at, esther, b., clark, school]","[art, therapist, esther, b., clark, school]"


In [11]:
cl.loc[6, 'job_clean'][1].lemma_

'teacher'

In [12]:
## Won't lemmatize at the moment because it doesn't take care of plurals
## Could use later to see if it improves model

In [13]:
## Train test set now

x = cl['job_clean']
y = cl['category']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=98,
                                                   stratify=y)
training_data = pd.DataFrame({'category': y_train,
                              'jobtitle': x_train})

## Create features

### Bag of Words

In [14]:
## Split into categories. Find the top 100 words for each category

categories = list(y_train.unique())
categories

['foodbeverage',
 'labor',
 'administrative',
 'customerservice',
 'education',
 'accounting']

In [15]:
## Find most common words in each category

common_words = []
for cat in categories:
    cat_words = []
    cat = training_data.loc[training_data['category']==cat, ]
    for row in cat.index:
        for token in cat.loc[row, 'jobtitle']:
            cat_words.append(token.lemma_)
    cat_results = [item for item in Counter(cat_words).most_common(100)]
    for word in cat_results:
        common_words.append(word[0])

unique_words = set(common_words)

In [16]:
len(unique_words)

344

In [17]:
'Bay' in unique_words

False

In [18]:
training_data.head()

Unnamed: 0,category,jobtitle
3120,foodbeverage,"[peet, coffee, tea, hiring, retail, staff, ori..."
7330,labor,"[construction, laborer, $, 15/hr]"
10059,administrative,"[earn, learn]"
2580,foodbeverage,"[cook, food, service, worker]"
3177,foodbeverage,"[general, manager, food, facility, menlo, park..."


In [19]:
def create_bow(df):

    start_time = time.process_time()
    
    df_bow = pd.DataFrame(columns=unique_words)
    df_bow['jobtitle'] = training_data['jobtitle']
    df_bow['category'] = training_data['category']

    for row in df_bow.index:
        jobwords = df_bow.loc[row, 'jobtitle']
        jobwords_count = [item for item in Counter(jobwords).most_common()]
        for word in jobwords_count:
            if str(word[0]) in unique_words:
                df_bow.loc[row, str(word[0])] = word[1]
        if row % 50 == 0:
            print('Processing row {}'.format(row))
            print('Time taken: {}'.format(time.process_time() - start_time))
    
    return df_bow

In [20]:
x_train_bow = create_bow(training_data)

Processing row 2300
Time taken: 2.8830930000000023
Processing row 4700
Time taken: 13.133588000000003
Processing row 3650
Time taken: 16.086872
Processing row 11100
Time taken: 21.948888999999994
Processing row 2500
Time taken: 37.04075399999999
Processing row 10450
Time taken: 48.67643100000001
Processing row 9450
Time taken: 64.177649
Processing row 9000
Time taken: 69.160292
Processing row 2850
Time taken: 77.360949
Processing row 10200
Time taken: 80.900185
Processing row 7800
Time taken: 82.08648600000001
Processing row 5150
Time taken: 84.836233
Processing row 6650
Time taken: 86.63709399999999
Processing row 7600
Time taken: 87.98988299999999
Processing row 11450
Time taken: 90.57564099999999
Processing row 8950
Time taken: 93.825839
Processing row 2450
Time taken: 101.233747
Processing row 2250
Time taken: 101.57480699999999
Processing row 10400
Time taken: 105.230444
Processing row 3050
Time taken: 109.26368400000001
Processing row 2600
Time taken: 118.124606
Processing row 10

Processing row 10700
Time taken: 1017.8809239999999
Processing row 4400
Time taken: 1023.010789
Processing row 12900
Time taken: 1026.904438
Processing row 12350
Time taken: 1028.900512
Processing row 1350
Time taken: 1029.0185239999998
Processing row 9600
Time taken: 1029.6770849999998
Processing row 11150
Time taken: 1049.3919489999998
Processing row 8200
Time taken: 1052.95159
Processing row 13050
Time taken: 1058.0432549999998
Processing row 5600
Time taken: 1060.972603
Processing row 1550
Time taken: 1069.6726879999999
Processing row 8450
Time taken: 1072.514539
Processing row 13000
Time taken: 1079.1257719999999
Processing row 7550
Time taken: 1081.297366
Processing row 9350
Time taken: 1082.349317
Processing row 10600
Time taken: 1086.5033899999999
Processing row 4850
Time taken: 1088.181398
Processing row 5750
Time taken: 1088.686231
Processing row 11900
Time taken: 1091.3630449999998
Processing row 13550
Time taken: 1093.8458449999998
Processing row 7050
Time taken: 1099.88000

In [21]:
x_train_bow.loc[0, 'school'] ## Should be 1

1

In [22]:
x_train_bow.shape

(10383, 346)

In [23]:
x_train_bow.head()

Unnamed: 0,event,teacher,planning,exp,driver!,require,,insurance,clean,dining,...,ar,jose,resource,accounting,behavioral,housekeeper,executive,design,jobtitle,category
3120,,,,,,,,,,,...,,,,,,,,,"[peet, coffee, tea, hiring, retail, staff, ori...",foodbeverage
7330,,,,,,,,,,,...,,,,,,,,,"[construction, laborer, $, 15/hr]",labor
10059,,,,,,,,,,,...,,,,,,,,,"[earn, learn]",administrative
2580,,,,,,,,,,,...,,,,,,,,,"[cook, food, service, worker]",foodbeverage
3177,,,,,,,,,,,...,,,,,,,,,"[general, manager, food, facility, menlo, park...",foodbeverage


In [24]:
x_train_bow.drop(['jobtitle', 'category'], axis=1, inplace=True)

In [30]:
x_train_bow.fillna(value=0, inplace=True)

### tf-idf

In [31]:
## Need data in strings not tokens

x_str = cl['jobtitle']
y_str = cl['category']

x_train_str, x_test_str, y_train_str, y_test_str = train_test_split(x_str, y_str, random_state=98,
                                                   stratify=y_str)

In [32]:
vectorizer = TfidfVectorizer(max_df = 0.9,
                            min_df = 10,
                            stop_words='english',
                            use_idf=True,
                             lowercase=True,
                            norm=u'l2',
                            smooth_idf=True)

In [33]:
x_train_str[0]

'After School Supervisor'

In [34]:
x_train_tfidf = vectorizer.fit_transform(x_train_str)
print('Number of features: {}'.format(x_train_tfidf.get_shape()[1]))

Number of features: 726


## Fit supervised learning models

### BoW model

In [35]:
## Logistic regression

lr1_bow = LogisticRegression(penalty='l2', random_state=98)

lr1_scores_bow = cross_val_score(lr1_bow, x_train_bow, y_train, cv=5)

print('Logistic Regression scores:', lr1_scores_bow)



Logistic Regression scores: [0.75842156 0.75589793 0.74337987 0.7504817  0.74987952]


In [36]:
## SVM

svm1_bow = SVC(kernel='linear', random_state=98)

svm1_scores_bow = cross_val_score(svm1_bow, x_train_bow, y_train, cv=5)

print('SVM scores:', svm1_scores_bow)

SVM scores: [0.74879692 0.75108329 0.73760231 0.74132948 0.75084337]


In [40]:
## Both models performed similarly.

### tf-idf model

In [37]:
## Logistic regression

lr1_tfidf = LogisticRegression(penalty='l2', random_state=98)

lr1_scores_tfidf = cross_val_score(lr1_tfidf, x_train_tfidf, y_train_str, cv=5)

print('Logistic Regression scores:', lr1_scores_tfidf)



Logistic Regression scores: [0.81183831 0.80548869 0.80115551 0.81069364 0.80674699]


In [38]:
## SVM

svm1_tfidf = SVC(kernel='linear', random_state=98)

svm1_scores_tfidf = cross_val_score(svm1_tfidf, x_train_tfidf, y_train, cv=5)

print('SVM scores:', svm1_scores_tfidf)

SVM scores: [0.80943215 0.80308137 0.80308137 0.80202312 0.80481928]


In [41]:
## Both models performed similarly again, though tf-idf appears to be
## better than bag of words by around 5%.

## Improving tf-idf

In [42]:
## Tweak some hyperparameters

vectorizer2 = TfidfVectorizer(max_df = 0.6, ## Ignore more regularly appearing words
                            min_df = 5, ## Lower threshold for inclusion
                            stop_words=None, ## Remove stopwords as less important when not whole sentences
                            use_idf=True,
                             lowercase=True,
                            norm=u'l2',
                            smooth_idf=True)

In [43]:
x_train_tfidf2 = vectorizer2.fit_transform(x_train_str)
print('Number of features: {}'.format(x_train_tfidf2.get_shape()[1]))

Number of features: 1272


In [44]:
## Almost double number of features

In [45]:
## Logistic regression

lr2_tfidf = LogisticRegression(penalty='l2', random_state=98)

lr2_scores_tfidf = cross_val_score(lr2_tfidf, x_train_tfidf2, y_train_str, cv=5)

print('Logistic Regression scores:', lr2_scores_tfidf)



Logistic Regression scores: [0.81953802 0.81608089 0.82426577 0.8155106  0.81831325]


In [48]:
## Improved by maybe 2% points

In [47]:
## SVM

svm2_tfidf = SVC(kernel='linear', random_state=98)

svm2_scores_tfidf = cross_val_score(svm2_tfidf, x_train_tfidf2, y_train, cv=5)

print('SVM scores:', svm2_scores_tfidf)

SVM scores: [0.81713186 0.81656235 0.81415503 0.82418112 0.81060241]


In [49]:
## Again improved by 1-2% points

In [50]:
## Try grid searching for best hyperparameters

In [53]:
## Logistic regression gridsearch

lr_params = {'C': [0.01, 0.1, 1, 10, 100], 
             'tol': [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.01, 0.1]}

gs_lr2_tfidf = GridSearchCV(lr2_tfidf, param_grid=lr_params, cv=5)

gs_lr2_tfidf_results = gs_lr2_tfidf.fit(x_train_tfidf2, y=y_train_str)



In [55]:
gs_lr2_tfidf_results.best_score_

0.8189347972647597

In [56]:
gs_lr2_tfidf_results.best_params_

{'C': 10, 'tol': 0.01}

In [57]:
## No real improvement here.

In [59]:
## SVM gridsearch
svm3_tfidf = SVC(random_state=98)

svm_params = {'C': [0.01, 0.1, 1, 10, 100],
             'kernel': ['linear', 'rbf']}

gs_svm3_tfidf = GridSearchCV(svm3_tfidf, param_grid=svm_params, cv=5, n_jobs=3, verbose=10)

gs_svm3_tfidf_results = gs_svm3_tfidf.fit(x_train_tfidf2, y=y_train_str)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   14.0s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   39.2s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   53.1s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  2.8min
[Parallel(n_jobs=3)]: Done  50 out of  50 | elapsed:  3.1min finished


In [60]:
gs_svm3_tfidf_results.best_score_

0.8165270153134933

In [61]:
## No real improvement here either

In [63]:
## Logistic regression with L1 penalty

lr3_tfidf = LogisticRegression(penalty='l1', random_state=98)

lr3_scores_tfidf = cross_val_score(lr3_tfidf, x_train_tfidf2, y_train_str, cv=5)

print('Logistic Regression scores:', lr3_scores_tfidf)



Logistic Regression scores: [0.81520693 0.81367357 0.80837747 0.80635838 0.81156627]


In [64]:
## Try tweaking the hyperparameters again

vectorizer3 = TfidfVectorizer(max_df = 0.9, ## Include most words
                            min_df = 2, ## Lower threshold for inclusion further
                            stop_words=None, ## Remove stopwords as less important when not whole sentences
                            use_idf=True,
                             lowercase=True,
                            norm=u'l2',
                            smooth_idf=True)

In [65]:
x_train_tfidf3 = vectorizer3.fit_transform(x_train_str)
print('Number of features: {}'.format(x_train_tfidf3.get_shape()[1]))

Number of features: 2495


In [66]:
## Around 50% more features again

In [67]:
## Logistic regression

lr4_tfidf = LogisticRegression(penalty='l2', random_state=98)

lr4_scores_tfidf = cross_val_score(lr4_tfidf, x_train_tfidf3, y_train_str, cv=5)

print('Logistic Regression scores:', lr4_scores_tfidf)



Logistic Regression scores: [0.82290664 0.82089552 0.82571016 0.82177264 0.82361446]


In [68]:
## Another 1% ish

In [69]:
## SVM

svm4_tfidf = SVC(kernel='linear', random_state=98)

svm4_scores_tfidf = cross_val_score(svm4_tfidf, x_train_tfidf3, y_train, cv=5)

print('SVM scores:', svm4_scores_tfidf)

SVM scores: [0.82242541 0.82619162 0.82956187 0.82803468 0.81686747]


In [70]:
## SVM rbf kernel

svm5_tfidf = SVC(kernel='rbf', random_state=98)

svm5_scores_tfidf = cross_val_score(svm5_tfidf, x_train_tfidf3, y_train, cv=5)

print('SVM scores:', svm5_scores_tfidf)



SVM scores: [0.30028874 0.28839673 0.3033221  0.30105973 0.30698795]


In [None]:
## Wow this is bad. Let's stick with linear kernel.