In [1]:
import pandas as pd
# from nltk.tokenize import word_tokenize
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

import logging

def setupLogging():
    logging.basicConfig(format='%(asctime)s  [%(levelname)s]  %(message)s', level=logging.DEBUG)

setupLogging()

In [2]:
# let's read in the data

In [3]:
healthcare_titles = pd.read_csv("Job_Titles_LABELED.csv", nrows=10000)
healthcare_titles.head(10)

Unnamed: 0,title,code,minpay,midpay,maxpay,avgpay,tokens,label
0,director programs,,,,,"$38,581.70","['director', 'programs']",Director
1,healthcare access navigator,,,,,"$13,053.50","['healthcare', 'access', 'navigator']",Administration
2,residential assistant,,,,,"$8,061.50","['residential', 'assistant']",Assistant
3,senior associate teacher,,,,,"$15,425.42","['senior', 'associate', 'teacher']",Education & Learning
4,hm general labor,,,,,"$10,640.70","['hm', 'general', 'labor']","Security, Maintenance, & Housekeeping"
5,ptlaborer,,,,,"$4,499.25",['ptlaborer'],Physical Therapy
6,sarah jane dietary,,,,,"$6,241.50","['sarah', 'jane', 'dietary']",Dietary
7,psychiatrist,1002.0,"$46,823.17","$58,258.14","$60,544.96","$80,932.20",['psychiatrist'],Psychiatrist
8,medical director,1003.0,"$52,848.32","$65,940.22","$68,558.56","$116,000.76","['medical', 'director']",Director
9,health care physician,1008.0,"$39,183.87","$48,518.62","$57,853.38","$13,580.04","['health', 'care', 'physician']",Physician


In [4]:
# let's clean the data, removing columns we don't need

In [5]:
def clean_data(dataframe):
    master_encoding_list = []
    master_vector_list = []
    dataframe = dataframe.drop(dataframe.columns[1:6], axis=1)
    
    # dataframe['token title'] = dataframe.apply(lambda row: word_tokenize(row['title']), axis=1)
    
    return dataframe
    
    
cleaned_data = clean_data(healthcare_titles)
print(cleaned_data.head(10))

                         title                                 tokens  \
0            director programs               ['director', 'programs']   
1  healthcare access navigator  ['healthcare', 'access', 'navigator']   
2        residential assistant           ['residential', 'assistant']   
3     senior associate teacher     ['senior', 'associate', 'teacher']   
4             hm general labor             ['hm', 'general', 'labor']   
5                    ptlaborer                          ['ptlaborer']   
6           sarah jane dietary           ['sarah', 'jane', 'dietary']   
7                 psychiatrist                       ['psychiatrist']   
8             medical director                ['medical', 'director']   
9        health care physician        ['health', 'care', 'physician']   

                                   label  
0                               Director  
1                         Administration  
2                              Assistant  
3                   Educ

In [6]:
# split the data for training and testing, 

In [7]:
cntvect =CountVectorizer()
count_vect = cntvect.fit_transform(cleaned_data['title'])
X = count_vect
y = cleaned_data['label']

print(X)
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 39)

  (0, 1650)	1
  (0, 2729)	1
  (1, 1972)	1
  (1, 12)	1
  (1, 2427)	1
  (2, 2859)	1
  (2, 397)	1
  (3, 2960)	1
  (3, 446)	1
  (3, 3171)	1
  (4, 2000)	1
  (4, 1917)	1
  (4, 2182)	1
  (5, 2753)	1
  (6, 2919)	1
  (6, 2156)	1
  (6, 1615)	1
  (7, 2748)	1
  (8, 1650)	1
  (8, 2348)	1
  (9, 1971)	1
  (9, 781)	1
  (9, 2635)	1
  (10, 2750)	1
  (11, 2750)	1
  :	:
  (9991, 595)	1
  (9992, 1650)	1
  (9992, 1971)	1
  (9992, 595)	1
  (9992, 2998)	1
  (9993, 1650)	1
  (9993, 595)	1
  (9993, 2093)	1
  (9994, 1650)	1
  (9994, 2968)	1
  (9994, 595)	1
  (9995, 1650)	1
  (9995, 2963)	1
  (9995, 607)	1
  (9996, 1650)	1
  (9996, 608)	1
  (9997, 1650)	1
  (9997, 901)	1
  (9997, 608)	1
  (9998, 1650)	1
  (9998, 1005)	1
  (9998, 1742)	1
  (9998, 608)	1
  (9999, 1650)	1
  (9999, 633)	1


In [8]:
# create models to test in grid search

In [9]:
def run_cv_search():
    models_list = []
    
    svm_clf = svm.SVC()
    svm_params = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    svm_list = [svm_clf, svm_params]
    models_list.append(svm_list)
    
    rf_clf = RandomForestClassifier()
    rf_params = {'n_estimators': [10, 25]}
    rf_list = [rf_clf, rf_params]
    models_list.append(rf_list)
    
    mnb_clf = MultinomialNB()
    mnb_params = {'alpha': [0.01, 0.03]}
    mnb_list = [mnb_clf, mnb_params]
    models_list.append(mnb_list)
    
    return models_list

test_models_list = run_cv_search()
    

In [10]:
print(test_models_list)

[[SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False), {'kernel': ('linear', 'rbf'), 'C': [1, 10]}], [RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), {'n_estimators': [10, 25]}], [MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), {'alpha': [0.01, 0.03]}]]


In [11]:
# try running a grid search CV
# logging to track progress

In [12]:
def get_models(models_list, Xtrain, ytrain):
    model_review = []
    for model in models_list:
        print('start')
        use_all_processors = -1
        
        the_clf = GridSearchCV(estimator=model[0], param_grid=model[1], n_jobs=use_all_processors, verbose=100,
                               cv=2, error_score='raise')
        print('processing')
        the_clf.fit(Xtrain, ytrain)
        
        model_review.append([the_clf, the_clf.best_params_, the_clf.best_score_])
        print('end')
        
    return model_review

trained_models = get_models(test_models_list, X_train, y_train)
print(trained_models)

start
processing
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Pickling array (shape=(20057,), dtype=int32).
Pickling array (shape=(7001,), dtype=int32).
Pickling array (shape=(20057,), dtype=int64).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(7000,), dtype=object).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(3482,), dtype=int32).
Pickling array (shape=(3518,), dtype=int32).




Pickling array (shape=(20057,), dtype=int32).
Pickling array (shape=(7001,), dtype=int32).
Pickling array (shape=(20057,), dtype=int64).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(7000,), dtype=object).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(3518,), dtype=int32).
Pickling array (shape=(3482,), dtype=int32).
Pickling array (shape=(20057,), dtype=int32).
Pickling array (shape=(7001,), dtype=int32).
Pickling array (shape=(20057,), dtype=int64).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(7000,), dtype=object).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(3482,), dtype=int32).
Pickling array (shape=(3518,), dtype=int32).
Pickling array (shape=(20057,), dtype=int32).
Pickling array (shape=(7001,), dtype=int32).
Pickling array (shape=(20057,), dtype=int64).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(7000,), dtype=object).
Pickling array (shape=(7000,), dtype=int64).
P



[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    0.8s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.4s finished
end
start
processing
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Pickling array (shape=(20057,), dtype=int32).
Pickling array (shape=(7001,), dtype=int32).
Pickling array (shape=(20057,), dtype=int64).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(7000,), dtype=object).
Pickling array (shape=(7000,), dtype=int64).
Pickling array (shape=(3482,), dtype=int32).
Pickling array (shape=(3518,), dtype=int32).
Pickling array (shape=(20057,), dtype=int32).
Pickling array (shape=(7001,), dtype=int32).
Pickling array (shape=(20057,), dtype=int64).
Pickling array (shape=(7000,), dtype=int64).
Pick



end
[[GridSearchCV(cv=2, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=100), {'C': 10, 'kernel': 'linear'}, 0.8484285714285714], [GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random

In [13]:
# after list is printed, select best model and run it against the test data

In [14]:
best_score = 0
ideal_model = ''

for model in trained_models:
    if model[2] > best_score:
        best_score = model[2]
        ideal_model = model
    else:
        continue
        
print(ideal_model)        
    

[GridSearchCV(cv=2, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=100), {'C': 10, 'kernel': 'linear'}, 0.8484285714285714]


In [15]:
# I am assuming the model is already fit to the training data so we don't need to do that again
# let's test the final model

In [16]:
y_pred = ideal_model[0].predict(x_test)
print(y_pred)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


['Development' 'Chief Officer' 'Assistant' ... 'Chief Officer' 'Assistant'
 'Supervisor']
[[  5   0   0 ...   0   0   0]
 [  0 310   2 ...   0   3   0]
 [  0   0   4 ...   0   0   0]
 ...
 [  0   0   0 ...   6   0   0]
 [  0   0   0 ...   0   6   0]
 [  0   0   0 ...   0   0  15]]
                                       precision    recall  f1-score   support

                           Activities       0.45      0.62      0.53         8
                       Administration       0.84      0.84      0.84       370
                              Advisor       0.67      1.00      0.80         4
                                 Aide       0.52      0.67      0.59        18
                     Anesthesiologist       1.00      1.00      1.00         1
                            Assistant       0.96      0.85      0.90       355
        Assistant Director of Nursing       0.00      0.00      0.00         2
                            Associate       0.18      0.80      0.30         5
      

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [17]:
# let's try testing this new vectorizer

In [19]:
test_data =  [
        'Senior Manager',
        'Executive Director', 
        'Senior Plan Administrator',
        'Chief Executive Officer',
        'Senior Data Architect',
        'Nurse'
    ]

test_titles = {"title": test_data}
test_titles_df = pd.DataFrame(test_data, columns=['title'])

test_vect = cntvect.transform(test_titles_df['title'])
x2 = test_vect

test_pred = ideal_model[0].predict(x2)
print(test_pred)


['Manager' 'Director' 'Administration' 'Chief Officer' 'Technology & IT'
 'Nursing']
