In [None]:
#!conda install scikit-learn=0.22 -y

In [134]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from matplotlib import pyplot
import pickle

In [5]:
#import data
def import_data(file):
    return pd.read_csv(file)

In [6]:
#join feature and target data
def join_data(df1, df2, key=None, left_index=False, right_index=False):
    return pd.merge(df1, df2, how='inner', on=key, left_index=left_index, right_index=right_index)

In [7]:
#remove duplicates and any rows with 0 salary
def clean(data):
    clean_data = data.drop_duplicates(subset='jobId')
    clean_data = clean_data[clean_data.salary>0]
    return clean_data

In [67]:
def ohe(df, cat_vars=None, num_vars=None):
    cat_df = pd.get_dummies(df, prefix_sep="__", columns=cat_vars)
    #num_df = df[num_vars].apply(pd.to_numeric) 
    return cat_df

In [9]:
# return a dataframe with only target feature
def get_target_df(df, target):
    return df[target]

In [119]:
#pass the model and feature and target dataframes, calculate the mse,std and return a list of dictionary
def train_model(models, feature_df,target_df, num_procs):
    results = []
    for model in models:
        print("training model {} started ".format(model))
        neg_mse = cross_val_score(model, feature_df,target_df, cv=5, n_jobs=num_procs, scoring='neg_mean_squared_error')
        mean_mse = -1.0*np.mean(neg_mse)
        cv_std = np.std(neg_mse)  
        results.append({'model':model,'MSE':mean_mse ,'CV':cv_std})
        print("training model {} complete ".format(model))
    return results

In [109]:
def evaluate_model(models, X,y):
    results = []
    for model in models:
        
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=4, error_score='raise')
        results.append({'model':model,'Score':scores ,'CV':cv})
    
    return results

In [11]:
#display model information
def print_summary(model):
    print('\nModel:\n', model['model'])
    print('Average MSE:\n', model['MSE'])
    print('Standard deviation during CV:\n', model['CV'])

In [12]:
# save predictions, feature importances to flat files
def save_results(model, mean_mse, predictions, feature_importance):
    with open('model.txt', 'w') as file:
        file.write(str(model))
    feature_importance.to_csv('feature_importance.csv')
    np.savetxt('predictions.csv', predictions, delimiter=',')

In [13]:
def save_model(model):
     #save model to disk
    pickle.dump(model, open('model.pkl','wb'))
    # load the model to compare the results
    model = pickle.load(open('model.pkl','rb'))

In [123]:
# get a list of models to evaluate
def get_models():
    models = [] 

    knn = KNeighborsRegressor()
    cart = DecisionTreeRegressor()
    #svm = SVR()
    gbr = GradientBoostingRegressor(max_depth=1, loss='ls', verbose=verbose_lvl)
    stacking = get_stacking()
    models.extend([knn,cart,gbr,stacking])
    
    return models

In [15]:
def col_dummies(df_processed):
    cat_dummies = [col for col in df_processed if "__" in col and col.split("__")[0] in cat_vars]
    return cat_dummies

In [16]:
def rem_addition_cols(df_test_processed,cat_vars,cat_dummies,df_processed_columns):
    for col in df_test_processed.columns:
        if("__" in col) and (col.split("__"[0]) in cat_vars) and col not in cat_dummies:
            print("Removing additional feature {}".format(col))
            df_test_processed.drop(col, axis=1, inplace = True)
        else:
            print("Nothing to remove")

    for col in cat_dummies:
        if col not in df_test_processed.columns and col != ['salary']:
            print("Adding missing feature {}".format(col))
            df_test_processed[col] = 0
    df = df_test_processed[df_processed_columns]
    return df

In [122]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = []
    knn = KNeighborsRegressor()
    cart = DecisionTreeRegressor()
    #svm = SVR()
    gbr = GradientBoostingRegressor(max_depth=1, loss='ls', verbose=verbose_lvl)
    level0.extend([knn,cart,gbr])
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=2)
    return model

In [63]:
#training data
train_features_file = 'train_features.csv'
test_features_file = 'test_features.csv'
train_salaries_file = 'train_salaries.csv'

#seperate categorical and numeric features 
cat_vars = ['jobType','degree','major','industry']
num_vars = ['yearsExperience','milesFromMetropolis']
target_var = 'salary'

all_col = ['jobType','degree','major','industry','yearsExperience','milesFromMetropolis']
#use helper functions to load the data
print('Loading data...')
features_df = import_data(train_features_file)
test_df = import_data(test_features_file)
target_df = import_data(train_salaries_file)
X_train = join_data(features_df,target_df)
X_train = clean(X_train)
X_train = X_train[all_col]
X_val = clean(target_df)

x_val = clean(target_df)
X_test = test_df[all_col]
print('Loading data complete...')

Loading data...
Loading data complete...


In [68]:
X_train_ohe = ohe(X_train,cat_vars=cat_vars,num_vars=num_vars)
#X_test_ohe = ohe(X_test,cat_vars=cat_vars)
#X_train_ohe.drop(X_train_ohe.columns[[0,1]], axis=1, inplace=True)
X_train_ohe

Unnamed: 0,yearsExperience,milesFromMetropolis,jobType__CEO,jobType__CFO,jobType__CTO,jobType__JANITOR,jobType__JUNIOR,jobType__MANAGER,jobType__SENIOR,jobType__VICE_PRESIDENT,...,major__MATH,major__NONE,major__PHYSICS,industry__AUTO,industry__EDUCATION,industry__FINANCE,industry__HEALTH,industry__OIL,industry__SERVICE,industry__WEB
0,10,83,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,3,73,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,10,38,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
3,8,17,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,8,16,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,19,94,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
999996,12,35,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
999997,16,81,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
999998,6,5,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [70]:
y = X_val.salary
X_train, X_test, y_train, y_test = train_test_split(X_train_ohe, y, test_size=0.2)


In [None]:
models = []

num_procs=5

verbose_lvl = 0
print('Begin cross validation...')
lr= LinearRegression()
#lr_std_pca= make_pipeline(StandardScaler, PCA(), LinearRegression())
rf = RandomForestRegressor(n_jobs=num_procs, max_depth=25, min_samples_split=60, \
                           max_features=20, verbose=verbose_lvl)
gbn = GradientBoostingRegressor(max_depth=5, loss='ls', verbose=verbose_lvl)
dct = DecisionTreeRegressor(random_state=1)
knn = KNeighborsRegressor()
# add the above models into a list to be iterated through
models.extend([lr,rf,gbn,knn])
# results will hold list of dictionary with models validations data
results = train_model(models, X_train, y_train,num_procs)
print('Cross validation complete.')
smodel = StackingRegressor(estimators=[models], final_estimator=LinearRegression(), cv=2)


Begin cross validation...
training model LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) started 
training model LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) complete 
training model RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=25, max_features=20, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=60, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=5, oob_score=False,
                      random_state=None, verbose=0, warm_start=False) started 
training model RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=25, max_features=20, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_

In [138]:
smodel

StackingRegressor(cv=2,
                  estimators=[LinearRegression(copy_X=True, fit_intercept=True,
                                               n_jobs=None, normalize=False),
                              RandomForestRegressor(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    criterion='mse',
                                                    max_depth=25,
                                                    max_features=20,
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=60,
                                  

In [126]:
min_ = min(results, key=lambda x:x['MSE'])

In [127]:
model = min_['model']

model

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [139]:
smodel.fit(X_train, y_train)

TypeError: zip argument #1 must support iteration

In [129]:
result = model.predict(X_test)

In [130]:
result

array([109.8, 105.2,  92.8, ...,  45. ,  43.2,  64.8])

In [140]:
smodel.score(X_test,y_test)

AttributeError: 'StackingRegressor' object has no attribute 'final_estimator_'

In [52]:
save_model(model)

In [53]:
jobId = "JOB1362685407680"
companyId = "COMP21"
jobType = "CFO"
degree = "MASTERS"
major = "CHEMISTRY"
industry = "SERVICE"
yearsExperience = 4
milesFromMetropolis = 6


df1 = pd.DataFrame(data=[[jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis]],columns=['jobId', 'companyId','jobType','degree','major','industry','yearsExperience','milesFromMetropolis'])



In [55]:
df_new = ohe(df1,cat_vars)

In [60]:
cat_dummies = col_dummies(X_train_ohe)
df_processed_columns = list(X_train_ohe.columns[:])

In [61]:
df_ = rem_addition_cols(df_new,cat_vars,cat_dummies,df_processed_columns)

Nothing to remove
Nothing to remove
Nothing to remove
Nothing to remove
Nothing to remove
Nothing to remove
Nothing to remove
Nothing to remove
Adding missing feature jobType__CEO
Adding missing feature jobType__CTO
Adding missing feature jobType__JANITOR
Adding missing feature jobType__JUNIOR
Adding missing feature jobType__MANAGER
Adding missing feature jobType__SENIOR
Adding missing feature jobType__VICE_PRESIDENT
Adding missing feature degree__BACHELORS
Adding missing feature degree__DOCTORAL
Adding missing feature degree__HIGH_SCHOOL
Adding missing feature degree__NONE
Adding missing feature major__BIOLOGY
Adding missing feature major__BUSINESS
Adding missing feature major__COMPSCI
Adding missing feature major__ENGINEERING
Adding missing feature major__LITERATURE
Adding missing feature major__MATH
Adding missing feature major__NONE
Adding missing feature major__PHYSICS
Adding missing feature industry__AUTO
Adding missing feature industry__EDUCATION
Adding missing feature industry_

KeyError: "['salary'] not in index"

In [62]:
model.predict(df_)

NameError: name 'df_' is not defined