In [None]:
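# StackingRegressor (used below) was added in scikit-learn 0.22.
# Uncomment the next line to install that version into the kernel's environment.
#!conda install scikit-learn=0.22 -y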

In [65]:
import pickle

import numpy as np
import pandas as pd
from matplotlib import pyplot
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [21]:
#import data
def import_data(file):
    return pd.read_csv(file)

In [22]:
#join feature and target data
def join_data(df1, df2, key=None, left_index=False, right_index=False):
    return pd.merge(df1, df2, how='inner', on=key, left_index=left_index, right_index=right_index)

In [23]:
#remove duplicates and any rows with 0 salary
def clean(data):
    clean_data = data.drop_duplicates(subset='jobId')
    clean_data = clean_data[clean_data.salary>0]
    return clean_data

In [24]:
def ohe(df, cat_vars=None, num_vars=None):
    cat_df = pd.get_dummies(df, prefix_sep="__", columns=cat_vars)
    #num_df = df[num_vars].apply(pd.to_numeric) 
    return cat_df

In [25]:
# return a dataframe with only target feature
def get_target_df(df, target):
    return df[target]

In [92]:
#pass the model and feature and target dataframes, calculate the mse,std and return a list of dictionary
def train_model(models, feature_df,target_df, num_procs):
    results = []
    for model in models:
        neg_mse = cross_val_score(model, feature_df,target_df, cv=5, n_jobs=num_procs, scoring='neg_mean_squared_error')
        mean_mse = -1.0*np.mean(neg_mse)
        cv_std = np.std(neg_mse)  
        results.append({'model':model,'MSE':mean_mse ,'CV':cv_std})
       
    return results

In [27]:
#display model information
def print_summary(model):
    print('\nModel:\n', model['model'])
    print('Average MSE:\n', model['MSE'])
    print('Standard deviation during CV:\n', model['CV'])

In [28]:
# save predictions, feature importances to flat files
def save_results(model, mean_mse, predictions, feature_importance):
    with open('model.txt', 'w') as file:
        file.write(str(model))
    feature_importance.to_csv('feature_importance.csv')
    np.savetxt('predictions.csv', predictions, delimiter=',')

In [29]:
def save_model(model):
     #save model to disk
    pickle.dump(model, open('model.pkl','wb'))
    # load the model to compare the results
    model = pickle.load(open('model.pkl','rb'))

In [60]:
# get a list of models to evaluate
def get_models():
    models = [] 

    knn = KNeighborsRegressor()
    cart = DecisionTreeRegressor()
    svm = SVR()
    gbr = GradientBoostingRegressor(max_depth=5, loss='ls', verbose=verbose_lvl)
    stacking = get_stacking()
    models.extend([knn,cart,svm,gbr,stacking])
    
    return models

In [31]:
def col_dummies(df_processed):
    cat_dummies = [col for col in df_processed if "__" in col and col.split("__")[0] in cat_vars]
    return cat_dummies

In [32]:
def rem_addition_cols(df_test_processed,cat_vars,cat_dummies,df_processed_columns):
    for col in df_test_processed.columns:
        if("__" in col) and (col.split("__"[0]) in cat_vars) and col not in cat_dummies:
            print("Removing additional feature {}".format(col))
            df_test_processed.drop(col, axis=1, inplace = True)
        else:
            print("Nothing to remove")

    for col in cat_dummies:
        if col not in df_test_processed.columns and col != ['salary']:
            print("Adding missing feature {}".format(col))
            df_test_processed[col] = 0
    df = df_test_processed[df_processed_columns]
    return df

In [55]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('svm', SVR()))
    level0.append(('gbr', GradientBoostingRegressor(max_depth=2, loss='ls', verbose=6)))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=6)
    return model

In [107]:
#training data
train_features_file = 'train_features.csv'
test_features_file = 'test_features.csv'
train_salaries_file = 'train_salaries.csv'

#seperate categorical and numeric features 
cat_vars = ['jobType','degree','major','industry']
num_vars = ['yearsExperience','milesFromMetropolis']
target_var = 'salary'

all_col = ['jobType','degree','major','industry','yearsExperience','milesFromMetropolis']
#use helper functions to load the data
print('Loading data...')
features_df = import_data(train_features_file)
test_df = import_data(test_features_file)
target_df = import_data(train_salaries_file)
X_train = join_data(features_df,target_df)
X_train = clean(X_train)
X_val = clean(target_df)

x_val = clean(target_df)
X_test = test_df[all_col]
print('Loading data complete...')

Loading data...
Loading data complete...


In [108]:
X_train_ohe = ohe(X_train,cat_vars=cat_vars)
#X_test_ohe = ohe(X_test,cat_vars=cat_vars)
X_train_ohe

Unnamed: 0,jobId,companyId,yearsExperience,milesFromMetropolis,salary,jobType__CEO,jobType__CFO,jobType__CTO,jobType__JANITOR,jobType__JUNIOR,...,major__MATH,major__NONE,major__PHYSICS,industry__AUTO,industry__EDUCATION,industry__FINANCE,industry__HEALTH,industry__OIL,industry__SERVICE,industry__WEB
0,JOB1362684407687,COMP37,10,83,130,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,JOB1362684407688,COMP19,3,73,101,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,JOB1362684407689,COMP52,10,38,137,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,JOB1362684407690,COMP38,8,17,142,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,JOB1362684407691,COMP7,8,16,163,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,JOB1362685407682,COMP56,19,94,88,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
999996,JOB1362685407683,COMP24,12,35,160,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
999997,JOB1362685407684,COMP23,16,81,64,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
999998,JOB1362685407685,COMP3,6,5,149,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [110]:
y = X_train_ohe.salary
X_train, X_test, y_train, y_test = train_test_split(X_train_ohe, y, test_size=0.2)

num_procs=5
verbose_lvl = 0

models = get_models()

print('Begin cross validation...')
# results will hold list of dictionary with models validations data
results = train_model(models, X_train, y_train, num_procs)
print(results)
print('Cross validation complete.')

Begin cross validation...
[{'model': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'), 'MSE': nan, 'CV': nan}, {'model': DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best'), 'MSE': nan, 'CV': nan}, {'model': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False), 'MSE': nan, 'CV': nan}, {'model': GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_

In [111]:
min_ = min(results, key=lambda x:x['MSE'])

In [112]:
model = min_['model']

model

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')