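# Check whether the XGBoost model is just an aggregate of per-feature averages
This notebook answers an inquiry from Dylan, raised as a requirement for passing the Capstone project: it compares the model's salary predictions with the plain and count-weighted averages of the mean salary for each feature value.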

### Load libraries

In [31]:
import pandas as pd
import numpy as np
import os

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from collections import defaultdict
import pickle

import gc
import warnings
warnings.simplefilter(action='ignore')

## Load model and data set

In [11]:
# Locate the dataset relative to the working directory: go two levels up,
# then into the web-app backup's models folder.
cwd = os.getcwd()
PARENTDIR = os.path.dirname(os.path.dirname(cwd))
DATADIR = os.path.join(PARENTDIR, 'Capstone-WebApp-backup/models')
CSV_file = "Dataset-for-ML-Model.csv"

In [9]:
# Load the already-trained model from the working directory.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load model files that come from a trusted source.
with open(os.path.join(cwd, "TDI-XGB_model.pkl"), "rb") as f:
    clf = pickle.load(f)

In [12]:
# Read the dataset, decoding it as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8
frame = pd.read_csv(os.path.join(DATADIR, CSV_file), encoding = "ISO-8859-1")

## Load label encoder classes

In [7]:
# Rebuild one LabelEncoder per categorical column from the class arrays stored
# in the working directory (spaces in the column name become underscores in the
# file name, e.g. 'job title' -> 'job_title.npy').
d = defaultdict(LabelEncoder)
cols_transf = ['employer', 'job title', 'state', 'city']

for col in cols_transf:
    classes_path = os.path.join(cwd, '{}.npy'.format(col.replace(' ', '_')))
    d[col] = LabelEncoder()
    # The label classes are presumably strings stored as an object array (TODO confirm);
    # np.load refuses object arrays unless allow_pickle=True (default False since
    # NumPy 1.16.3). Unpickling can run arbitrary code, so only load trusted files.
    d[col].classes_ = np.load(classes_path, allow_pickle=True)

### Process data

In [13]:
# Note: this function requires a cleaned up 'location' column with no NaN or empty strings
def get_city_state(df):
    """Derive 'state' and 'city' from 'location' and return the modelling columns.

    The state is the last whitespace-separated token of 'location'; the city is
    everything before the last single-space-separated token, re-joined with
    spaces (e.g. 'NEW YORK NY' -> state 'NY', city 'NEW YORK').

    The input frame is left untouched: the derived columns are added to a new
    frame, so calling the function does not alter the caller's 'location' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location' (strings only) plus the columns selected below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ten selected columns; 'location' is not included.
    """
    location = df["location"]
    state = location.str.split().str[-1]
    # Split on a single space (not arbitrary whitespace) to keep the original city parsing.
    city = location.apply(lambda loc: " ".join(loc.split(" ")[:-1]))

    with_city_state = df.assign(state=state, city=city)

    return with_city_state[['employer', 'job title', 'base salary', 'submit date',
       'start date', 'case status', 'submit year', 'submit month', 'state', 'city']]

In [14]:
# Replace `frame` with the version that has 'state'/'city' columns. The returned frame
# no longer has 'location', so this cell cannot be re-run without reloading the CSV.
frame = get_city_state(frame)
# Features: four categorical columns plus the submit year
X = frame[['employer', 'job title', 'submit year', 'state', 'city']]
# Target: log1p of the base salary, as a 2-D (n, 1) array because of the double-bracket selection
y = np.log1p(frame[['base salary']].values)

### Label Encoding

In [15]:
def fit_and_transform(df,cols_to_transf):
    """Fit the encoders in the global ``d`` on the given columns and encode them.

    Each column named in ``cols_to_transf`` is passed to ``fit_transform`` of
    ``d[<column name>]``, which (re)fits that encoder as a side effect.

    Returns a new frame with the encoded columns first, followed by the
    remaining columns of ``df`` unchanged.
    """
    to_encode = df[cols_to_transf]
    untouched = df.drop(cols_to_transf, axis = 1)

    encoded = to_encode.apply(lambda column: d[column.name].fit_transform(column))

    return pd.concat([encoded, untouched], axis=1, join='outer')


def inverse_encoding(df,encoded_cols):
    """Decode the columns in ``encoded_cols`` with the encoders in the global ``d``.

    Each listed column is passed to ``inverse_transform`` of ``d[<column name>]``.

    Returns a new frame with the decoded columns first, followed by the
    remaining columns of ``df`` unchanged.
    """
    to_decode = df[encoded_cols]
    untouched = df.drop(encoded_cols, axis = 1)

    decoded = to_decode.apply(lambda column: d[column.name].inverse_transform(column))

    return pd.concat([decoded, untouched], axis=1, join='outer')


def encode_future_data(df,cols_to_transf):
    """Encode new data with the encoders already held in the global ``d``.

    Unlike ``fit_and_transform`` this only calls ``transform`` on
    ``d[<column name>]``, so the encoders themselves are not refitted.

    Returns a new frame with the encoded columns first, followed by the
    remaining columns of ``df`` unchanged.
    """
    to_encode = df[cols_to_transf]
    untouched = df.drop(cols_to_transf, axis = 1)

    encoded = to_encode.apply(lambda column: d[column.name].transform(column))

    return pd.concat([encoded, untouched], axis=1, join='outer')

In [17]:
# NOTE(review): this rebinds `d` to an empty defaultdict, discarding the encoders whose
# classes were loaded from the .npy files earlier in the notebook. fit_and_transform then
# refits the encoders on this dataset; the codes match what the pickled model expects only
# if that refit yields the same classes as the saved ones — TODO confirm.
d = defaultdict(LabelEncoder)
# Categorical columns that are label-encoded (same list as `cols_transf` above)
cols_to_transf = ['employer','job title','state','city']

In [18]:
# Label-encode the categorical feature columns; this fits the encoders in `d` on X
X = fit_and_transform(X, cols_to_transf)

In [19]:
test_size = 0.25 # assign proportion of dataset to test set split
random_state = 42 # fixed seed so the split, and the metrics computed from it, are reproducible
# NOTE(review): `clf` is loaded from a pickle rather than trained on X_train here, so whether
# X_test is truly unseen by the model depends on how that model was trained — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state)

# Free the full feature/target arrays; only the split parts are used from here on
del X, y; gc.collect()

43

### Verify test set

In [20]:
# Wrap the encoded test features in the DMatrix container that is passed to clf.predict
d_test = xgb.DMatrix(X_test)

In [21]:
# Model output is treated as log1p-scale salary: it is compared with the log1p-scaled
# y_test and mapped back with expm1 when reported in dollars
predict = clf.predict(d_test)

In [28]:
# RMSE is reported in dollars (targets and predictions mapped back with expm1);
# R2 is computed directly on the log1p-scaled values.
rmse_dollars = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(predict)))
r2_log_scale = r2_score(y_test, predict)

print('Prediction RMSE = ${:.2f}'.format(rmse_dollars))
print('R2 = {:.3f}'.format(r2_log_scale))

Prediction RMSE = $20249.07
R2 = 0.752


In [29]:
# Three example postings to score with the model. Entry i of every list belongs to the
# same posting; the text values are upper-cased later by preprocess_new_data.
predict_dict = {
    'employer':     ['Google Inc', 'Georgia Institute of Technology','Ove Arup & Partners PC'],
    'job title':    ['Data Scientist', 'Assistant Professor', 'Mechanical Engineer'],
    'state':        ['NY', 'GA', 'NY'],
    'city':         ['New York', 'Atlanta', 'New York'],
    'submit year':  [2018, 2018, 2018]}

In [30]:
def preprocess_new_data(predict_dict, cols_to_transf, future_data_column_order):
    """Turn a dict of raw feature lists into an encoded frame ready for prediction.

    Parameters
    ----------
    predict_dict : dict
        Maps each feature name to a list of values, one entry per row.
    cols_to_transf : list of str
        Text columns to upper-case and then encode with ``encode_future_data``.
    future_data_column_order : list of str
        Column order of the returned frame.

    Returns
    -------
    pandas.DataFrame
        The encoded rows with columns in ``future_data_column_order``.
    """
    predict_df = pd.DataFrame.from_dict(predict_dict)

    # Build the upper-cased columns as a new frame instead of assigning into a slice of
    # predict_df, which relies on chained assignment (SettingWithCopyWarning).
    predict_df_transf = predict_df[cols_to_transf].apply(lambda column: column.str.upper())
    predict_df_non_transf = predict_df.drop(cols_to_transf, axis=1)

    predict_df = pd.concat([predict_df_transf, predict_df_non_transf], axis=1, join='outer')
    predict_df = encode_future_data(predict_df,cols_to_transf)

    return predict_df[future_data_column_order]

In [33]:
# Same column order as the encoded X built by fit_and_transform:
# the four encoded columns first, then 'submit year'
future_data_column_order = ['employer', 'job title', 'state', 'city', 'submit year']
predict_df = preprocess_new_data(predict_dict, cols_to_transf, future_data_column_order)

In [34]:
# Score the three example postings; the outputs are converted to dollars with expm1 when printed
d_new_data = xgb.DMatrix(predict_df)
new_predictions = clf.predict(d_new_data)

In [35]:
# Report the model's prediction for each example posting, converted back to dollars
for i, title in enumerate(predict_dict['job title']):
    employer = predict_dict['employer'][i]
    city = predict_dict['city'][i]
    salary = '${:,.0f}'.format(int(np.expm1(new_predictions[i])))
    print('Your predicted salary as ' + title.title() + ' at ' + employer +
          ' in ' + city + ' is ' + salary)

Your predicted salary as Data Scientist at Google Inc in New York is $150,212
Your predicted salary as Assistant Professor at Georgia Institute of Technology in Atlanta is $107,958
Your predicted salary as Mechanical Engineer at Ove Arup & Partners PC in New York is $81,461


### Check averages of entries above

In [55]:
# For each feature, keep only the rows whose value matches one of the three example
# postings (upper-case, as the values are compared against the dataset columns).
emp_conditions = ['GOOGLE INC', 'GEORGIA INSTITUTE OF TECHNOLOGY','OVE ARUP & PARTNERS PC']
df_emp = frame.loc[frame['employer'].isin(emp_conditions)]

city_conditions = ['NEW YORK', 'ATLANTA', 'NEW YORK']
df_city = frame.loc[frame['city'].isin(city_conditions)]

state_conditions = ['NY', 'GA', 'NY']
df_state = frame.loc[frame['state'].isin(state_conditions)]

title_conditions = ['DATA SCIENTIST', 'ASSISTANT PROFESSOR', 'MECHANICAL ENGINEER']
df_title = frame.loc[frame['job title'].isin(title_conditions)]

In [98]:
def print_averages_and_counts(df, groupby_col):
    """Print and return per-group row counts and mean 'base salary'.

    Groups ``df`` by ``groupby_col`` and, for the 'base salary' column, prints
    the count of each group and its mean formatted as whole dollars.

    Returns
    -------
    tuple of (list, list)
        ``[(group, count), ...]`` and ``[(group, mean), ...]`` with the
        unformatted numeric values.
    """
    salaries = df.groupby(groupby_col)['base salary']
    group_sizes = salaries.count()
    group_means = salaries.mean()

    count_pairs = list(zip(group_sizes.index, group_sizes.values))
    mean_pairs = list(zip(group_means.index, group_means.values))
    formatted_means = [(group, '${:,.0f}'.format(mean)) for group, mean in mean_pairs]

    print('Item Counts: \n', count_pairs)
    print('Mean Salaries: \n', formatted_means)

    return count_pairs, mean_pairs

In [99]:
# Row counts and mean base salary for each of the example employers
employer_counts, employer_mean_sal = print_averages_and_counts(df_emp, 'employer')

Item Counts: 
 [('GEORGIA INSTITUTE OF TECHNOLOGY', 227), ('GOOGLE INC', 3675), ('OVE ARUP & PARTNERS PC', 9)]
Mean Salaries: 
 [('GEORGIA INSTITUTE OF TECHNOLOGY', '$74,135'), ('GOOGLE INC', '$133,491'), ('OVE ARUP & PARTNERS PC', '$106,252')]


In [100]:
# Row counts and mean base salary for each of the example cities
city_counts, city_mean_sal = print_averages_and_counts(df_city, 'city')

Item Counts: 
 [('ATLANTA', 29855), ('NEW YORK', 73511)]
Mean Salaries: 
 [('ATLANTA', '$77,214'), ('NEW YORK', '$94,352')]


In [102]:
# Row counts and mean base salary for each of the example states
state_counts, state_mean_sal = print_averages_and_counts(df_state, 'state')

Item Counts: 
 [('GA', 57292), ('NY', 111185)]
Mean Salaries: 
 [('GA', '$74,776'), ('NY', '$87,626')]


In [103]:
# Row counts and mean base salary for each of the example job titles
job_counts, job_mean_sal = print_averages_and_counts(df_title, 'job title')

Item Counts: 
 [('ASSISTANT PROFESSOR', 20118), ('DATA SCIENTIST', 4947), ('MECHANICAL ENGINEER', 7010)]
Mean Salaries: 
 [('ASSISTANT PROFESSOR', '$105,398'), ('DATA SCIENTIST', '$106,439'), ('MECHANICAL ENGINEER', '$72,699')]


#### Take averages for the 3 job positions shown above

In [116]:
# Maps a feature value (employer, state, city or job title) to the list of numbers
# collected for it by grouped_averages_into_dict.
mean_sal_dict = defaultdict(list)

def grouped_averages_into_dict(sal_dict, count_list, mean_sal_list):
    """Append per-key counts, then per-key means, to the lists in ``sal_dict``.

    ``count_list`` and ``mean_sal_list`` are iterables of ``(key, value)``
    pairs. All counts are appended before any mean, so a key that appears once
    in each list ends up with ``[count, mean]``.

    ``sal_dict`` is modified in place and also returned.
    """
    for pairs in (count_list, mean_sal_list):
        for key, val in pairs:
            sal_dict[key].append(val)

    return sal_dict

In [117]:
# Collect [count, mean salary] per feature value, in the order employer, state, city, job title.
# Running this more than once appends further entries to the same lists.
for group_counts, group_means in (
        (employer_counts, employer_mean_sal),
        (state_counts, state_mean_sal),
        (city_counts, city_mean_sal),
        (job_counts, job_mean_sal)):
    mean_sal_dict = grouped_averages_into_dict(mean_sal_dict, group_counts, group_means)

In [118]:
# Display the collected [count, mean salary] entry for every employer, state, city and job title
mean_sal_dict

defaultdict(list,
            {'GEORGIA INSTITUTE OF TECHNOLOGY': [227, 74135.26431718061],
             'GOOGLE INC': [3675, 133491.231292517],
             'OVE ARUP & PARTNERS PC': [9, 106252.22222222222],
             'GA': [57292, 74776.30662221601],
             'NY': [111185, 87626.47567567567],
             'ATLANTA': [29855, 77213.94757996986],
             'NEW YORK': [73511, 94351.66624042661],
             'ASSISTANT PROFESSOR': [20118, 105398.12342181131],
             'DATA SCIENTIST': [4947, 106439.0382049727],
             'MECHANICAL ENGINEER': [7010, 72699.29486447932]})

#### Get weighted averages

In [None]:
# Same report of the model's predictions as printed earlier in the notebook,
# repeated here for comparison with the averages.
for i, title in enumerate(predict_dict['job title']):
    employer = predict_dict['employer'][i]
    city = predict_dict['city'][i]
    salary = '${:,.0f}'.format(int(np.expm1(new_predictions[i])))
    print('Your predicted salary as ' + title.title() + ' at ' + employer +
          ' in ' + city + ' is ' + salary)

In [120]:
# Print each feature value with its [count, mean salary] list, one per line
for k, v in mean_sal_dict.items():
    print(k, v)

GEORGIA INSTITUTE OF TECHNOLOGY [227, 74135.26431718061]
GOOGLE INC [3675, 133491.231292517]
OVE ARUP & PARTNERS PC [9, 106252.22222222222]
GA [57292, 74776.30662221601]
NY [111185, 87626.47567567567]
ATLANTA [29855, 77213.94757996986]
NEW YORK [73511, 94351.66624042661]
ASSISTANT PROFESSOR [20118, 105398.12342181131]
DATA SCIENTIST [4947, 106439.0382049727]
MECHANICAL ENGINEER [7010, 72699.29486447932]


In [150]:
# Features that have a [count, mean salary] entry in mean_sal_dict. The submit year is
# skipped because no per-year average was collected.
feature_keys = [key for key in predict_dict.keys() if key != 'submit year']

# Baseline 1: mean salary of each feature value, weighted by how many rows have that value
print('"Predicted" salaries using weighted means:')
for i, title in enumerate(predict_dict['job title']):
    numerator = 0
    denominator = 0
    for k in feature_keys:
        stats = mean_sal_dict[predict_dict[k][i].upper()]  # [count, mean salary]
        numerator += stats[0]*stats[1]
        denominator += stats[0]
    weighted_mean_sal = numerator/denominator

    print('Predicted salary as ' + title.title() + ' at ' + predict_dict['employer'][i] +
          ' in ' + predict_dict['city'][i] + ' is ' + '${:,.0f}'.format(int(weighted_mean_sal)))

print('-'*100)
# Reference: the XGBoost model's predictions, converted back to dollars with expm1
print('Predicted salaries from machine learning model:')
for i, title in enumerate(predict_dict['job title']):
    print('Your predicted salary as ' + title.title() + ' at ' + predict_dict['employer'][i] +
          ' in ' + predict_dict['city'][i] + ' is ' + '${:,.0f}'.format(int(np.expm1(new_predictions[i]))))

print('-'*100)

# Baseline 2: unweighted mean of the per-feature mean salaries
print('"Predicted" salaries using ordinary arithmetic mean:')
for i, title in enumerate(predict_dict['job title']):
    numerator = 0
    for k in feature_keys:
        numerator += mean_sal_dict[predict_dict[k][i].upper()][1]

    # Divide by the number of averaged features instead of a hard-coded 4
    arithmetic_mean_sal = numerator/len(feature_keys)

    print('Predicted salary as ' + title.title() + ' at ' + predict_dict['employer'][i] +
          ' in ' + predict_dict['city'][i] + ' is ' + '${:,.0f}'.format(int(arithmetic_mean_sal)))

"Predicted" salaries using weighted means:
Predicted salary as Data Scientist at Google Inc in New York is $91,537
Predicted salary as Assistant Professor at Georgia Institute of Technology in Atlanta is $81,183
Predicted salary as Mechanical Engineer at Ove Arup & Partners PC in New York is $89,660
----------------------------------------------------------------------------------------------------
Predicted salaries from machine learning model:
Your predicted salary as Data Scientist at Google Inc in New York is $150,212
Your predicted salary as Assistant Professor at Georgia Institute of Technology in Atlanta is $107,958
Your predicted salary as Mechanical Engineer at Ove Arup & Partners PC in New York is $81,461
----------------------------------------------------------------------------------------------------
"Predicted" salaries using ordinary arithmetic mean:
Predicted salary as Data Scientist at Google Inc in New York is $105,477
Predicted salary as Assistant Professor at Georg