In [1]:
import pandas as pd
import wrangle
import model
import model_MAE

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt
from scipy import stats

In [2]:
# Unpack the six objects returned by wrangle.wrangle_data()
# (the function prints its own Acquire/Prepare progress messages).
(
    df,
    train_exp,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
) = wrangle.wrangle_data()

Acquire: compiling raw data files...
Acquire: Completed!
Prepare: preparing data files...
Prepare: Completed!
(289, 25) (73, 25)


In [3]:
# Inspect which columns are available in the scaled training features
X_train_scaled.columns

Index(['raw_svi', 'f_pov_soci', 'f_unemp_soci', 'f_pci_soci', 'f_nohsdp_soci',
       'f_age65_comp', 'f_age17_comp', 'f_disabl_comp', 'f_sngpnt_comp',
       'f_minrty_status', 'f_limeng_status', 'f_munit_trans', 'f_mobile_trans',
       'f_crowd_trans', 'f_noveh_trans', 'f_groupq_trans',
       'f_soci_total_scaled', 'f_comp_total_scaled', 'f_status_total_scaled',
       'f_trans_total_scaled', 'all_flags_total_scaled', 'rank_svi_scaled'],
      dtype='object')

# Create Baseline

In [4]:
# Mean and median of the training target, displayed as a (mean, median) tuple
(
    y_train['tract_cases_per_100k'].mean(),
    y_train['tract_cases_per_100k'].median(),
)

(3201.968351790576, 2903.7827641596423)

In [5]:
# MAE baseline from model_MAE.get_baseline_mean
# (presumably predicts the training-target mean for every row -- confirm in model_MAE)
mean_baseMAE, basepred1 = model_MAE.get_baseline_mean(y_train)

Baseline MAE: 973.0351509141836


In [6]:
# using mean
# RMSE counterpart of the baseline, computed by the `model` module instead of `model_MAE`
mean_baseRMSE, basepred = model.get_baseline_mean(y_train)

Baseline RMSE: 1257.2466801311145


In [7]:
# using median
# MAE baseline from model_MAE.get_baseline_median
median_baseMAE, basepred2 = model_MAE.get_baseline_median(y_train)

Baseline MAE: 950.9798103241462


In [8]:
# using median
# RMSE counterpart of the median baseline.
# NOTE(review): `basepred` was already assigned by the mean-RMSE baseline cell and is overwritten here.
median_baseRMSE, basepred = model.get_baseline_median(y_train)

Baseline RMSE: 1292.1237786572597


# Feature Ranking

In [9]:
# possible feature ranking?
# model.feature_ranking is defined outside this notebook; its result is displayed below
rankdf = model.feature_ranking(X_train_scaled, y_train)
rankdf

Unnamed: 0_level_0,features
rank,Unnamed: 1_level_1
1,rank_svi_scaled
2,f_soci_total_scaled
3,f_pov_soci
4,f_minrty_status
5,all_flags_total_scaled
6,f_groupq_trans
7,f_age17_comp
8,f_comp_total_scaled
9,f_munit_trans
10,f_age65_comp


# Create X_train_scaled df with selected features to test

## Feature groups to use for OLS, PF, and Tweedie models

In [10]:
# raw SVI score on its own
X_raw_svi = X_train_scaled[['raw_svi']]

# SVI score binned by CDC range category (ranked 1st in the feature-ranking output)
X_rank_svi_only = X_train_scaled[['rank_svi_scaled']]

# the four highest-ranked features from the feature-ranking output
X_top4 = X_train_scaled[[
    'rank_svi_scaled',
    'f_soci_total_scaled',
    'f_pov_soci',
    'f_minrty_status',
]]

# grand total of all flags only (ranked 5th in the feature-ranking output)
X_all_flags_only = X_train_scaled[['all_flags_total_scaled']]

# the four per-theme flag totals -- should carry the same information as the grand total?
# (soci is ranked 2nd and comp 8th; status and trans are not in the displayed top 10)
X_summary_flags = X_train_scaled[[
    'f_comp_total_scaled',
    'f_soci_total_scaled',
    'f_status_total_scaled',
    'f_trans_total_scaled',
]]

# every individual flag, without any of the totals
X_not_summary_flags = X_train_scaled[[
    'f_nohsdp_soci',
    'f_minrty_status',
    'f_groupq_trans',
    'f_unemp_soci',
    'f_disabl_comp',
    'f_noveh_trans',
    'f_mobile_trans',
    'f_age65_comp',
    'f_age17_comp',
    'f_pov_soci',
    'f_limeng_status',
    'f_crowd_trans',
    'f_pci_soci',
    'f_sngpnt_comp',
    'f_munit_trans',
]]

## LassoLars
- includes feature selection as part of model so will use all features on this model

## Drilling in features to test -- RETURN HERE

- what is the score using only 1 summary flag at a time?
    - if one summary group is better might be worth investigating for which features within group have greatest impact?

# Run df through regression algorithms

In [11]:
# Feature subsets to compare; the order must match the labels used in the results table
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# The iteration variable is deliberately NOT named `df`: doing so would overwrite
# the DataFrame returned by wrangle.wrangle_data() with the last feature subset.

# Linear Regression Models: one train MAE per feature subset
lm_MAE_list = [model_MAE.linear_reg_train(X_subset, target) for X_subset in df2test]

# Polynomial Features 2 degree: one train MAE per feature subset
pf2_MAE_list = [model_MAE.polynomial2(X_subset, target) for X_subset in df2test]

# Tweedie power=0, alpha=.5: one train MAE per feature subset
# (the list keeps the name `tweedie05` because the results cell reads it)
tweedie05 = [model_MAE.tweedie05(X_subset, target) for X_subset in df2test]


In [12]:
# # create variables for loop
# df2test = [X_rank_svi_only, X_top5, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled]
# target = y_train

# # Linear Regression Models
# lm_RMSE_list = []
# for df in df2test:
#     lm_RMSE = model.linear_reg_train(df, target)
#     lm_RMSE_list.append(lm_RMSE)

# # Polynomial Features 2 degree
# pf2_RMSE_list = []
# for df in df2test:
#     pf2_RMSE = model.polynomial2(df, target)
#     pf2_RMSE_list.append(pf2_RMSE)

# # Tweedie power=0, alpha=.5
# tweedie05 = []
# for df in df2test:
#     tw_RMSE = model.tweedie05(df, target)
#     tweedie05.append(tw_RMSE)


In [13]:
# LassoLars Models
# Fitted once on the full scaled feature set (not per feature subset); scored with MAE
lars_MAE = model_MAE.lasso_lars(X_train_scaled, target)

In [14]:
# LassoLars Models
# Same call as the previous cell but through the `model` module, which reports RMSE elsewhere in this notebook
lars_RMSE = model.lasso_lars(X_train_scaled, target)

# Summarize in results df

In [15]:
# Labels for the feature subsets, in the same order as df2test
df_list = [
    'rank_svi_only',
    'top4',
    'total_all_flags_only',
    'summary_flags',
    'not_summary_flags',
    'all_features',
    'raw_svi_only',
]

# One row per feature subset, one column per train score.
# lars_MAE and the two baselines are single values assigned to the whole column,
# so they repeat on every row (LassoLars was only run on the full feature set).
results = pd.DataFrame({'Features': df_list}).assign(
    LinearRegression_MAE=lm_MAE_list,
    Tweedie05_MAE=tweedie05,
    PolynomialFeatures2_MAE=pf2_MAE_list,
    LassoLars_MAE=lars_MAE,
    Base_mean_MAE=mean_baseMAE,
    Base_median_MAE=median_baseMAE,
)

# display, best PolynomialFeatures score first
results.sort_values('PolynomialFeatures2_MAE')

Unnamed: 0,Features,LinearRegression_MAE,Tweedie05_MAE,PolynomialFeatures2_MAE,LassoLars_MAE,Base_mean_MAE,Base_median_MAE
5,all_features,694.319979,821.826107,427.19019,697.418073,973.035151,950.97981
4,not_summary_flags,768.309031,890.061139,519.609825,697.418073,973.035151,950.97981
1,top4,720.911626,874.63259,708.392594,697.418073,973.035151,950.97981
6,raw_svi_only,776.969617,924.491852,767.016031,697.418073,973.035151,950.97981
0,rank_svi_only,775.678114,900.500732,772.144027,697.418073,973.035151,950.97981
3,summary_flags,830.585295,944.466544,780.583376,697.418073,973.035151,950.97981
2,total_all_flags_only,865.391283,958.467458,850.094118,697.418073,973.035151,950.97981


In [16]:
# # create dataframe for results of all train models
# df_list = ['rank_svi_only', 'top5', 'total_all_flags_only', 'summary_flags', 'not_summary_flags', 'all_features', 'raw_svi_only']

# results = pd.DataFrame(df_list, columns=['Features'])
# results['LinearRegression_RMSE'] = lm_RMSE_list
# results['Tweedie05_RMSE'] = tweedie05
# results['PolynomialFeatures2_RMSE'] = pf2_RMSE_list
# results['LassoLars_RMSE'] = lars_RMSE
# results['Base_mean_RMSE'] = mean_baseRMSE
# results['Base_median_RMSE'] = median_baseRMSE
# results.sort_values('PolynomialFeatures2_RMSE')

# Validate Stage

## pick models to validate
- LassoLars
- LR all features
- PF2 all features
- PF2 top4


In [17]:
from numpy import mean
from numpy import std, absolute
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_error

In [18]:
# LassoLars on the full scaled feature set, scored with leave-one-out cross-validation
cvLL = LeaveOneOut()
modelLL = LassoLars(alpha=1)

# The scorer returns negated absolute errors (one per held-out row); take their magnitude
scoresLL = absolute(
    cross_val_score(
        modelLL,
        X_train_scaled,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cvLL,
        n_jobs=-1,
    )
)

# mean and standard deviation of the per-fold absolute errors
print('MAE: %.3f (%.3f)' % (mean(scoresLL), std(scoresLL)))

MAE: 745.523 (707.596)


In [19]:
# Ordinary least squares on the full scaled feature set, scored with leave-one-out cross-validation
cvLR = LeaveOneOut()
modelLR = LinearRegression()

# The scorer returns negated absolute errors (one per held-out row); take their magnitude
scoresLR = absolute(
    cross_val_score(
        modelLR,
        X_train_scaled,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cvLR,
        n_jobs=-1,
    )
)

# mean and standard deviation of the per-fold absolute errors
print('MAE: %.3f (%.3f)' % (mean(scoresLR), std(scoresLR)))

MAE: 753.715 (722.317)


In [20]:
# Degree-2 polynomial regression on ALL scaled features, scored with leave-one-out CV
cvPF1 = LeaveOneOut()

# Expand the features to degree 2
pf1 = PolynomialFeatures(degree=2)
X_train_squared1 = pf1.fit_transform(X_train_scaled)

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# this line needs the older release the notebook was run with.
modelPF1 = LinearRegression(normalize=True)

# The scorer returns negated absolute errors (one per held-out row); take their magnitude
scoresPF1 = absolute(
    cross_val_score(
        modelPF1,
        X_train_squared1,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cvPF1,
        n_jobs=-1,
    )
)

# NOTE(review): the recorded result for this cell is ~4.5e14, far above the baseline.
# Presumably the expanded feature matrix is too wide for the number of rows -- confirm.
print('MAE: %.3f (%.3f)' % (mean(scoresPF1), std(scoresPF1)))

MAE: 447076735235850.938 (1795309707961532.750)


In [21]:
# Degree-2 polynomial regression on the top-4 feature subset (X_top4), scored with leave-one-out CV
cvPF2 = LeaveOneOut()

# Expand the four selected features to degree 2
pf2 = PolynomialFeatures(degree=2)
X_train_squared2 = pf2.fit_transform(X_top4)

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# this line needs the older release the notebook was run with.
modelPF2 = LinearRegression(normalize=True)

# The scorer returns negated absolute errors (one per held-out row); take their magnitude
scoresPF2 = absolute(
    cross_val_score(
        modelPF2,
        X_train_squared2,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cvPF2,
        n_jobs=-1,
    )
)

# mean and standard deviation of the per-fold absolute errors
print('MAE: %.3f (%.3f)' % (mean(scoresPF2), std(scoresPF2)))

MAE: 752.923 (689.309)


# Test 

In [22]:
# Fit the LassoLars estimator used in cross-validation on the full training set
# (cross_val_score only fits clones, so modelLL itself is still unfitted here)
modelLL.fit(X_train_scaled, y_train)
# use the fitted model to predict on the held-out test set
lars_test_pred = modelLL.predict(X_test_scaled)
# Test MAE gets its own name: reusing `lars_MAE` would overwrite the TRAIN score
# that the results table reads, so re-running that cell would show the test score.
lars_test_MAE = mean_absolute_error(y_test, lars_test_pred)
print('MAE: %.3f' % lars_test_MAE)

MAE: 853.068


# new modeling section

In [23]:
# decided to use mean and mean absolute error for evaluation - this is now the baseline
# (same call as the earlier baseline cell; repeated so this section reads on its own)

# MAE
mean_baseMAE, basepred1 = model_MAE.get_baseline_mean(y_train)

Baseline MAE: 973.0351509141836


## figure out how to make cross validate function

In [24]:
# run one linear regression through the cross-validation helper
# (prints the same "MAE: mean (std)" line as the manual LOOCV cell for all features)
LRmeanMAE = model_MAE.cvLinearReg(X_train_scaled, y_train) 

MAE: 753.715 (722.317)


In [25]:
# Linear Regression
# Cross-validate the model on every feature subset.
# Order of df2test defines the order of the printed scores and of the result list.
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# Linear Regression Models
# The iteration variable is not named `df`, so the DataFrame returned by
# wrangle.wrangle_data() is not overwritten with a feature subset.
cvlm_MAE_list = [model_MAE.cvLinearReg(X_subset, target) for X_subset in df2test]

MAE: 781.251 (723.733)
MAE: 736.697 (699.657)
MAE: 872.290 (741.162)
MAE: 848.118 (733.300)
MAE: 824.294 (728.993)
MAE: 753.715 (722.317)
MAE: 782.547 (718.692)


In [26]:
# Lasso Lars
# Cross-validate the model on every feature subset.
# Order of df2test defines the order of the printed scores and of the result list.
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# LassoLars Models (third argument is 1 -- presumably alpha; confirm in model_MAE)
# The iteration variable is not named `df`, so the DataFrame returned by
# wrangle.wrangle_data() is not overwritten with a feature subset.
cvll_MAE_list = [model_MAE.cvLassoLars(X_subset, target, 1) for X_subset in df2test]

MAE: 782.183 (722.924)
MAE: 739.965 (698.054)
MAE: 874.089 (739.091)
MAE: 847.785 (727.430)
MAE: 819.720 (718.520)
MAE: 745.523 (707.596)
MAE: 782.106 (719.369)


In [27]:
# Random Forest, send in x scaled, y train, and # estimators
# Order of df2test defines the order of the printed scores and of the result list.
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# Random Forest Models, 4 estimators each
# NOTE(review): no seed is passed from here -- check whether model_MAE.cvRandomForest
# fixes random_state, otherwise these scores will vary between runs.
# The iteration variable is not named `df`, so the DataFrame returned by
# wrangle.wrangle_data() is not overwritten with a feature subset.
cvrf_MAE_list = [model_MAE.cvRandomForest(X_subset, target, 4) for X_subset in df2test]

MAE: 789.115 (711.202)
MAE: 795.197 (730.630)
MAE: 886.140 (734.965)
MAE: 864.634 (763.147)
MAE: 814.246 (773.835)
MAE: 867.101 (723.110)
MAE: 970.368 (855.503)


In [28]:
# Tweedie Regressor, send in x scaled, y train, power and alpha settings
# Order of df2test defines the order of the printed scores and of the result list.
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# Tweedie Regressor Models (power=1.5, alpha=1 per the argument order described above)
# The iteration variable is not named `df`, so the DataFrame returned by
# wrangle.wrangle_data() is not overwritten with a feature subset.
cvtw_MAE_list = [model_MAE.cvTweedie(X_subset, target, 1.5, 1) for X_subset in df2test]


MAE: 784.059 (715.274)
MAE: 746.744 (694.291)
MAE: 892.440 (738.460)
MAE: 863.749 (732.989)
MAE: 815.728 (728.231)
MAE: 736.236 (703.981)
MAE: 785.083 (717.517)


In [29]:
# Support Vector Regressor, send in x scaled, y train, kernel = 'rbf' or 'linear'
# Order of df2test defines the order of the printed scores and of the result list.
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# Support Vector Models, RBF kernel
# Stored under a kernel-specific name: the linear-kernel cell assigns `cvSVR_MAE_list`,
# which would otherwise overwrite these results.
# NOTE(review): the recorded output for this cell prints NEGATIVE MAE values, unlike
# the other cv helpers -- presumably model_MAE.cvSVR does not take the absolute value
# of the neg_mean_absolute_error scores; confirm and fix in model_MAE.
# The iteration variable is not named `df`, so the DataFrame returned by
# wrangle.wrangle_data() is not overwritten with a feature subset.
cvSVR_rbf_MAE_list = [model_MAE.cvSVR(X_subset, target, 'rbf') for X_subset in df2test]


MAE: -931.598 (873.853)
MAE: -944.111 (877.222)
MAE: -950.422 (875.754)
MAE: -947.592 (876.159)
MAE: -945.986 (875.699)
MAE: -945.569 (875.402)
MAE: -933.094 (875.387)


In [30]:
# Support Vector Regressor, send in x scaled, y train, kernel = 'rbf' or 'linear'
# Order of df2test defines the order of the printed scores and of the result list.
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# Support Vector Models, linear kernel
# NOTE(review): the recorded output for this cell prints NEGATIVE MAE values, unlike
# the other cv helpers -- presumably model_MAE.cvSVR does not take the absolute value
# of the neg_mean_absolute_error scores; confirm and fix in model_MAE.
# The iteration variable is not named `df`, so the DataFrame returned by
# wrangle.wrangle_data() is not overwritten with a feature subset.
cvSVR_MAE_list = [model_MAE.cvSVR(X_subset, target, 'linear') for X_subset in df2test]


MAE: -945.721 (875.750)
MAE: -945.987 (873.534)
MAE: -957.058 (875.437)
MAE: -955.889 (875.657)
MAE: -941.801 (870.728)
MAE: -918.582 (859.963)
MAE: -947.493 (877.127)


In [None]:
# linear regression Results table: coefficients of a model fitted on the top-4 features
# cross_val_score only fits clones of the estimator it is given, so `modelLR` never
# gets a `coef_`; fit a dedicated model on the same columns the table lists.
lr_top4 = LinearRegression().fit(X_top4, y_train)

lr_result = pd.DataFrame({
    'features': X_top4.columns.tolist(),
    # ravel() flattens coef_ whether it is 1-D or shaped (1, n_features)
    'coefs': lr_top4.coef_.ravel(),
})
lr_result['abs_coefs'] = lr_result['coefs'].abs()

# largest-magnitude coefficients first
lr_result.sort_values(by='abs_coefs', ascending=False).reset_index()