In [25]:
from math import sqrt

import pandas as pd
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

import wrangle
import model
import model_MAE

In [26]:
# wrangle_data() runs the acquire and prepare steps (see the log it prints) and
# returns six objects, unpacked here into the names used by the rest of the notebook.
df, train_exp, X_train_scaled, y_train, X_test_scaled, y_test = wrangle.wrangle_data()

Acquire: compiling raw data files...
Acquire: Completed!
Prepare: preparing data files...
Prepare: Completed!
(289, 25) (73, 25)


In [27]:
# List the feature columns available for modeling
X_train_scaled.columns

Index(['raw_svi', 'f_pov_soci', 'f_unemp_soci', 'f_pci_soci', 'f_nohsdp_soci',
       'f_age65_comp', 'f_age17_comp', 'f_disabl_comp', 'f_sngpnt_comp',
       'f_minrty_status', 'f_limeng_status', 'f_munit_trans', 'f_mobile_trans',
       'f_crowd_trans', 'f_noveh_trans', 'f_groupq_trans',
       'f_soci_total_scaled', 'f_comp_total_scaled', 'f_status_total_scaled',
       'f_trans_total_scaled', 'all_flags_total_scaled', 'rank_svi_scaled'],
      dtype='object')

# Create Baseline

In [28]:
# Mean and median of the target column; both are tried as baseline predictions below
y_train.tract_cases_per_100k.mean(), y_train.tract_cases_per_100k.median()

(3221.15743847494, 2911.864043317428)

In [29]:
# Baseline MAE using the mean (model_MAE.get_baseline_mean prints the score)
mean_baseMAE, basepred1 = model_MAE.get_baseline_mean(y_train)

Baseline MAE: 991.6409213845292


In [30]:
# Baseline RMSE using the mean (model.get_baseline_mean prints the score)
mean_baseRMSE, basepred = model.get_baseline_mean(y_train)

Baseline RMSE: 1286.801141497012


In [31]:
# Baseline MAE using the median (model_MAE.get_baseline_median prints the score)
median_baseMAE, basepred2 = model_MAE.get_baseline_median(y_train)

Baseline MAE: 972.441941022456


In [32]:
# Baseline RMSE using the median (model.get_baseline_median prints the score)
# NOTE(review): `basepred` is also assigned by the mean-based RMSE baseline cell,
# so this call replaces that earlier value.
median_baseRMSE, basepred = model.get_baseline_median(y_train)

Baseline RMSE: 1323.4498789323586


# Feature Ranking

In [33]:
# Rank the scaled features against the target with model.feature_ranking
# and display the resulting table (one feature per rank).
rankdf = model.feature_ranking(X_train_scaled, y_train)
rankdf

Unnamed: 0_level_0,features
rank,Unnamed: 1_level_1
1,rank_svi_scaled
2,f_nohsdp_soci
3,f_minrty_status
4,f_groupq_trans
5,f_comp_total_scaled
6,f_unemp_soci
7,f_disabl_comp
8,f_noveh_trans
9,f_mobile_trans
10,f_age65_comp


# Create X_train_scaled df with selected features to test

## Feature groups to use for OLS, PF, and Tweedie models

In [34]:
# Raw SVI score on its own
X_raw_svi = X_train_scaled[
    ['raw_svi']
]

# SVI score binned by CDC range category -- the 1st ranked feature
X_rank_svi_only = X_train_scaled[
    ['rank_svi_scaled']
]

# The four highest ranked features
X_top4 = X_train_scaled[
    [
        'rank_svi_scaled',
        'f_nohsdp_soci',
        'f_minrty_status',
        'f_groupq_trans',
    ]
]

# Grand total of all flags on its own (author's note: ranked 19th)
X_all_flags_only = X_train_scaled[
    ['all_flags_total_scaled']
]

# The four per-theme flag totals (author's note: ranked 5th, 12th, 15th, 21st);
# open question from the author: should these carry the same signal as the grand total?
X_summary_flags = X_train_scaled[
    [
        'f_comp_total_scaled',
        'f_soci_total_scaled',
        'f_status_total_scaled',
        'f_trans_total_scaled',
    ]
]

# Every individual flag, leaving out the totals and the SVI scores
X_not_summary_flags = X_train_scaled[
    [
        'f_nohsdp_soci',
        'f_minrty_status',
        'f_groupq_trans',
        'f_unemp_soci',
        'f_disabl_comp',
        'f_noveh_trans',
        'f_mobile_trans',
        'f_age65_comp',
        'f_age17_comp',
        'f_pov_soci',
        'f_limeng_status',
        'f_crowd_trans',
        'f_pci_soci',
        'f_sngpnt_comp',
        'f_munit_trans',
    ]
]

## LassoLars
- LassoLars performs feature selection as part of fitting, so this model is trained on all features.

## Drilling in features to test -- RETURN HERE

- What is the score when using only one summary flag at a time?
    - If one summary group scores better, it may be worth investigating which features within that group have the greatest impact.

# Run df through regression algorithms

In [35]:
# Feature groups to evaluate; the results table labels its rows in this same order
df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled, X_raw_svi]
target = y_train

# The loop variable is `X_subset` rather than `df`: iterating with `df` overwrote the
# full dataframe returned by wrangle.wrangle_data(), leaving `df` bound to the last
# feature subset after this cell ran.

# Linear Regression Models
lm_MAE_list = []
for X_subset in df2test:
    lm_MAE = model_MAE.linear_reg_train(X_subset, target)
    lm_MAE_list.append(lm_MAE)

# Polynomial Features 2 degree
pf2_MAE_list = []
for X_subset in df2test:
    pf2_MAE = model_MAE.polynomial2(X_subset, target)
    pf2_MAE_list.append(pf2_MAE)

# Tweedie power=0, alpha=.5
tweedie05 = []
for X_subset in df2test:
    tw_MAE = model_MAE.tweedie05(X_subset, target)
    tweedie05.append(tw_MAE)


In [36]:
# NOTE(review): disabled RMSE variant of the model loops above, kept for reference.
# Before re-enabling: the feature group is named X_top4 in this notebook (there is no
# X_top5), X_raw_svi is missing from the list below, and the Tweedie results are stored
# in `tweedie05`, which would overwrite the MAE list of the same name.

# # create variables for loop
# df2test = [X_rank_svi_only, X_top4, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled]
# target = y_train

# # Linear Regression Models
# lm_RMSE_list = []
# for df in df2test:
#     lm_RMSE = model.linear_reg_train(df, target)
#     lm_RMSE_list.append(lm_RMSE)

# # Polynomial Features 2 degree
# pf2_RMSE_list = []
# for df in df2test:
#     pf2_RMSE = model.polynomial2(df, target)
#     pf2_RMSE_list.append(pf2_RMSE)

# # Tweedie power=0, alpha=.5
# tweedie05 = []
# for df in df2test:
#     tw_RMSE = model.tweedie05(df, target)
#     tweedie05.append(tw_RMSE)


In [37]:
# LassoLars on all scaled features, scored with MAE on the training data
lars_MAE = model_MAE.lasso_lars(X_train_scaled, target)

In [38]:
# LassoLars on all scaled features, scored with RMSE on the training data
lars_RMSE = model.lasso_lars(X_train_scaled, target)

# Summarize in results df

In [47]:
# Row labels for the train results, one per feature group, in the same order as df2test
df_list = ['rank_svi_only', 'top4', 'total_all_flags_only', 'summary_flags', 'not_summary_flags', 'all_features', 'raw_svi_only']

# One row per feature group, one column per model, plus the two baselines.
# The LassoLars and baseline columns show the same value on every row.
results = pd.DataFrame({'Features': df_list}).assign(
    LinearRegression_MAE=lm_MAE_list,
    Tweedie05_MAE=tweedie05,
    PolynomialFeatures2_MAE=pf2_MAE_list,
    LassoLars_MAE=lars_MAE,
    Base_mean_MAE=mean_baseMAE,
    Base_median_MAE=median_baseMAE,
)
# Show the table ordered by the degree-2 polynomial model's MAE (lowest first)
results.sort_values('PolynomialFeatures2_MAE')

Unnamed: 0,Features,LinearRegression_MAE,Tweedie05_MAE,PolynomialFeatures2_MAE,LassoLars_MAE,Base_mean_MAE,Base_median_MAE
5,all_features,694.465209,827.63747,488.669323,743.214111,991.640921,972.441941
4,not_summary_flags,786.881614,900.81027,528.357985,743.214111,991.640921,972.441941
1,top4,716.161912,885.014499,710.919196,743.214111,991.640921,972.441941
6,raw_svi_only,781.75109,942.005025,772.131633,743.214111,991.640921,972.441941
0,rank_svi_only,783.145392,917.511392,781.326882,743.214111,991.640921,972.441941
3,summary_flags,823.656119,962.109229,785.635693,743.214111,991.640921,972.441941
2,total_all_flags_only,857.440941,976.4314,857.232422,743.214111,991.640921,972.441941


In [40]:
# NOTE(review): disabled RMSE version of the results table, kept for reference.
# It depends on the RMSE lists built by the disabled RMSE loop cell, and its 'top5'
# label predates the current X_top4 feature group.

# # create dataframe for results of all train models
# df_list = ['rank_svi_only', 'top5', 'total_all_flags_only', 'summary_flags', 'not_summary_flags', 'all_features', 'raw_svi_only']

# results = pd.DataFrame(df_list, columns=['Features'])
# results['LinearRegression_RMSE'] = lm_RMSE_list
# results['Tweedie05_RMSE'] = tweedie05
# results['PolynomialFeatures2_RMSE'] = pf2_RMSE_list
# results['LassoLars_RMSE'] = lars_RMSE
# results['Base_mean_RMSE'] = mean_baseRMSE
# results['Base_median_RMSE'] = median_baseRMSE
# results.sort_values('PolynomialFeatures2_RMSE')

# Validate Stage

## pick models to validate
- LassoLars
- LR all features
- PF2 all features
- PF2 top4


In [41]:
from numpy import mean
from numpy import std, absolute
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_error

In [42]:
# LassoLars -- leave-one-out cross-validation on all scaled features
modelLL = LassoLars(alpha=1)
cvLL = LeaveOneOut()
# the scorer returns negated MAE values; take the absolute value to get positive errors
scoresLL = absolute(
    cross_val_score(
        modelLL,
        X_train_scaled,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cvLL,
        n_jobs=-1,
    )
)
# mean and standard deviation of the per-split absolute errors
print('MAE: %.3f (%.3f)' % (mean(scoresLL), std(scoresLL)))

MAE: 743.953 (739.219)


In [43]:
# Linear Regression -- leave-one-out cross-validation on all scaled features
modelLR = LinearRegression()
cvLR = LeaveOneOut()
# the scorer returns negated MAE values; take the absolute value to get positive errors
scoresLR = absolute(
    cross_val_score(
        modelLR,
        X_train_scaled,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cvLR,
        n_jobs=-1,
    )
)
# mean and standard deviation of the per-split absolute errors
print('MAE: %.3f (%.3f)' % (mean(scoresLR), std(scoresLR)))

MAE: 751.477 (743.994)


In [44]:
# Polynomial Features 2 degrees - all features
# create loocv procedure
cvPF1 = LeaveOneOut()

# create PF object
pf1 = PolynomialFeatures(degree=2)
# Expand the scaled features into the full set of degree-2 polynomial terms
X_train_squared1 = pf1.fit_transform(X_train_scaled)

# create model
# `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where it raises a TypeError. Scaling in a pipeline ahead of the regression is the
# documented replacement; the scaler is re-fit on the training part of every split.
modelPF1 = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
# evaluate model
scoresPF1 = cross_val_score(modelPF1, X_train_squared1, y_train, scoring='neg_mean_absolute_error', cv=cvPF1, n_jobs=-1)
# force positive
scoresPF1 = absolute(scoresPF1)
# report performance
# NOTE(review): the MAE recorded for this model (~6e14) is many orders of magnitude
# above the baseline MAE, so the fit looks numerically unstable with this many
# polynomial terms -- investigate before relying on this model.
print('MAE: %.3f (%.3f)' % (mean(scoresPF1), std(scoresPF1)))

MAE: 611800868122157.250 (3405146298058511.500)


In [45]:
# Polynomial Features 2 degrees - top 4 ranked features (X_top4)
# create loocv procedure
cvPF2 = LeaveOneOut()

# create PF object
pf2 = PolynomialFeatures(degree=2)
# Expand the four selected features into the full set of degree-2 polynomial terms
X_train_squared2 = pf2.fit_transform(X_top4)

# create model
# `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where it raises a TypeError. Scaling in a pipeline ahead of the regression is the
# documented replacement; the scaler is re-fit on the training part of every split.
modelPF2 = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
# evaluate model
scoresPF2 = cross_val_score(modelPF2, X_train_squared2, y_train, scoring='neg_mean_absolute_error', cv=cvPF2, n_jobs=-1)
# force positive
scoresPF2 = absolute(scoresPF2)
# report performance
print('MAE: %.3f (%.3f)' % (mean(scoresPF2), std(scoresPF2)))

MAE: 753.841 (672.284)


# Test 

In [46]:
# Refit the cross-validated LassoLars model on the whole training set
modelLL.fit(X_train_scaled, y_train)
# use the fitted model to predict on the test set
lars_test_pred = modelLL.predict(X_test_scaled)
# get the MAE of test
# Stored under its own name: assigning it to `lars_MAE` overwrote the training MAE from
# model_MAE.lasso_lars, so re-running the results-table cell afterwards put the test
# score into the train 'LassoLars_MAE' column.
lars_test_MAE = mean_absolute_error(y_test, lars_test_pred)
print('MAE: %.3f' % lars_test_MAE)

MAE: 743.214
