In [1]:
import pandas as pd
import wrangle
import model

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt
from scipy import stats

In [2]:
# Acquire and prepare the data through the project-local `wrangle` module.
# The call returns six objects, unpacked in order below.
# NOTE(review): judging by the names, `df` is the full prepared frame, `train_exp`
# an unscaled train split for exploration, and the rest are scaled features and
# targets for train/test -- confirm against wrangle.py.
df, train_exp, X_train_scaled, y_train, X_test_scaled, y_test = wrangle.wrangle_data()

Acquire: compiling raw data files...
Acquire: Completed!
Prepare: preparing data files...
Prepare: Completed!
(289, 25) (73, 25)


# Create Baseline

In [3]:
# Mean and median of the target column; both are tried as baseline predictors below.
y_train.tract_cases_per_100k.mean(), y_train.tract_cases_per_100k.median()

(3221.15743847494, 2911.864043317428)

In [15]:
# using mean
# Returns the baseline RMSE and a second object stored as `basepred`
# (presumably the baseline predictions -- confirm in model.py).
mean_baseRMSE, basepred = model.get_baseline_mean(y_train)

Baseline RMSE: 1286.801141497012


In [16]:
# using median
# NOTE(review): `basepred` is reassigned here, so the value returned by
# model.get_baseline_mean in the earlier cell is no longer available afterwards.
median_baseRMSE, basepred = model.get_baseline_median(y_train)

Baseline RMSE: 1323.4498789323586


# Feature Ranking

In [6]:
# Rank the scaled training features against the target.
# NOTE(review): the ranking method lives in model.feature_ranking and is not
# visible in this notebook -- confirm which selector it uses.
rankdf = model.feature_ranking(X_train_scaled, y_train)
rankdf

Unnamed: 0_level_0,features
rank,Unnamed: 1_level_1
1,rank_svi_scaled
2,f_nohsdp_soci
3,f_minrty_status
4,f_groupq_trans
5,f_comp_total_scaled
6,f_unemp_soci
7,f_disabl_comp
8,f_noveh_trans
9,f_mobile_trans
10,f_age65_comp


# Create subsets of X_train_scaled with selected features to test

## Feature groups to use for OLS, PF, and Tweedie models

In [7]:
# Column lists for each candidate feature set. Every X_* frame below is a
# column subset of X_train_scaled.

# binned svi score by CDC range category = 1st ranked
svi_rank_cols = ['rank_svi_scaled']

# top 4 ranked features
# NOTE(review): the variable is named X_top5 (and labelled 'top5' in the results
# table) but holds only the four highest-ranked columns -- confirm whether the
# 5th-ranked feature was meant to be included.
top_ranked_cols = ['rank_svi_scaled', 'f_nohsdp_soci', 'f_minrty_status', 'f_groupq_trans']

# only the summary of the flags = 19th ranked
all_flags_cols = ['all_flags_total_scaled']

# only summary flags, should be the same as all flags total? = 5th, 12th, 15th, 21st
summary_flag_cols = [
    'f_comp_total_scaled',
    'f_soci_total_scaled',
    'f_status_total_scaled',
    'f_trans_total_scaled',
]

# all individual flags
individual_flag_cols = [
    'f_nohsdp_soci', 'f_minrty_status', 'f_groupq_trans',
    'f_unemp_soci', 'f_disabl_comp', 'f_noveh_trans',
    'f_mobile_trans', 'f_age65_comp', 'f_age17_comp',
    'f_pov_soci', 'f_limeng_status', 'f_crowd_trans',
    'f_pci_soci', 'f_sngpnt_comp', 'f_munit_trans',
]

X_rank_svi_only = X_train_scaled[svi_rank_cols]
X_top5 = X_train_scaled[top_ranked_cols]
X_all_flags_only = X_train_scaled[all_flags_cols]
X_summary_flags = X_train_scaled[summary_flag_cols]
X_not_summary_flags = X_train_scaled[individual_flag_cols]

## LassoLars
- LassoLars performs feature selection as part of fitting, so this model is trained on all features.

## TODO: Run all features on the other models as well? (return here)

## Drilling in features to test

- What is the score when using only one summary flag at a time?
    - If one summary group scores better, it may be worth investigating which features within that group have the greatest impact.

# Run df through regression algorithms

In [8]:
# Feature sets to evaluate. The order matters: the results table built later
# labels its rows positionally from this list.
df2test = [X_rank_svi_only, X_top5, X_all_flags_only, X_summary_flags, X_not_summary_flags, X_train_scaled]
target = y_train

# Each list below holds one train RMSE per entry of df2test, in the same order.
# The iteration variable is deliberately not called `df`: that name is bound to
# the frame returned by wrangle.wrangle_data(), and looping with `for df in ...`
# would silently rebind it to the last feature set.

# Linear Regression Models
lm_RMSE_list = [model.linear_reg_train(feature_set, target) for feature_set in df2test]

# Polynomial Features 2 degree
pf2_RMSE_list = [model.polynomial2(feature_set, target) for feature_set in df2test]

# Tweedie power=0, alpha=.5
tweedie05 = [model.tweedie05(feature_set, target) for feature_set in df2test]

# adjusted the hyperparameter of alpha from .1 to 1 with worse result
# adjusted again to alpha = .001 for these better results
# NOTE(review): the heading above says alpha=.5 while these notes mention .1, 1
# and .001 -- confirm which alpha model.tweedie05 actually uses.

In [9]:
# LassoLars Models
# Fitted once, on the full scaled feature set only (no per-feature-group loop),
# so `lars_RMSE` is a single result rather than one value per feature set.
lars_RMSE = model.lasso_lars(X_train_scaled, target)

# Summarize in results df

In [18]:
# Collect the train RMSE of every model into one table, one row per feature set.
# Labels are listed in the same order the feature sets were trained in.
df_list = ['rank_svi_only', 'top5', 'total_all_flags_only', 'summary_flags', 'not_summary_flags', 'all_features']

# LassoLars and the two baselines are single values, so they repeat on every
# row; they serve as reference columns, not per-feature-set results.
results = pd.DataFrame({'Features': df_list}).assign(
    LinearRegression_RMSE=lm_RMSE_list,
    Tweedie05_RMSE=tweedie05,
    PolynomialFeatures2_RMSE=pf2_RMSE_list,
    LassoLars_RMSE=lars_RMSE,
    Base_mean_RMSE=mean_baseRMSE,
    Base_median_RMSE=median_baseRMSE,
)

# Show best (lowest) polynomial-features RMSE first
results.sort_values(by='PolynomialFeatures2_RMSE')

Unnamed: 0,Features,LinearRegression_RMSE,Tweedie05_RMSE,PolynomialFeatures2_RMSE,LassoLars_RMSE,Base_mean_RMSE,Base_median_RMSE
5,all_features,966.672482,1101.718371,659.460849,969.625809,1286.801141,1323.449879
4,not_summary_flags,1046.886544,1170.546282,782.87832,969.625809,1286.801141,1323.449879
1,top5,1013.311,1157.782046,940.957308,969.625809,1286.801141,1323.449879
3,summary_flags,1121.696402,1246.761135,1054.411214,969.625809,1286.801141,1323.449879
0,rank_svi_only,1079.110092,1205.668551,1066.971168,969.625809,1286.801141,1323.449879
2,total_all_flags_only,1162.133618,1267.462807,1146.589076,969.625809,1286.801141,1323.449879
