In [33]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV

sns.set_palette('colorblind')

In [2]:
df = pd.read_csv('../data/total_df.csv')

In [3]:
len(df.columns)

238

In [4]:
for col in df.columns: print(col)

zip_code
Yearly_Avg_Zip
FL_Unemployment
year
Encoded_Zip
total_pop
households
male_pop
female_pop
median_age
male_under_5
male_5_to_9
male_10_to_14
male_15_to_17
male_18_to_19
male_20
male_21
male_22_to_24
male_25_to_29
male_30_to_34
male_35_to_39
male_40_to_44
male_45_to_49
male_50_to_54
male_55_to_59
male_65_to_66
male_67_to_69
male_70_to_74
male_75_to_79
male_80_to_84
male_85_and_over
female_under_5
female_5_to_9
female_10_to_14
female_15_to_17
female_18_to_19
female_20
female_21
female_22_to_24
female_25_to_29
female_30_to_34
female_35_to_39
female_40_to_44
female_45_to_49
female_50_to_54
female_55_to_59
female_60_to_61
female_62_to_64
female_65_to_66
female_67_to_69
female_70_to_74
female_75_to_79
female_80_to_84
female_85_and_over
white_pop
population_1_year_and_over
population_3_years_over
pop_5_years_over
pop_16_over
pop_25_years_over
pop_25_64
not_us_citizen_pop
black_pop
asian_pop
hispanic_pop
amerindian_pop
other_race_pop
two_or_more_races_pop
hispanic_any_race
not_hispanic_

In [5]:
df.head()

Unnamed: 0,zip_code,Yearly_Avg_Zip,FL_Unemployment,year,Encoded_Zip,total_pop,households,male_pop,female_pop,median_age,...,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,FLSTHPI_Yearly_Avg
0,33160,1988.0,5.525,2015,73,39235,19187,19087,20148,48.8,...,5554.0,1312,994,1283,6732,1737,12501.0,15531.0,7593.0,348.4425
1,33025,1433.25,5.525,2015,41,60994,21770,28737,32257,33.4,...,9833.0,3150,3155,3604,18386,5038,31916.0,16088.0,6345.0,348.4425
2,33139,2046.25,5.525,2015,61,38035,20149,20756,17279,38.8,...,5834.0,922,690,491,5580,1745,12882.0,17215.0,9157.0,348.4425
3,33024,1356.083333,5.525,2015,40,71860,22838,35172,36688,37.6,...,11047.0,3785,3863,3637,19756,5345,33887.0,27660.0,12738.0,348.4425
4,32256,1074.833333,5.525,2015,10,41609,18606,19462,22147,33.7,...,6483.0,1854,1751,1887,10474,2926,28801.0,3365.0,1401.0,348.4425


In [8]:
df.year

0      2015
1      2015
2      2015
3      2015
4      2015
       ... 
475    2018
476    2018
477    2018
478    2018
479    2018
Name: year, Length: 480, dtype: int64

In [19]:
cols = ['white_pop', \
        'black_pop', \
        'asian_pop', \
        'hispanic_pop', \
        'amerindian_pop', \
        'other_race_pop', \
        'two_or_more_races_pop', \
        'hispanic_any_race', \
        'not_hispanic_pop', \
        'asian_male_45_54', \
        'asian_male_55_64', \
        'black_male_45_54', \
        'black_male_55_64', \
        'hispanic_male_45_54', \
        'hispanic_male_55_64', \
        'white_male_45_54', \
        'white_male_55_64', \
        'speak_only_english_at_home', \
        'speak_spanish_at_home', \
        'speak_spanish_at_home_low_english']
no_race = df.drop(columns = cols)

In [20]:
df_2015, df_2016, df_2017, df_2018 = (no_race[no_race.year == year] for year in no_race.year.unique())

In [21]:
for col in df_2015.columns: print(col)

zip_code
Yearly_Avg_Zip
FL_Unemployment
year
Encoded_Zip
total_pop
households
male_pop
female_pop
median_age
male_under_5
male_5_to_9
male_10_to_14
male_15_to_17
male_18_to_19
male_20
male_21
male_22_to_24
male_25_to_29
male_30_to_34
male_35_to_39
male_40_to_44
male_45_to_49
male_50_to_54
male_55_to_59
male_65_to_66
male_67_to_69
male_70_to_74
male_75_to_79
male_80_to_84
male_85_and_over
female_under_5
female_5_to_9
female_10_to_14
female_15_to_17
female_18_to_19
female_20
female_21
female_22_to_24
female_25_to_29
female_30_to_34
female_35_to_39
female_40_to_44
female_45_to_49
female_50_to_54
female_55_to_59
female_60_to_61
female_62_to_64
female_65_to_66
female_67_to_69
female_70_to_74
female_75_to_79
female_80_to_84
female_85_and_over
population_1_year_and_over
population_3_years_over
pop_5_years_over
pop_16_over
pop_25_years_over
pop_25_64
not_us_citizen_pop
median_income
income_per_capita
income_less_10000
income_10000_14999
income_15000_19999
income_20000_24999
income_25000_29999


In [25]:
df_lst = [df_2015, df_2016, df_2017, df_2018]

In [36]:
df.shape

(112, 218)

In [34]:
models = []
params = {'bootstrap': [True, False], \
          'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], \
          'max_features': ['auto', 'sqrt'], \
          'min_samples_leaf': [1, 2, 4], \
          'min_samples_split': [2, 5, 10], \
          'n_estimators': np.arange(200, 2000, 200)}

for df in df_lst:
    x_train, x_test, y_train, y_test = train_test_split(df.drop(columns = 'Yearly_Avg_Zip'), df.Yearly_Avg_Zip)
    grid = GridSearchCV(RandomForestRegressor(), params, cv = 5)
    grid.fit(x_train, y_train)
    models.append({'model': grid.best_estimator_, \
                   'train_score': cross_val_score(grid.best_estimator_, x_train, y_train), \
                   'test_score': grid.best_estimator_.score(x_test, y_test)})

forest_2015, forest_2016, forest_2017, forest_2018 = models

KeyboardInterrupt: 

In [None]:
forest_2015['test_score']