In [695]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [696]:
df = pd.read_csv("./data/total_df.csv")

In [697]:
df.head()

Unnamed: 0,zip_code,Yearly_Avg_Zip,FL_Unemployment,year,Encoded_Zip,total_pop,households,male_pop,female_pop,median_age,...,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,FLSTHPI_Yearly_Avg
0,33160,1866.833333,7.533333,2013,73,37674,19247,18472,19202,49.6,...,4713.0,2773.0,7911.0,4713.0,935,1346,965,6471,2019,291.655
1,33025,1341.083333,7.533333,2013,41,57766,21206,27852,29914,32.9,...,8948.0,5448.0,10806.0,8948.0,2963,3682,3345,18163,4987,291.655
2,33139,1842.666667,7.533333,2013,61,38066,20883,22090,15976,38.1,...,4865.0,7164.0,9995.0,4865.0,685,649,379,5387,1906,291.655
3,32256,947.75,7.533333,2013,10,40024,18039,18865,21159,32.7,...,6247.0,2597.0,11070.0,6247.0,1638,1598,1629,10962,3429,291.655
4,33009,1443.416667,7.533333,2013,36,39889,19125,19256,20633,47.1,...,4958.0,3977.0,5087.0,4958.0,1335,1020,1249,6495,1885,291.655


In [698]:
df.year.unique()

array([2013, 2014, 2015, 2016, 2017, 2018])

In [699]:
df.shape

(645, 219)

In [700]:
df.columns.tolist()

['zip_code',
 'Yearly_Avg_Zip',
 'FL_Unemployment',
 'year',
 'Encoded_Zip',
 'total_pop',
 'households',
 'male_pop',
 'female_pop',
 'median_age',
 'male_under_5',
 'male_5_to_9',
 'male_10_to_14',
 'male_15_to_17',
 'male_18_to_19',
 'male_20',
 'male_21',
 'male_22_to_24',
 'male_25_to_29',
 'male_30_to_34',
 'male_35_to_39',
 'male_40_to_44',
 'male_45_to_49',
 'male_50_to_54',
 'male_55_to_59',
 'male_65_to_66',
 'male_67_to_69',
 'male_70_to_74',
 'male_75_to_79',
 'male_80_to_84',
 'male_85_and_over',
 'female_under_5',
 'female_5_to_9',
 'female_10_to_14',
 'female_15_to_17',
 'female_18_to_19',
 'female_20',
 'female_21',
 'female_22_to_24',
 'female_25_to_29',
 'female_30_to_34',
 'female_35_to_39',
 'female_40_to_44',
 'female_45_to_49',
 'female_50_to_54',
 'female_55_to_59',
 'female_60_to_61',
 'female_62_to_64',
 'female_65_to_66',
 'female_67_to_69',
 'female_70_to_74',
 'female_75_to_79',
 'female_80_to_84',
 'female_85_and_over',
 'population_1_year_and_over',
 'popu

# Remove Features That Can Cause Leakage

In [701]:
df=df.drop([
    'renter_occupied_housing_units_paying_cash_median_gross_rent', 
    'median_rent', 
    'percent_income_spent_on_rent', 
    'rent_burden_not_computed', 
    'rent_over_50_percent',
    'rent_40_to_50_percent',
    'rent_35_to_40_percent',
    'rent_30_to_35_percent',
    'rent_25_to_30_percent',
    'rent_20_to_25_percent',
    'rent_15_to_20_percent',
    'rent_10_to_15_percent',
    'rent_under_10_percent'], axis=1
)

## Get a list of duplicated columns

In [702]:
def getDuplicateColumns(df):
    '''
    Get a list of duplicate columns.
    It will iterate over all the columns in dataframe and find the columns whose contents are duplicate.
    :param df: Dataframe object
    :return: List of columns whose contents are duplicates.
    '''
    duplicateColumnNames = set()
    # Iterate over all the columns in dataframe
    for x in range(df.shape[1]):
        # Select column at xth index.
        col = df.iloc[:, x]
        # Iterate over all the columns in DataFrame from (x+1)th index till end
        for y in range(x + 1, df.shape[1]):
            # Select column at yth index.
            otherCol = df.iloc[:, y]
            # Check if two columns at x 7 y index are equal
            if col.equals(otherCol):
                duplicateColumnNames.add(df.columns.values[y])
    return list(duplicateColumnNames)
#from https://thispointer.com/how-to-find-drop-duplicate-columns-in-a-dataframe-python-pandas/

In [703]:
dup_cols=getDuplicateColumns(df)
dup_cols

['management_business_sci_arts_employed',
 'occupied_housing_units',
 'sales_office_employed']

In [704]:
df=df.drop(dup_cols, axis=1)

# Standardization/Standard Scaling

In [705]:
df_preprocess=df.drop(["zip_code", "year", "Yearly_Avg_Zip"], axis=1)

In [706]:
df_train=pd.DataFrame()

In [707]:
from sklearn.preprocessing import StandardScaler
standard=StandardScaler() #give each column the same range of values and centered around 0
standard.fit(df_preprocess)
df_train=pd.DataFrame(standard.transform(df_preprocess), 
                      index=df_preprocess.index, 
                      columns=df_preprocess.columns)

In [708]:
df_train.head()

Unnamed: 0,FL_Unemployment,Encoded_Zip,total_pop,households,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,...,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,FLSTHPI_Yearly_Avg
0,2.005619,0.068493,0.094607,1.184796,0.11919,0.071695,1.608587,-0.790637,-0.602021,0.123111,...,-0.581888,-0.347586,0.028752,-0.462638,-0.731641,-0.28278,-0.695893,-0.481108,-0.180243,-1.675376
1,2.005619,-0.719515,1.416572,1.604604,1.419377,1.400375,-1.041677,1.69104,1.69819,2.346499,...,0.11839,0.925189,1.984141,0.905133,1.357065,2.146299,1.566783,1.933062,1.853966,-1.675376
2,2.005619,-0.22701,0.120399,1.535386,0.620691,-0.328448,-0.216445,-0.209889,-0.826005,-1.20014,...,-0.49773,-0.654671,0.098934,1.782551,-0.989125,-1.007552,-1.253006,-0.704932,-0.257691,-1.675376
3,2.005619,-1.482898,0.249227,0.925925,0.173665,0.314434,-1.073416,-0.04139,0.144594,0.165586,...,-0.562147,-0.720828,0.737032,-0.55263,-0.007598,-0.020739,-0.064626,0.446196,0.786143,-1.675376
4,2.005619,-0.842641,0.240344,1.158652,0.227863,0.249191,1.211841,0.169643,-0.322437,-0.373517,...,-0.214086,-0.419667,0.141874,0.152987,-0.319668,-0.62177,-0.425893,-0.476152,-0.272084,-1.675376


In [709]:
df_train["zip_code"]=df["zip_code"]
df_train["year"]=df["year"]
df_train["Yearly_Avg_Zip"]=df["Yearly_Avg_Zip"]

In [710]:
df_train.head()

Unnamed: 0,FL_Unemployment,Encoded_Zip,total_pop,households,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,...,occupation_services,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,FLSTHPI_Yearly_Avg,zip_code,year,Yearly_Avg_Zip
0,2.005619,0.068493,0.094607,1.184796,0.11919,0.071695,1.608587,-0.790637,-0.602021,0.123111,...,-0.462638,-0.731641,-0.28278,-0.695893,-0.481108,-0.180243,-1.675376,33160,2013,1866.833333
1,2.005619,-0.719515,1.416572,1.604604,1.419377,1.400375,-1.041677,1.69104,1.69819,2.346499,...,0.905133,1.357065,2.146299,1.566783,1.933062,1.853966,-1.675376,33025,2013,1341.083333
2,2.005619,-0.22701,0.120399,1.535386,0.620691,-0.328448,-0.216445,-0.209889,-0.826005,-1.20014,...,1.782551,-0.989125,-1.007552,-1.253006,-0.704932,-0.257691,-1.675376,33139,2013,1842.666667
3,2.005619,-1.482898,0.249227,0.925925,0.173665,0.314434,-1.073416,-0.04139,0.144594,0.165586,...,-0.55263,-0.007598,-0.020739,-0.064626,0.446196,0.786143,-1.675376,32256,2013,947.75
4,2.005619,-0.842641,0.240344,1.158652,0.227863,0.249191,1.211841,0.169643,-0.322437,-0.373517,...,0.152987,-0.319668,-0.62177,-0.425893,-0.476152,-0.272084,-1.675376,33009,2013,1443.416667


# Feature Selection (Forward Stepwise)

In [711]:
def forward_selection(data, target, significance_level=0.05):
    import statsmodels.api as sm
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features
#from here 
#https://www.analyticsvidhya.com/blog/2020/10/a-comprehensive-guide-to-feature-selection-using-wrapper-methods-in-python/

In [712]:
forward_selected_features=forward_selection(df_train.drop(columns = 'Yearly_Avg_Zip'), 
                                            df_train.Yearly_Avg_Zip, 
                                            significance_level=0.05)

  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Seri

  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Seri

In [713]:
forward_selected_features=sorted(forward_selected_features)
forward_selected_features.extend(
    ["zip_code", "year", "Yearly_Avg_Zip"])

In [714]:
forward_selected_features

['FLSTHPI_Yearly_Avg',
 'FL_Unemployment',
 'children',
 'children_in_single_female_hh',
 'commute_10_14_mins',
 'commute_35_44_mins',
 'commute_45_59_mins',
 'commute_less_10_mins',
 'commuters_by_bus',
 'different_house_year_ago_same_city',
 'dwellings_20_to_49_units',
 'dwellings_50_or_more_units',
 'dwellings_5_to_9_units',
 'employed_arts_entertainment_recreation_accommodation_food',
 'employed_public_administration',
 'employed_transportation_warehousing_utilities',
 'female_50_to_54',
 'female_5_to_9',
 'female_60_to_61',
 'female_62_to_64',
 'female_65_to_66',
 'gini_index',
 'group_quarters',
 'households_public_asst_or_food_stamps',
 'housing_units',
 'in_grades_5_to_8',
 'income_100000_124999',
 'income_125000_149999',
 'income_150000_199999',
 'income_15000_19999',
 'income_200000_or_more',
 'income_30000_34999',
 'income_40000_44999',
 'income_50000_59999',
 'less_one_year_college',
 'male_15_to_17',
 'male_25_to_29',
 'male_35_to_39',
 'male_45_64_associates_degree',
 'ma

In [715]:
df_train_final=pd.DataFrame()

In [716]:
for feature in forward_selected_features:
    df_train_final[feature]=df_train[feature]

In [717]:
df_train_final.head()

Unnamed: 0,FLSTHPI_Yearly_Avg,FL_Unemployment,children,children_in_single_female_hh,commute_10_14_mins,commute_35_44_mins,commute_45_59_mins,commute_less_10_mins,commuters_by_bus,different_house_year_ago_same_city,...,pop_25_years_over,some_college_and_associates_degree,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,unemployed_pop,vacant_housing_units_for_rent,walked_to_work,zip_code,year,Yearly_Avg_Zip
0,-1.675376,2.005619,-0.604674,-0.787676,-0.226279,2.314692,0.775059,-0.283637,-0.042198,-0.761251,...,0.541535,0.383085,0.33959,-0.248349,-0.412005,1.612705,0.303524,33160,2013,1866.833333
1,-1.675376,2.005619,1.82154,1.470348,0.892614,2.283387,2.167135,0.289622,-0.120738,0.660936,...,1.213586,1.883253,0.692808,-0.21312,1.801816,2.143672,-0.503603,33025,2013,1341.083333
2,-1.675376,2.005619,-0.851691,-0.942912,3.052789,0.157246,-0.599016,3.642101,1.428648,1.816176,...,0.591249,0.157775,-0.02912,0.720441,-0.469948,2.936081,9.543007,33139,2013,1842.666667
3,-1.675376,2.005619,-0.009587,-0.380379,2.120162,-0.884955,-0.875031,1.67379,-0.677661,2.861721,...,0.201552,0.256348,1.538673,-0.036977,0.216179,0.569637,-0.41227,32256,2013,947.75
4,-1.675376,2.005619,-0.399941,-0.187782,-0.93254,0.564213,0.041019,-0.704511,0.329083,0.312677,...,0.647456,0.471171,-0.317272,0.051095,1.025553,0.014412,-0.225357,33009,2013,1443.416667


In [718]:
df_2013_to_2017=df_train_final[df_train_final["year"].isin(
    [2013,2014,2015,2016,2017])]

In [719]:
df_2018=df_train_final[df_train_final["year"]==2018]

In [720]:
df_2013_to_2017.shape

(518, 61)

In [721]:
df_2018.shape

(127, 61)

In [724]:
df_check=df_train_final.drop(['zip_code','year','Yearly_Avg_Zip'], axis=1)

In [725]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def get_vif(df):
    vif = pd.DataFrame()
    vif['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    vif['variable'] = df.columns
    return vif.sort_values("VIF", ascending=False)   
#writing a function to get VIF among remaining features 
# using VIF to confirm if there's still multicollinearity exists 

In [727]:
get_vif(df_check)

Unnamed: 0,VIF,variable
51,268.509029,pop_25_years_over
2,213.219181,children
44,86.664728,married_households
49,75.945527,owner_occupied_housing_units
52,60.709623,some_college_and_associates_degree
46,59.313197,mortgaged_housing_units
0,53.497692,FLSTHPI_Yearly_Avg
1,52.6559,FL_Unemployment
3,38.13353,children_in_single_female_hh
48,37.669014,occupation_sales_office


## Linear Models on the 4 Biggest Metro Areas in FL

In [728]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV


In [729]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

In [730]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression 

In [731]:
%store -r miami_zip
%store -r orlando_zip
%store -r tampa_zip
%store -r jax_zip

## Miami

In [732]:
df_2013_to_2017_miami=df_2013_to_2017[
    df_2013_to_2017["zip_code"].isin(miami_zip)]

df_2013_to_2017_miami=df_2013_to_2017_miami.drop(["zip_code","year"],axis=1)
#dropping unique identifiers

In [733]:
df_2018_miami=df_2018[df_2018["zip_code"].isin(miami_zip)]
df_2018_miami=df_2018_miami.drop(["zip_code","year"],axis=1)

### Linear Regression

In [734]:
ols_miami = linear_model.LinearRegression()
ols_miami.fit(df_2013_to_2017_miami.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_miami.Yearly_Avg_Zip)

LinearRegression()

In [735]:
ols_miami.score(df_2018_miami.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_miami.Yearly_Avg_Zip)

0.7440131637259504

In [736]:
rmse_test_ols_miami=np.sqrt(
    mean_squared_error
    (df_2018_miami.Yearly_Avg_Zip, 
    ols_miami.predict(df_2018_miami.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_ols_miami)

122.1203309008359


### Lasso Regression

In [737]:
lasso_miami=Lasso()

In [738]:
lasso_miami.fit(df_2013_to_2017_miami.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_miami.Yearly_Avg_Zip)

  model = cd_fast.enet_coordinate_descent(


Lasso()

In [739]:
rmse_test_lasso_miami=np.sqrt(
    mean_squared_error
    (df_2018_miami.Yearly_Avg_Zip, 
    lasso_miami.predict(df_2018_miami.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_miami)

140.25559124705813


In [None]:
from sklearn.model_selection import GridSearchCV

lasso_params = {
    "alpha": list(np.logspace(-8,2,11))
    ,"max_iter": [10, 100, 1000, 10000]
    , "tol": list(np.logspace(-8,0,9))
    
}

lasso_miami_gs = GridSearchCV(lasso_miami, lasso_params, cv=kfold)
lasso_miami_gs.fit(df_2013_to_2017_miami.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_miami.Yearly_Avg_Zip)
print("The best parameters are: ", lasso_miami_gs.best_params_)
lasso_miami_gs.cv_results_['mean_test_score']

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [None]:
print("The best parameters are: ", lasso_miami_gs.best_params_)

In [None]:
lasso_miami_gs.score(df_2018_miami.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_miami.Yearly_Avg_Zip)

In [None]:
lasso_miami_tuned=Lasso(**lasso_miami_gs.best_params_)

In [None]:
lasso_miami_tuned.fit(df_2013_to_2017_miami.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_miami.Yearly_Avg_Zip)

In [None]:
rmse_test_lasso_miami=np.sqrt(
    mean_squared_error
    (df_2018_miami.Yearly_Avg_Zip, 
    lasso_miami_tuned.predict(df_2018_miami.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_miami)

In [None]:
df_2018_miami.Yearly_Avg_Zip.describe()

### Cross Validation on the train set (data in 2013-2017)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_2013_to_2017_miami.drop(columns = 'Yearly_Avg_Zip'), 
    df_2013_to_2017_miami.Yearly_Avg_Zip, 
    test_size=0.3, random_state=0)

In [None]:
lasso_miami_tuned_scores = cross_val_score(lasso_miami_tuned, 
                                           X_train, y_train, cv=kfold)
print(lasso_miami_tuned_scores)
np.mean(lasso_miami_tuned_scores)

In [None]:
lasso_miami_tuned.score(X_test, y_test)

### Random Forest

In [None]:
rf_miami = RandomForestRegressor(random_state=0)
rf_miami.fit(df_2013_to_2017_miami.drop(columns = 'Yearly_Avg_Zip'), 
             df_2013_to_2017_miami.Yearly_Avg_Zip)

In [None]:
feature_importances_miami = pd.Series(rf_miami.feature_importances_, 
                                index=df_2013_to_2017_miami.drop(columns = 'Yearly_Avg_Zip').columns)
feature_importances_miami=feature_importances_miami.sort_values(ascending=False)
feature_importances_miami.head(10)

## Orlando

In [None]:
df_2013_to_2017_orlando=df_2013_to_2017[
    df_2013_to_2017["zip_code"].isin(orlando_zip)]

df_2013_to_2017_orlando=df_2013_to_2017_orlando.drop(["zip_code","year"],axis=1)
#dropping unique identifiers

In [None]:
df_2018_orlando=df_2018[df_2018["zip_code"].isin(orlando_zip)]
df_2018_orlando=df_2018_orlando.drop(["zip_code","year"],axis=1)

## Linear Regression

In [None]:
ols_orlando = linear_model.LinearRegression()
ols_orlando.fit(df_2013_to_2017_orlando.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_orlando.Yearly_Avg_Zip)

In [None]:
ols_orlando.score(df_2018_orlando.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_orlando.Yearly_Avg_Zip)

In [None]:
rmse_test_ols_orlando=np.sqrt(
    mean_squared_error
    (df_2018_orlando.Yearly_Avg_Zip, 
    ols_orlando.predict(df_2018_orlando.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_ols_orlando)

## Lasso Regression

In [None]:
lasso_orlando=Lasso()

In [None]:
lasso_orlando.fit(df_2013_to_2017_orlando.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_orlando.Yearly_Avg_Zip)

In [None]:
rmse_test_lasso_orlando=np.sqrt(
    mean_squared_error
    (df_2018_orlando.Yearly_Avg_Zip, 
    lasso_orlando.predict(df_2018_orlando.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_orlando)

In [None]:
from sklearn.model_selection import GridSearchCV

lasso_params = {
    "alpha": list(np.logspace(-8,2,11))
    ,"max_iter": [10, 100, 1000, 10000]
    , "tol": list(np.logspace(-8,0,9))
    
}

lasso_orlando_gs = GridSearchCV(lasso_orlando, lasso_params, cv=kfold)
lasso_orlando_gs.fit(df_2013_to_2017_orlando.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_orlando.Yearly_Avg_Zip)
lasso_orlando_gs.cv_results_['mean_test_score']

In [None]:
print("The best parameters are: ", lasso_orlando_gs.best_params_)

In [None]:
lasso_orlando_gs.score(df_2018_orlando.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_orlando.Yearly_Avg_Zip)

In [None]:
lasso_orlando_tuned=Lasso(**lasso_orlando_gs.best_params_)

In [None]:
lasso_orlando_tuned.fit(df_2013_to_2017_orlando.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_orlando.Yearly_Avg_Zip)

In [None]:
rmse_test_lasso_orlando=np.sqrt(
    mean_squared_error
    (df_2018_orlando.Yearly_Avg_Zip, 
    lasso_orlando_tuned.predict(df_2018_orlando.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_orlando)

### Cross Validation on the train set (data in 2013-2017)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_2013_to_2017_orlando.drop(columns = 'Yearly_Avg_Zip'), 
    df_2013_to_2017_orlando.Yearly_Avg_Zip, 
    test_size=0.3, random_state=0)

In [None]:
lasso_orlando_tuned_scores = cross_val_score(lasso_orlando_tuned, 
                                           X_train, y_train, cv=kfold)
print(lasso_orlando_tuned_scores)
np.mean(lasso_orlando_tuned_scores)

In [None]:
lasso_orlando_tuned.score(X_test, y_test)

### Random Forest

In [None]:
rf_orlando = RandomForestRegressor(random_state=0)
rf_orlando.fit(df_2013_to_2017_orlando.drop(columns = 'Yearly_Avg_Zip'), 
             df_2013_to_2017_orlando.Yearly_Avg_Zip)

In [None]:
feature_importances_orlando = pd.Series(rf_orlando.feature_importances_, 
                                index=df_2013_to_2017_orlando.drop(columns = 'Yearly_Avg_Zip').columns)
feature_importances_orlando=feature_importances_orlando.sort_values(ascending=False)
feature_importances_orlando.head(10)

## Tampa

In [None]:
df_2013_to_2017_tampa=df_2013_to_2017[
    df_2013_to_2017["zip_code"].isin(tampa_zip)]

df_2013_to_2017_tampa=df_2013_to_2017_tampa.drop(["zip_code","year"],axis=1)
#dropping unique identifiers

In [None]:
df_2018_tampa=df_2018[df_2018["zip_code"].isin(tampa_zip)]
df_2018_tampa=df_2018_tampa.drop(["zip_code","year"],axis=1)

### Linear Regression

In [None]:
ols_tampa = linear_model.LinearRegression()
ols_tampa.fit(df_2013_to_2017_tampa.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_tampa.Yearly_Avg_Zip)

In [None]:
ols_tampa.score(df_2018_tampa.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_tampa.Yearly_Avg_Zip)

In [None]:
rmse_test_ols_tampa=np.sqrt(
    mean_squared_error
    (df_2018_tampa.Yearly_Avg_Zip, 
    ols_tampa.predict(df_2018_tampa.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_ols_tampa)

### Lasso Regression

In [None]:
lasso_tampa=Lasso()

In [None]:
lasso_tampa.fit(df_2013_to_2017_tampa.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_tampa.Yearly_Avg_Zip)

In [None]:
rmse_test_lasso_tampa=np.sqrt(
    mean_squared_error
    (df_2018_tampa.Yearly_Avg_Zip, 
    lasso_tampa.predict(df_2018_tampa.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_tampa)

In [None]:
from sklearn.model_selection import GridSearchCV

lasso_params = {
    "alpha": list(np.logspace(-8,2,11))
    ,"max_iter": [10, 100, 1000, 10000]
    , "tol": list(np.logspace(-8,0,9))
    
}

lasso_tampa_gs = GridSearchCV(lasso_tampa, lasso_params, cv=kfold)
lasso_tampa_gs.fit(df_2013_to_2017_tampa.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_tampa.Yearly_Avg_Zip)
print("The best parameters are: ", lasso_tampa_gs.best_params_)
lasso_tampa_gs.cv_results_['mean_test_score']

In [None]:
print("The best parameters are: ", lasso_tampa_gs.best_params_)

In [None]:
lasso_tampa_gs.score(df_2018_tampa.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_tampa.Yearly_Avg_Zip)

In [None]:
lasso_tampa_tuned=Lasso(**lasso_tampa_gs.best_params_)

In [None]:
lasso_tampa_tuned.fit(df_2013_to_2017_tampa.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_tampa.Yearly_Avg_Zip)

In [None]:
rmse_test_lasso_tampa=np.sqrt(
    mean_squared_error
    (df_2018_tampa.Yearly_Avg_Zip, 
    lasso_tampa_tuned.predict(df_2018_tampa.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_tampa)

### Cross Validation on the train set (data in 2013-2017)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_2013_to_2017_tampa.drop(columns = 'Yearly_Avg_Zip'), 
    df_2013_to_2017_tampa.Yearly_Avg_Zip, 
    test_size=0.3, random_state=0)

In [None]:
lasso_tampa_tuned_scores = cross_val_score(lasso_tampa_tuned, 
                                           X_train, y_train, cv=kfold)
print(lasso_tampa_tuned_scores)
np.mean(lasso_tampa_tuned_scores)

In [None]:
lasso_tampa_tuned.score(X_test, y_test)

### Random Forest

In [None]:
rf_tampa = RandomForestRegressor(random_state=0)
rf_tampa.fit(df_2013_to_2017_tampa.drop(columns = 'Yearly_Avg_Zip'), 
             df_2013_to_2017_tampa.Yearly_Avg_Zip)

In [None]:
feature_importances_tampa = pd.Series(rf_tampa.feature_importances_, 
                                index=df_2013_to_2017_tampa.drop(columns = 'Yearly_Avg_Zip').columns)
feature_importances_tampa=feature_importances_tampa.sort_values(ascending=False)
feature_importances_tampa.head(10)

## Jacksonville

In [None]:
df_2013_to_2017_jax=df_2013_to_2017[
    df_2013_to_2017["zip_code"].isin(jax_zip)]

df_2013_to_2017_jax=df_2013_to_2017_jax.drop(["zip_code","year"],axis=1)
#dropping unique identifiers

In [None]:
df_2018_jax=df_2018[df_2018["zip_code"].isin(jax_zip)]
df_2018_jax=df_2018_jax.drop(["zip_code","year"],axis=1)

### Linear Regression

In [None]:
ols_jax = linear_model.LinearRegression()
ols_jax.fit(df_2013_to_2017_jax.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_jax.Yearly_Avg_Zip)

In [None]:
ols_jax.score(df_2018_jax.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_jax.Yearly_Avg_Zip)

In [None]:
rmse_test_ols_jax=np.sqrt(
    mean_squared_error
    (df_2018_jax.Yearly_Avg_Zip, 
    ols_jax.predict(df_2018_jax.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_ols_jax)

### Lasso Regression

In [None]:
lasso_jax=Lasso()

In [None]:
lasso_jax.fit(df_2013_to_2017_jax.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_jax.Yearly_Avg_Zip)

In [None]:
rmse_test_lasso_jax=np.sqrt(
    mean_squared_error
    (df_2018_jax.Yearly_Avg_Zip, 
    lasso_jax.predict(df_2018_jax.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_jax)

In [None]:
from sklearn.model_selection import GridSearchCV

lasso_params = {
    "alpha": list(np.logspace(-8,2,11))
    ,"max_iter": [10, 100, 1000, 10000]
    , "tol": list(np.logspace(-8,0,9))
    
}

lasso_jax_gs = GridSearchCV(lasso_jax, lasso_params, cv=kfold)
lasso_jax_gs.fit(df_2013_to_2017_jax.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_jax.Yearly_Avg_Zip)
print("The best parameters are: ", lasso_jax_gs.best_params_)
lasso_jax_gs.cv_results_['mean_test_score']

In [None]:
print("The best parameters are: ", lasso_jax_gs.best_params_)

In [None]:
lasso_jax_gs.score(df_2018_jax.drop(columns = 'Yearly_Avg_Zip'), 
               df_2018_jax.Yearly_Avg_Zip)

In [None]:
lasso_jax_tuned=Lasso(**lasso_jax_gs.best_params_)

In [None]:
lasso_jax_tuned.fit(df_2013_to_2017_jax.drop(columns = 'Yearly_Avg_Zip'), 
              df_2013_to_2017_jax.Yearly_Avg_Zip)

In [None]:
rmse_test_lasso_jax=np.sqrt(
    mean_squared_error
    (df_2018_jax.Yearly_Avg_Zip, 
    lasso_jax_tuned.predict(df_2018_jax.drop(columns = 'Yearly_Avg_Zip'))
    )
)
print(rmse_test_lasso_jax)

### Cross Validation on the train set (data in 2013-2017)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_2013_to_2017_jax.drop(columns = 'Yearly_Avg_Zip'), 
    df_2013_to_2017_jax.Yearly_Avg_Zip, 
    test_size=0.3, random_state=0)

In [None]:
lasso_jax_tuned_scores = cross_val_score(lasso_jax_tuned, 
                                           X_train, y_train, cv=kfold)
print(lasso_jax_tuned_scores)
np.mean(lasso_jax_tuned_scores)

In [None]:
lasso_jax_tuned.score(X_test, y_test)

### Random Forest

In [None]:
rf_jax = RandomForestRegressor(random_state=0)
rf_jax.fit(df_2013_to_2017_jax.drop(columns = 'Yearly_Avg_Zip'), 
             df_2013_to_2017_jax.Yearly_Avg_Zip)

In [None]:
feature_importances_jax = pd.Series(rf_jax.feature_importances_, 
                                index=df_2013_to_2017_jax.drop(columns = 'Yearly_Avg_Zip').columns)
feature_importances_jax=feature_importances_jax.sort_values(ascending=False)
feature_importances_jax.head(10)