In [68]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform

from PlottingFunctions import plot_inertia, plot_silhouette

In [85]:
# Load the combined dataset from CSV and keep its column names as a set.
total_df = pd.read_csv('../data/total_df.csv')
total_df_col_set = set(total_df.columns)


In [70]:
# Distinct zip codes present in total_df; iterated over when computing per-zip statistics.
zips = total_df['zip_code'].unique()

In [71]:
# Sanity check: count how many Monthly_Avg_Zip entries are exactly zero vs. non-zero.
(total_df['Monthly_Avg_Zip'] == 0).value_counts()

False    645
Name: Monthly_Avg_Zip, dtype: int64

In [72]:

# First and last year of the window over which year-over-year changes are measured.
year_begin, year_end = 2013, 2018


def pct_change(a, b):
    """Return the fractional change from ``a`` to ``b``, i.e. ``(b - a) / a``."""
    return (b - a) / a


# Maps zip_code -> median of that zip's year-over-year fractional changes in the
# yearly sum of Monthly_Avg_Zip. Zips with no usable pair of years get no entry.
pct_change_dict = {}

for z in zips:
    a_df = total_df[total_df['zip_code'] == z]

    # Sum Monthly_Avg_Zip once per year for this zip (a year with no rows sums to 0).
    yearly_sum = {
        y: a_df[a_df['year'] == y]['Monthly_Avg_Zip'].sum()
        for y in range(year_begin, year_end + 1)
    }

    # Keep only consecutive-year pairs where both totals are positive, so missing
    # years (sum of 0) never produce a division by zero or a bogus change.
    pct_change_array = [
        pct_change(yearly_sum[y], yearly_sum[y + 1])
        for y in range(year_begin, year_end)
        if yearly_sum[y] > 0 and yearly_sum[y + 1] > 0
    ]

    # Compute the median once per zip, after all of its yearly changes are collected.
    if pct_change_array:
        pct_change_dict[z] = np.median(pct_change_array)



In [89]:
# Turn the {zip_code: median change} mapping into a two-column frame.
# from_dict(orient='index') puts the zip codes in the index and the values in a
# column labelled 0; reset_index() moves the zip codes into a column named
# 'index', and rename() gives both columns their final names.
# The whole chain starts from pct_change_dict, so the cell runs on a fresh
# kernel without relying on any other intermediate variable.
mpi_df = (
    pd.DataFrame.from_dict(pct_change_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'zip_code', 0: 'median_pct_increase'})
)
mpi_df

Unnamed: 0,zip_code,median_pct_increase
0,33160,0.022971
1,33025,0.021003
2,33139,0.020404
3,32256,0.027058
4,33009,0.028284
...,...,...
118,32765,0.051919
119,33578,0.022998
120,34747,0.061024
121,33605,0.275356


In [92]:
# Attach each zip code's median year-over-year change to every row of total_df.
# The left join keeps all rows of total_df; zips without an entry in mpi_df get NaN.
# A 'median_pct_increase' column left over from an earlier run of this cell is
# dropped first, so re-running the cell does not produce suffixed
# 'median_pct_increase_x' / 'median_pct_increase_y' columns.
total_df = pd.merge(
    total_df.drop(columns='median_pct_increase', errors='ignore'),
    mpi_df,
    how='left',
    on='zip_code',
)
total_df

Unnamed: 0,zip_code,Monthly_Avg_Zip,FL_Unemployment,year,Encoded_Zip,total_pop,households,male_pop,female_pop,median_age,...,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,FLSTHPI_Yearly_Avg,median_pct_increase
0,33160,1866.833333,7.533333,2013,73,37674,19247,18472,19202,49.6,...,2773.0,7911.0,4713.0,935,1346,965,6471,2019,291.6550,0.022971
1,33025,1341.083333,7.533333,2013,41,57766,21206,27852,29914,32.9,...,5448.0,10806.0,8948.0,2963,3682,3345,18163,4987,291.6550,0.021003
2,33139,1842.666667,7.533333,2013,61,38066,20883,22090,15976,38.1,...,7164.0,9995.0,4865.0,685,649,379,5387,1906,291.6550,0.020404
3,32256,947.750000,7.533333,2013,10,40024,18039,18865,21159,32.7,...,2597.0,11070.0,6247.0,1638,1598,1629,10962,3429,291.6550,0.027058
4,33009,1443.416667,7.533333,2013,36,39889,19125,19256,20633,47.1,...,3977.0,5087.0,4958.0,1335,1020,1249,6495,1885,291.6550,0.028284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,32828,1412.714286,3.625000,2018,31,65665,21327,30928,34737,34.0,...,5583.0,15472.0,8267.0,3153,3359,4530,21487,5829,444.3875,
641,33172,1747.571429,3.625000,2018,75,43083,13687,21261,21822,41.4,...,4212.0,6942.0,7067.0,1744,1907,1736,9732,2452,444.3875,-0.041376
642,32827,1472.428571,3.625000,2018,30,9421,3498,4524,4897,37.1,...,628.0,2362.0,871.0,652,702,559,3109,720,444.3875,
643,33611,1349.666667,3.625000,2018,118,31791,14838,15806,15985,41.5,...,2479.0,8571.0,4273.0,1207,1227,1120,6596,1476,444.3875,


In [94]:
# total_df['median_pct_increase']

In [2]:
# Restore kmeans_df from IPython's %store persistence (it is not built in this notebook's visible cells).
%store -r kmeans_df
# Display the restored object.
kmeans_df

Unnamed: 0,zip_code,gini_index,median_age,cluster_label
353,33160,0.5958,49.0,4
354,33025,0.3857,35.6,3
355,33139,0.6167,41.0,1
356,33024,0.4448,36.8,3
358,33009,0.5042,45.5,0
...,...,...,...,...
467,33132,0.5005,36.8,3
469,33323,0.4037,38.1,1
471,33305,0.5365,50.7,4
472,33413,0.4191,34.7,3


In [16]:
# Restore jordan_kmeans_df from IPython's %store persistence.
%store -r jordan_kmeans_df
# NOTE(review): this bare expression is not the cell's last line, so nothing is displayed for it.
jordan_kmeans_df
# Column names of the restored object, kept as a set.
col_set = set(jordan_kmeans_df.columns)

In [5]:
# Restore mini_df from IPython's %store persistence.
%store -r mini_df
# Display the restored object.
mini_df

Unnamed: 0_level_0,million_dollar_housing_units,unemployed_pop,median_income,poverty,median_rent,percent_income_spent_on_rent
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
33160,821,1260.0,45627.0,5120.0,1258.0,34.0
33025,0,3667.0,51822.0,6940.0,1209.0,36.7
33139,829,1197.0,46612.0,6672.0,1036.0,31.8
32256,74,1943.0,54160.0,3937.0,862.0,28.7
33009,96,2823.0,33157.0,8460.0,942.0,39.3
...,...,...,...,...,...,...
32828,0,2046.0,70619.0,8296.0,1305.0,35.9
33172,0,989.0,46983.0,6123.0,1347.0,37.1
32827,59,199.0,76304.0,442.0,1508.0,27.0
33611,285,641.0,64523.0,2790.0,982.0,26.5


In [9]:
# Scale every feature column of mini_df to unit norm (normalize with axis=0 works
# column-wise; sklearn's default norm is L2).
# The 'cluster' column that a later cell writes into mini_df is excluded, so
# re-running this cell never feeds cluster labels back in as a feature.
# The result gets a fresh 0..n-1 index; mini_df's own index is not carried over.
mini_features = mini_df.drop(columns='cluster', errors='ignore')
norm_mini_df = pd.DataFrame(normalize(mini_features, axis=0), columns=mini_features.columns)

In [10]:
# Display the column-normalized feature frame.
norm_mini_df

Unnamed: 0,million_dollar_housing_units,unemployed_pop,median_income,poverty,median_rent,percent_income_spent_on_rent
0,0.140811,0.024504,0.034663,0.025473,0.047431,0.037577
1,0.000000,0.071315,0.039369,0.034527,0.045584,0.040561
2,0.142183,0.023279,0.035411,0.033194,0.039061,0.035146
3,0.012692,0.037787,0.041145,0.019587,0.032501,0.031720
4,0.016465,0.054901,0.025189,0.042090,0.035517,0.043435
...,...,...,...,...,...,...
640,0.000000,0.039790,0.053649,0.041274,0.049203,0.039677
641,0.000000,0.019234,0.035693,0.030463,0.050787,0.041003
642,0.010119,0.003870,0.057968,0.002199,0.056857,0.029841
643,0.048881,0.012466,0.049018,0.013881,0.037025,0.029288


In [13]:
# Agglomerative clustering of the normalized features into three clusters;
# the resulting label of each row is stored on mini_df as the 'cluster' column.
ag = AgglomerativeClustering(n_clusters=3)
mini_df['cluster'] = ag.fit(norm_mini_df).labels_
mini_df

Unnamed: 0_level_0,million_dollar_housing_units,unemployed_pop,median_income,poverty,median_rent,percent_income_spent_on_rent,cluster
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
33160,821,1260.0,45627.0,5120.0,1258.0,34.0,1
33025,0,3667.0,51822.0,6940.0,1209.0,36.7,2
33139,829,1197.0,46612.0,6672.0,1036.0,31.8,1
32256,74,1943.0,54160.0,3937.0,862.0,28.7,0
33009,96,2823.0,33157.0,8460.0,942.0,39.3,2
...,...,...,...,...,...,...,...
32828,0,2046.0,70619.0,8296.0,1305.0,35.9,0
33172,0,989.0,46983.0,6123.0,1347.0,37.1,0
32827,59,199.0,76304.0,442.0,1508.0,27.0,0
33611,285,641.0,64523.0,2790.0,982.0,26.5,0
