In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler, normalize


In [2]:
# Show the pandas version of the running kernel (the Styler calls further down are version-sensitive).
pd.__version__

'1.3.5'

In [3]:
%matplotlib inline
import plotly.express as px
import matplotlib.pyplot as plt

import dataframe_image as dfi

In [4]:
# Load the Florida ZIP-code GeoJSON into a dict; the choropleth cells pass it as `geojson`.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so parsing does not
# depend on the platform's default locale encoding.
with open('../../data/fl_florida_zip_codes_geo.min.json', 'r', encoding='utf-8') as geojson_file:
    fla_shapefiles = json.load(geojson_file)

In [9]:
# Read the combined dataset, then display the set of its column names.
total_df = pd.read_csv('../../data/total_df2.csv')
total_df_col_set = {*total_df.columns}
total_df_col_set

{'FLSTHPI_Yearly_Avg',
 'FL_Unemployment',
 'Monthly_Avg_Zip',
 'aggregate_travel_time_to_work',
 'armed_forces',
 'associates_degree',
 'bachelors_degree',
 'bachelors_degree_2',
 'bachelors_degree_or_higher_25_64',
 'children',
 'children_in_single_female_hh',
 'civilian_labor_force',
 'commute_10_14_mins',
 'commute_15_19_mins',
 'commute_20_24_mins',
 'commute_25_29_mins',
 'commute_30_34_mins',
 'commute_35_44_mins',
 'commute_45_59_mins',
 'commute_60_more_mins',
 'commute_less_10_mins',
 'commuters_16_over',
 'commuters_by_bus',
 'commuters_by_car_truck_van',
 'commuters_by_carpool',
 'commuters_by_public_transportation',
 'commuters_by_subway_or_elevated',
 'commuters_drove_alone',
 'different_house_year_ago_different_city',
 'different_house_year_ago_same_city',
 'dwellings_10_to_19_units',
 'dwellings_1_units_attached',
 'dwellings_1_units_detached',
 'dwellings_20_to_49_units',
 'dwellings_2_units',
 'dwellings_3_to_4_units',
 'dwellings_50_or_more_units',
 'dwellings_5_to_9

In [None]:
# Index the rows by ZIP code.
# The guard keeps this cell idempotent: after the first run 'zip_code' is the index,
# not a column, so an unguarded re-run would raise KeyError.
if total_df.index.name != 'zip_code':
    total_df = total_df.set_index('zip_code')
total_df.shape

In [None]:
# Columns of total_df that both clustering models are fit on.
cluster_subset_cols = [
    'million_dollar_housing_units',
    'unemployed_pop',
    'median_income',
    'poverty',
    'median_rent',
    'percent_income_spent_on_rent',
]

In [None]:
# Keep only the clustering features (all rows) and show the resulting dimensions.
subset_df = total_df.loc[:, cluster_subset_cols]
subset_df.shape

In [None]:
# Name of the folder under ../../images that receives every image saved by this notebook.
subdir = "subset4"

# Create the folder directly instead of %cd-ing into ../../images and back:
#  - the working directory is never changed, so the relative paths used elsewhere stay valid
#    regardless of which folder the notebook lives in;
#  - the folder name comes from `subdir` rather than a second hard-coded literal;
#  - exist_ok=True makes a re-run a silent no-op instead of a "File exists" error.
os.makedirs("../../images/" + subdir, exist_ok=True)

In [None]:
# ========== K-Means clustering ==========

In [None]:
# Fit a 3-cluster K-Means model on the feature subset as-is (no rescaling is applied here).
# random_state pins the centroid initialisation so the cluster labels -- and every table and
# figure derived from them -- are the same on each Restart & Run All.
# n_init is spelled out because its default differs between scikit-learn releases.
# NOTE(review): the agglomerative model later in this notebook is fit on normalized columns,
# whereas this one sees raw values -- confirm the difference is intentional.
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(subset_df)

In [None]:
# New frame: the feature subset plus each row's K-Means label in 'km_cluster'.
km_subset_df = subset_df.assign(km_cluster=km.labels_)
km_subset_df

In [None]:
# Attach the K-Means labels to the full dataset.
# km_subset_df is a row-for-row copy of a column subset of total_df, so the labels line up
# positionally. Assigning them directly avoids joining on six floating-point feature columns,
# which is fragile and would multiply rows if a (zip_code, features) combination repeated.
# The 'zip_code' index of total_df is kept.
total_merge_km = total_df.assign(km_cluster=km_subset_df['km_cluster'].to_numpy())

In [None]:
# Display the full dataset with the K-Means label column appended.
total_merge_km

In [None]:
# Mean of 'median_pct_increase' within each K-Means cluster, as a one-column table.
km_cluster_means = (
    total_merge_km
    .groupby('km_cluster')['median_pct_increase']
    .agg(['mean'])
)

# Wrap the table in a Styler: column header hidden, caption added.
# Styler.hide_columns() is the API of the pandas version this notebook reports (1.3.5).
total_merge_km_group_by_cluster = (
    km_cluster_means.style
    .hide_columns()
    .set_caption("Median YOY Rent Change By Cluster, KMeans")
)

total_merge_km_group_by_cluster

In [None]:
# Render the styled K-Means summary table to an image in the notebook's output folder.
image_name = "km_numbers.png"
dfi.export(total_merge_km_group_by_cluster, f"../../images/{subdir}/{image_name}")

In [None]:
# Turn the 'zip_code' index into a regular column so it can be addressed by name
# (the choropleth call uses locations='zip_code').
# Non-inplace and guarded: once the index has been reset, re-running this cell is a no-op
# instead of piling up spurious 'index' columns.
# The former rename {'index': 'zipcode'} was dropped: resetting a named index yields a
# column called 'zip_code', so the rename never matched anything.
if total_merge_km.index.name == 'zip_code':
    total_merge_km = total_merge_km.reset_index()

In [None]:
# Choropleth of K-Means cluster membership per Florida ZIP code.
# This cell had been commented out although the following cell calls fig.write_image(...);
# on a fresh kernel that raised NameError because `fig` was never created.
# NOTE(review): the GeoJSON feature ids ('properties.ZCTA5CE10') are presumably strings; if
# 'zip_code' was parsed as integers the shapes may not match -- confirm and cast if needed.
fig = px.choropleth_mapbox(total_merge_km, geojson=fla_shapefiles, locations='zip_code', color='km_cluster',
                           featureidkey='properties.ZCTA5CE10',
                           color_continuous_scale="Viridis",
                           mapbox_style="carto-positron",
                           zoom=7.2, center={"lat": 26.2, "lon": -81.0},
                           opacity=0.5,
                          )
# Remove the outer margins so the map fills the figure.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
# Save the current plotly figure `fig` as the K-Means map image.
# NOTE(review): relies on `fig` having been created by the choropleth cell directly above.
image_name = "km_map.png"
fig.write_image(f"../../images/{subdir}/{image_name}")

In [None]:

# One frame per K-Means label (0, 1, 2), each holding the rows assigned to that cluster.
cluster_0_km, cluster_1_km, cluster_2_km = [
    total_merge_km[total_merge_km['km_cluster'] == cluster_id]
    for cluster_id in (0, 1, 2)
]


In [None]:
# Scatter of median income against monthly average rent, one colour per K-Means cluster.
# The x-axis label and title previously read "Income per Capita", but the plotted column is
# 'median_income'; the text now matches the data.
image_name = "km_scatter.png"
scatter_fig, scatter_ax = plt.subplots()

# One scatter layer per cluster instead of three copy-pasted calls.
for members, colour, cluster_label in (
    (cluster_0_km, "blue", '0'),
    (cluster_1_km, "orange", '1'),
    (cluster_2_km, "green", '2'),
):
    scatter_ax.scatter(members['median_income'], members['Monthly_Avg_Zip'],
                       color=colour, label=cluster_label)

scatter_ax.set_xlabel('Median Income')
scatter_ax.set_ylabel('Monthly Average Rent')
scatter_ax.set_title('Median Income vs. Monthly Average Rent')
scatter_ax.legend()

# Save before showing, from the explicit figure handle, so the written file is this figure.
scatter_fig.savefig("../../images/" + subdir + "/" + image_name, bbox_inches='tight')
plt.show()

In [None]:
# ========== Agglomerative clustering ==========

In [None]:
# Rescale every feature column with sklearn's normalize (axis=0 works column-wise).
# normalize returns a bare array, so the frame is rebuilt with the original column names AND
# the original index; previously the index was dropped and rows could no longer be traced
# back to their ZIP codes.
norm_subset_df = pd.DataFrame(
    normalize(subset_df, axis=0),
    columns=subset_df.columns,
    index=subset_df.index,
)

In [None]:
# Display the normalized feature table that the agglomerative model is fit on.
norm_subset_df

In [None]:
# Fit a 3-cluster agglomerative model on the normalized features.
ag = AgglomerativeClustering(n_clusters=3)
ag.fit(norm_subset_df)

# New frame: the original (un-normalized) feature subset plus each row's label in 'cluster'.
ag_subset_df = subset_df.assign(cluster=ag.labels_)
ag_subset_df

In [None]:
# Attach the agglomerative labels to the full dataset.
# ag_subset_df is a row-for-row copy of a column subset of total_df, so the labels line up
# positionally. Assigning them directly avoids joining on six floating-point feature columns,
# which is fragile and would multiply rows if a (zip_code, features) combination repeated.
# The 'zip_code' index of total_df is kept.
total_merge_ag = total_df.assign(cluster=ag_subset_df['cluster'].to_numpy())
total_merge_ag

In [None]:
# Mean of 'median_pct_increase' within each agglomerative cluster, as a one-column table.
ag_cluster_means = (
    total_merge_ag
    .groupby('cluster')['median_pct_increase']
    .agg(['mean'])
)

# Wrap the table in a Styler: column header hidden, caption added.
# Styler.hide_columns() is the API of the pandas version this notebook reports (1.3.5).
total_merge_ag_group_by_cluster = (
    ag_cluster_means.style
    .hide_columns()
    .set_caption("Median YOY Rent Change By Cluster, Agglomerative")
)

total_merge_ag_group_by_cluster

In [None]:
# Render the styled agglomerative summary table to an image in the notebook's output folder.
# The file name now carries the ".png" extension, matching every other export in this
# notebook (it was previously written as the extension-less "ag_numbers").
image_name = "ag_numbers.png"
dfi.export(total_merge_ag_group_by_cluster, "../../images/" + subdir + "/" + image_name)

In [None]:
# Turn the 'zip_code' index into a regular column so it can be addressed by name
# (the choropleth call uses locations='zip_code').
# Non-inplace and guarded: once the index has been reset, re-running this cell is a no-op
# instead of piling up spurious 'index' columns.
# The former rename {'index': 'zipcode'} was dropped: resetting a named index yields a
# column called 'zip_code', so the rename never matched anything.
if total_merge_ag.index.name == 'zip_code':
    total_merge_ag = total_merge_ag.reset_index()

In [None]:
# Choropleth of agglomerative cluster membership per Florida ZIP code.
# This cell had been commented out although the following cell calls fig.write_image(...);
# `fig` was therefore either undefined (NameError) or still an earlier figure, which would
# have been saved under the agglomerative file name.
# NOTE(review): the GeoJSON feature ids ('properties.ZCTA5CE10') are presumably strings; if
# 'zip_code' was parsed as integers the shapes may not match -- confirm and cast if needed.
fig = px.choropleth_mapbox(total_merge_ag, geojson=fla_shapefiles, locations='zip_code', color='cluster',
                           featureidkey='properties.ZCTA5CE10',
                           color_continuous_scale="Viridis",
                           mapbox_style="carto-positron",
                           zoom=7.2, center={"lat": 26.2, "lon": -81.0},
                           opacity=0.5,
                          )
# Remove the outer margins so the map fills the figure.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
# Save the current plotly figure `fig` as the agglomerative map image.
# NOTE(review): relies on `fig` having been created by the choropleth cell directly above.
image_name = "ag_map.png"
fig.write_image(f"../../images/{subdir}/{image_name}")

In [None]:
# One frame per agglomerative label (0, 1, 2), each holding the rows assigned to that cluster.
cluster_0_ag, cluster_1_ag, cluster_2_ag = [
    total_merge_ag[total_merge_ag['cluster'] == cluster_id]
    for cluster_id in (0, 1, 2)
]

In [None]:
# Scatter of median income against monthly average rent, one colour per agglomerative cluster.
# The x-axis label and title previously read "Income per Capita", but the plotted column is
# 'median_income'; the text now matches the data.
image_name = "ag_scatter.png"
scatter_fig, scatter_ax = plt.subplots()

# One scatter layer per cluster instead of three copy-pasted calls.
for members, colour, cluster_label in (
    (cluster_0_ag, 'blue', '0'),
    (cluster_1_ag, 'orange', '1'),
    (cluster_2_ag, 'green', '2'),
):
    scatter_ax.scatter(members['median_income'], members['Monthly_Avg_Zip'],
                       color=colour, label=cluster_label)

scatter_ax.set_xlabel('Median Income')
scatter_ax.set_ylabel('Monthly Average Rent')
scatter_ax.set_title('Median Income vs. Monthly Average Rent')
scatter_ax.legend()

# Save before showing, from the explicit figure handle, so the written file is this figure.
scatter_fig.savefig("../../images/" + subdir + "/" + image_name, bbox_inches='tight')
plt.show()