In [1]:
import geopandas as gpd
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import altair as alt
# Switch Altair to its 'json' data transformer for every chart built below
# (presumably so chart data is stored outside the chart spec rather than
# embedded inline -- TODO confirm this is still needed).
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# Load the point layer; the feature columns listed in `cols` and the point
# geometry used further down are read from this GeoDataFrame.
pointsPath = 'pointsWithSeg.geojson'
seg = gpd.read_file(pointsPath)

In [3]:
# Feature columns used for clustering (one column per segmentation category).
cols = ['wall', 'lives', 'building', 'infrastructure', 'road', 'sidewalk', 'sky', 'green', 'transportation', 'publicservice']

# Work on a copy so the feature matrix is decoupled from `seg`.
clusteringData = seg[cols].copy()

# Standardise every feature (zero mean, unit variance) so that no single
# column dominates the Euclidean distances k-means relies on.
scaler = StandardScaler()
scaledClusteringData = scaler.fit_transform(clusteringData)

# Initialize the KMeans object.
# `n_init` is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, and a different number of restarts can produce different
# cluster ids. The id -> name mapping defined later in this notebook is
# hard-coded against these ids, so the fit must not drift between versions.
# NOTE(review): 10 assumes the notebook was originally run on scikit-learn < 1.4.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

In [4]:
# Fit k-means on the standardised features and store each point's integer
# cluster id on `seg` (`fit` returns the estimator itself, so `labels_` can be
# read straight off the call).
seg['label'] = kmeans.fit(scaledClusteringData).labels_

In [5]:
# Display name and hex colour for each integer k-means cluster id.
# Kept in one table so a cluster's name and colour cannot drift apart.
clusterStyles = {
    1: ('High-density', '#E55756'),
    2: ('Lush', '#55A24A'),
    3: ("Spacious", "#72B7B3"),
    0: ("Townhouse", "#F58518"),
}
names = {clusterId: style[0] for clusterId, style in clusterStyles.items()}
colors = {clusterId: style[1] for clusterId, style in clusterStyles.items()}

In [6]:
# Cluster display name -> hex colour. The extra 'nan' entry is the grey used
# for rows whose label is filled with the string "nan" before export.
color_map = dict([
    ('Townhouse', '#F58518'),
    ('High-density', '#E55756'),
    ('Lush', '#55A24A'),
    ('Spacious', '#72B7B3'),
    ('nan', "#cccccc"),
])

In [7]:
# Translate the integer cluster ids into a hex colour and a display name.
# Both columns are derived from the fitted model's `labels_` instead of from
# seg['label']: this cell overwrites seg['label'] with names, so reading it
# back would raise KeyError the second time the cell is run. Reading from the
# model keeps the cell safe to re-run.
# Relies on `seg` still having the same row order it had when the features
# were extracted for fitting (nothing in between reorders or filters it).
clusterIds = kmeans.labels_.tolist()
seg['color'] = [colors[clusterId] for clusterId in clusterIds]
seg['label'] = [names[clusterId] for clusterId in clusterIds]

In [8]:
# Mean of every feature column per cluster, reshaped to long form
# (one row per cluster/category pair) for plotting.
clusterMeans = seg.groupby('label')[cols].mean().reset_index()
# The explicit pd.DataFrame(...) wrap is kept from the original
# (presumably to guarantee a plain DataFrame rather than a GeoDataFrame
# subclass reaches Altair -- confirm before removing).
segSummary = pd.DataFrame(
    clusterMeans.melt(id_vars=['label'], var_name='category', value_name='pct')
)
segSummary.head()

Unnamed: 0,label,category,pct
0,High-density,wall,0.028116
1,Lush,wall,0.019327
2,Spacious,wall,0.052916
3,Townhouse,wall,0.026113
4,High-density,lives,0.007247


In [11]:
# Horizontal stacked bar per cluster: each bar is normalised to 100 % and split
# by the mean share of every segmentation category in that cluster.
bar = alt.Chart(segSummary).mark_bar().encode(
    y=alt.Y('label:N', title="Clustering Label"),
    color=alt.Color(
        'category:N',
        scale=alt.Scale(scheme="tableau20")),
    x=alt.X('pct:Q', stack="normalize", title="Category Percentage in Street View"),
    tooltip=[
        alt.Tooltip('label:N', title='Clustering Label'),
        # Title spelling fixed (was "Catagory").
        alt.Tooltip('category:N', title='Category'),
        # `pct` holds fractions (the summary table shows values such as 0.028),
        # so format it as a percentage; the previous ',.2f' rounded most shares
        # to 0.01-0.05 and hid nearly all of the signal.
        alt.Tooltip('pct:Q', title='Percentage', format='.1%'),
    ]
).properties(
    width=800,
    height=150
)

bar

alt.Chart.to_dict

bar.to_dict()

In [12]:
# Map of every point, filled with the hex colour stored in its `color` column.
# `scale=None` tells Altair to use the column values as literal colours
# instead of mapping them through a colour scale.
pointColor = alt.Color('color:N', scale=None)
pointMap = (
    alt.Chart(seg)
    .mark_geoshape()
    .encode(color=pointColor)
    .properties(width=800, height=800)
)
pointMap

In [13]:
# Load the block-group polygons; only their `NAME` and `geometry` columns are
# used by the spatial join below.
blockGroupPath = "censusRaceWithGeometry.geojson"
blockGroup = gpd.read_file(blockGroupPath)

In [14]:
# Spatially join every labelled point to the block group it falls in.
# how="left" keeps points that match no block group (their NAME is missing).
labelledPoints = seg[["label", "geometry"]]
blockGroupShapes = blockGroup[['NAME', 'geometry']]
bg_cluster = gpd.sjoin(labelledPoints, blockGroupShapes, how="left")

In [15]:
# Pick the dominant cluster label of every block group: count the joined
# points per (block group, label) pair and keep the most frequent label.
# Rows with a missing NAME (points outside every block group) are dropped by
# groupby's default handling of missing keys.
labelCounts = (
    bg_cluster.groupby(["NAME", "label"])
    .size()
    .reset_index(name="n_points")
)

# `groupby("NAME").nth(0)` is deliberately avoided: since pandas 2.0 it acts as
# a filter that keeps the original row index, so selecting [["label"]] would
# drop NAME and the later merge on 'NAME' would raise KeyError.
# drop_duplicates + set_index yields the same "first row per NAME" result, with
# NAME as the index, on every pandas version.
# The stable sort makes ties deterministic: labelCounts is ordered by
# (NAME, label), so the alphabetically first label wins a tie.
# NOTE: this cell rebinds `bg_cluster`; re-run the spatial-join cell before
# re-running this one.
bg_cluster = (
    labelCounts.sort_values("n_points", ascending=False, kind="stable")
    .drop_duplicates(subset="NAME", keep="first")
    .set_index("NAME")[["label"]]
)

In [16]:
# Attach the dominant cluster label to every block group polygon and keep only
# the columns needed downstream. The left merge keeps block groups without any
# joined point; their label is missing.
keptColumns = ["NAME", "geometry", 'label']
blockGroupLabel = blockGroup.merge(bg_cluster, how="left", on='NAME')[keptColumns]

In [17]:
# Choropleth of the block groups, coloured by their dominant cluster label.
blockGroupTooltip = [
    alt.Tooltip('label:N', title='Clustering Label'),
    alt.Tooltip('NAME:N', title='Block Group Name'),
]
blockGroupMap = (
    alt.Chart(blockGroupLabel)
    .mark_geoshape(fillOpacity=1, stroke='white')
    .encode(color=alt.Color('label:N'), tooltip=blockGroupTooltip)
    .properties(width=800, height=800)
)
blockGroupMap

In [18]:
# Build a string id per point by concatenating SEG_ID and pointId.
# NOTE(review): there is no separator between the two parts, so different
# (SEG_ID, pointId) pairs could produce the same id -- confirm the ids are
# unique before the web app relies on them. Format left unchanged here because
# the exported file is consumed elsewhere.
segmentPart = seg['SEG_ID'].astype("str")
pointPart = seg['pointId'].astype("str")
seg["id"] = segmentPart + pointPart

# Export only the columns the web map needs.
pointExportColumns = ["id", 'geometry', 'label', 'color']
seg[pointExportColumns].to_file("../web/data/p3-clustering-point.geojson", driver="GeoJSON")

In [461]:
# Block groups without a label get the placeholder string "nan", which
# `color_map` maps to grey; every label is then translated to its hex colour.
# An unknown label raises KeyError rather than being exported without a colour.
filledLabels = blockGroupLabel["label"].fillna("nan")
blockGroupLabel["label"] = filledLabels
blockGroupLabel["color"] = [color_map[labelName] for labelName in blockGroupLabel["label"]]

# Write the labelled polygons for the web map.
blockGroupLabel.to_file("../web/data/p3-clustering-polygon.geojson", driver="GeoJSON")