In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

from scipy.spatial.distance import pdist, squareform

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AffinityPropagation

from tqdm.notebook import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
data_url = "https://docs.google.com/spreadsheets/d/198EG3tckqzD1uOKSYxAY62i5v_0LIZQMgzaIae6u1vo/export?format=csv"

# Hand-picked features that the rest of the notebook works with
keep_cols = [
    # Cost
    "taxamount",
    # Size
    "garagetotalsqft",
    "lotsizesquarefeet",
    "numberofstories",
    #     "poolcnt",  # dropped because 0 variance
    "roomcnt",
]

# Read the CSV export straight from the spreadsheet URL
zillow = pd.read_csv(data_url)

# Names of the columns in which more than 20% of the values are missing
drop = zillow.columns[zillow.isna().mean().gt(0.2)]

# Remove those columns, then every row that still contains a missing value
# (in any remaining column), and finally keep only the selected features
zillow = zillow.drop(columns=drop).dropna()[keep_cols]
# zillow = zillow.sample(1500, random_state=1337)

<IPython.core.display.Javascript object>

In [4]:
# Fit the scaler on the selected features, then transform those same rows;
# `scaled` is the standardized array used for the distance computation
scaler = StandardScaler().fit(zillow)
scaled = scaler.transform(zillow)

<IPython.core.display.Javascript object>

In [5]:
# Square matrix of pairwise Euclidean distances between the scaled rows
pairwise_dist = squareform(pdist(scaled, metric="euclidean"))

# Similarity = negated distance (closer rows get a larger, less negative value)
sim_mat = np.negative(pairwise_dist)

# Display a value one below the smallest similarity
# NOTE(review): presumably a candidate `preference` value; confirm where it is meant to be used
sim_mat.min() - 1

-43.07119774057715

<IPython.core.display.Javascript object>

In [6]:
# The similarity matrix is supplied directly, hence affinity="precomputed"
ap_params = {"affinity": "precomputed", "damping": 0.99, "verbose": 1}
clst = AffinityPropagation(**ap_params)

# Fit on the negated-distance matrix; the fitted estimator is the cell output
clst.fit(sim_mat)

Converged after 15 iterations.


AffinityPropagation(affinity='precomputed', convergence_iter=15, copy=True,
                    damping=0.99, max_iter=200, preference=None, verbose=1)

<IPython.core.display.Javascript object>

In [7]:
# Turn the model's integer labels into readable names ("Cluster <n>"),
# indexed like `zillow` so they line up with its rows
label_names = "Cluster " + pd.Series(clst.labels_, index=zillow.index).astype(str)

# Attach the names to a copy of the unscaled features
cluster_df = zillow.assign(label=label_names)
# sns.pairplot(cluster_df, hue="label")
# plt.show()

<IPython.core.display.Javascript object>

In [8]:
# Number of rows per cluster label, as a two-column frame (label, count)
label_counts = cluster_df["label"].value_counts()
cluster_sizes = pd.DataFrame(
    {"label": label_counts.index, "count": label_counts.values}
)

<IPython.core.display.Javascript object>

In [9]:
# Positional row indices of the exemplars chosen by the fitted model
exemplar_idxs = clst.cluster_centers_indices_

# Select the exemplar rows by position and add each cluster's size
exemplars = cluster_df.iloc[exemplar_idxs].merge(cluster_sizes, on="label")

# Show the exemplars ordered by tax amount, shaded with a background gradient
exemplars.sort_values(by="taxamount").style.background_gradient()

Unnamed: 0,taxamount,garagetotalsqft,lotsizesquarefeet,numberofstories,roomcnt,label,count
21,6081.68,1835,162928.0,1,6,Cluster 21,52
36,6704.42,562,12403.0,2,8,Cluster 36,7994
29,12028.5,730,28448.0,2,10,Cluster 29,2704
25,12817.8,768,1743040.0,1,9,Cluster 25,1
26,13076.5,1097,1756660.0,2,10,Cluster 26,1
15,13301.0,1284,1125150.0,2,11,Cluster 15,1
22,13521.7,2183,91911.0,2,10,Cluster 22,30
31,13684.9,1735,925214.0,2,10,Cluster 31,4
13,14061.7,1079,950915.0,2,11,Cluster 13,6
32,14649.9,705,531867.0,2,9,Cluster 32,14


<IPython.core.display.Javascript object>

In [10]:
# Per-cluster mean of every feature
rows_by_cluster = cluster_df.groupby("label")
mean_cluster_df = rows_by_cluster.mean()

# Show the cluster means ordered by tax amount, shaded with a background gradient
mean_cluster_df.sort_values(by="taxamount").style.background_gradient()

Unnamed: 0_level_0,taxamount,garagetotalsqft,lotsizesquarefeet,numberofstories,roomcnt
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cluster 36,5799.89,531.932,13435.6,1.61659,7.54228
Cluster 21,9616.43,1435.42,98888.8,1.21154,6.69231
Cluster 8,12330.0,743.888,49622.7,1.02281,9.66316
Cluster 32,12636.2,769.286,459320.0,1.57143,8.57143
Cluster 31,12795.9,1680.0,889713.0,1.75,9.25
Cluster 25,12817.8,768.0,1743040.0,1.0,9.0
Cluster 29,12940.9,741.172,29465.3,1.99778,9.95636
Cluster 26,13076.5,1097.0,1756660.0,2.0,10.0
Cluster 15,13301.0,1284.0,1125150.0,2.0,11.0
Cluster 20,13773.3,668.388,53410.8,1.03571,7.35119


<IPython.core.display.Javascript object>

In [11]:
# Rows of the scaled array that correspond to the exemplars
exemplar_rows = scaled[exemplar_idxs]
scaled_exemplars = pd.DataFrame(exemplar_rows, columns=zillow.columns)

# Spread of the exemplars along each scaled feature
scaled_exemplars.std()

taxamount            6.906484
garagetotalsqft      5.916120
lotsizesquarefeet    7.957650
numberofstories      0.944576
roomcnt              1.972318
dtype: float64

<IPython.core.display.Javascript object>

In [12]:
# All rows labelled "Cluster 4"
cluster_df.loc[cluster_df["label"].eq("Cluster 4")]

Unnamed: 0,taxamount,garagetotalsqft,lotsizesquarefeet,numberofstories,roomcnt,label
40,29112.5,913.0,435600.0,2.0,11,Cluster 4
99,18528.66,1182.0,509788.0,2.0,13,Cluster 4
171,45064.24,840.0,512266.0,2.0,11,Cluster 4
297,20021.88,687.0,362419.0,1.0,12,Cluster 4
441,11305.14,586.0,292723.0,2.0,9,Cluster 4
513,26585.76,660.0,255822.0,2.0,12,Cluster 4
588,7814.86,614.0,305791.0,2.0,11,Cluster 4
635,23980.8,1027.0,275299.0,2.0,13,Cluster 4
832,19308.26,1073.0,435600.0,2.0,11,Cluster 4
839,17107.26,945.0,309276.0,2.0,9,Cluster 4


<IPython.core.display.Javascript object>

In [13]:
# All rows labelled "Cluster 0"
cluster_df.loc[cluster_df["label"].eq("Cluster 0")]

Unnamed: 0,taxamount,garagetotalsqft,lotsizesquarefeet,numberofstories,roomcnt,label
0,283062.46,4000.0,709157.0,2.0,21,Cluster 0


<IPython.core.display.Javascript object>