## Cluster Model

#### Features chosen

**Numerical:**

- GrLivArea
- FirstFlrSF
- YearBuilt
- YearRemodAdd
- GarageYrBuilt

**Categorical:**
 
- Utilities=AllPub
- Street=Pave
- Condition2=Norm (Proximity to various conditions)
- RoofMatl=CompShg
- Heating=GasA


In [None]:
# Install tqdm via a shell escape. It is not imported directly in the visible
# cells (presumably a dependency of the project helpers — confirm before removing).
!pip install tqdm

In [None]:
cd ..

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import lib.visuals as vs
from sklearn.decomposition import PCA

In [None]:
run src/load_data.py

In [None]:
# Pin NumPy's global random state so repeated runs draw the same random numbers.
np.random.seed(seed=42)

In [None]:
# Work on a copy: plain assignment would alias `numerical_features`, so the
# indicator columns added below would silently be written into it as well.
top_10_features_df = numerical_features.copy()

one_hot_encoded_features = pd.get_dummies(categorical_features)

# One indicator column per selected categorical feature (see the feature list
# at the top of the notebook). The names carry no stray whitespace, so every
# column can be addressed by its plain name afterwards.
selected_indicator_columns = [
    "Utilities_AllPub",
    "Street_Pave",
    "Condition2_Norm",
    "RoofMatl_CompShg",
    "Heating_GasA",
]
for indicator_name in selected_indicator_columns:
    top_10_features_df[indicator_name] = one_hot_encoded_features[indicator_name]


In [None]:
# Preview the first rows of the assembled feature table.
top_10_features_df.head()

In [None]:
def apply_scale(dataframe, scaling_function):
    """Return a copy of *dataframe* with its float/int columns rescaled.

    Only columns selected by ``select_dtypes(include=[float, int])`` are
    passed to *scaling_function*; every other column is carried over
    unchanged. The selected column index is printed before scaling.

    Args:
        dataframe: Table to scale. It is not modified.
        scaling_function: Callable that takes the selected sub-frame and
            returns a frame whose columns are written into the copy.

    Returns:
        A new DataFrame holding the scaled columns alongside the untouched ones.
    """
    selected = dataframe.select_dtypes(include=[float, int])
    print(selected.columns)
    rescaled = scaling_function(selected)

    result = dataframe.copy()
    result[rescaled.columns] = rescaled
    return result

def gelman_scale(dataframe):
    """Centre each column on its mean and divide by two standard deviations."""
    centred = dataframe - dataframe.mean()
    two_std = 2 * dataframe.std()
    return centred / two_std

def standard_scale(dataframe):
    """Centre each column on its mean and divide by one standard deviation."""
    centred = dataframe - dataframe.mean()
    one_std = dataframe.std()
    return centred / one_std

In [None]:
# Build both scaled variants of the feature table (Gelman first, then standard)
# and preview the Gelman-scaled one.
df_gelman, df_standard = (
    apply_scale(top_10_features_df, scaler)
    for scaler in (gelman_scale, standard_scale)
)
df_gelman.head()

In [None]:
# Compare the per-feature distributions under both scalings:
# top row = Gelman scaling, bottom row = standard scaling.
# The grid is sized from the data (two rows, one column per feature) instead of
# a fixed 3x10 layout, so no empty row is reserved and the subplot indices stay
# valid if the number of features changes.
feature_columns = list(top_10_features_df.columns)
n_features = len(feature_columns)

fig = plt.figure(figsize=(20, 9))
for i, col in enumerate(feature_columns):
    fig.add_subplot(2, n_features, 1 + i)
    # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
    # here because the installed seaborn version is not known.
    sns.distplot(df_gelman[col])
    plt.xlim(-3, 3)  # shared x-range so the two scalings are visually comparable

    fig.add_subplot(2, n_features, n_features + 1 + i)
    sns.distplot(df_standard[col])
    plt.xlim(-3, 3)

#### The plots above show that Gelman scaling (dividing by two standard deviations instead of one) produces narrower distributions than standard scaling, with the values lying closer to the mean.

In [None]:
# Fit one default-configured PCA per scaling variant.
pca_gelman, pca_standard = PCA(), PCA()
pca_gelman.fit(df_gelman)
pca_standard.fit(df_standard)

### PCA using Gelman scaling

In [None]:
# Build the project's loadings-plot helper from the Gelman-scaled data and its
# fitted PCA, then render it.
# NOTE(review): the third argument (4) is presumably the number of components
# to display — confirm against lib/visuals.
gelman_loading_plot = vs.FeatureLoadingsPlot(df_gelman, pca_gelman,4)
gelman_loading_plot.display_segments()

### PCA using Standard scaling

In [None]:
# Build the project's loadings-plot helper from the standard-scaled data and
# its fitted PCA, then render it.
# NOTE(review): the third argument (4) is presumably the number of components
# to display — confirm against lib/visuals.
standard_loading_plot = vs.FeatureLoadingsPlot(df_standard, pca_standard,4)
standard_loading_plot.display_segments()

In [None]:
whos PCA

In [None]:
# Project both scaled feature tables onto their principal components.
gelman_scores = pca_gelman.transform(df_gelman)
standard_scores = pca_standard.transform(df_standard)

# Derive the labels from the number of components actually produced instead of
# hard-coding ten names, so the cell keeps working if the feature set changes.
dims = ['Dimension {}'.format(i + 1) for i in range(gelman_scores.shape[1])]

df_gelman_pca = pd.DataFrame(gelman_scores)
df_gelman_pca.columns = dims
df_standard_pca = pd.DataFrame(standard_scores)
# Both PCAs are fitted on tables with the same columns, so the same labels
# apply; a mismatch raises here, as it did with the fixed list.
df_standard_pca.columns = dims

In [None]:
whos DataFrame

### Clustering using PCA with Gelman scaling

In [None]:
# Build the project's clustering helper on the Gelman-scaled PCA scores and
# draw its plots.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try —
# confirm against lib/visuals.
gelman_pca_clusters = vs.Clusters(df_gelman_pca, [2,3,4])
gelman_pca_clusters.cluster_plots()

### Clustering using PCA with Standard scaling

In [None]:
# Build the project's clustering helper on the standard-scaled PCA scores and
# draw its plots.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try —
# confirm against lib/visuals.
standard_pca_clusters = vs.Clusters(df_standard_pca, [2,3,4])
standard_pca_clusters.cluster_plots()