## Cluster Model

#### Features chosen for PCA

**Numerical:**

- GrLivArea
- FirstFlrSF
- YearBuilt
- YearRemodAdd
- GarageYrBuilt

**Categorical:**
 
- Utilities
- Street
- Condition2 (Proximity to various conditions)
- RoofMatl
- Heating


In [None]:
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Shell escape: install tqdm with pip. None of the cells in this part of the
# notebook import tqdm directly - presumably lib.visuals needs it (confirm).
!pip install tqdm

In [None]:
cd ..

In [None]:
import lib.visuals as vs

In [None]:
# Load scikit-learn's Iris dataset; later cells read its .feature_names,
# .data and .target attributes.
IRIS = load_iris()

In [None]:
# Derive compact column names from the dataset's feature labels: strip every
# space first, then every "(cm)" occurrence (so a label shaped like
# "sepal length (cm)" would become "sepallength").
names = [
    label.replace(" ", "").replace("(cm)", "")
    for label in IRIS.feature_names
]

In [None]:
# Seed NumPy's global random number generator so later random operations
# (e.g. the DataFrame.sample shuffle, which is called without random_state)
# are repeatable between runs.
np.random.seed(42)

In [None]:
# Assemble the working frame: one column per cleaned feature name, plus the
# target as a categorical 'label' column.
df_original = pd.DataFrame(IRIS.data, columns=names)
df_original['label'] = pd.Series(IRIS.target).astype('category')
# One-hot encode the categorical label, then shuffle all rows (frac=1 keeps
# every row; the original index labels travel with the rows).
df_original = pd.get_dummies(df_original).sample(frac=1)

In [None]:
# Preview the first rows of the shuffled, one-hot-encoded frame.
df_original.head()

In [None]:
def apply_scale(dataframe, scaling_function):
    """Return a copy of ``dataframe`` whose float columns have been rescaled.

    Only the columns picked by ``select_dtypes(include=[float])`` are handed
    to ``scaling_function``; every other column is carried over unchanged.
    The labels of the selected columns are printed as a side effect.

    Args:
        dataframe: pandas DataFrame to scale.
        scaling_function: callable that takes the float-only DataFrame and
            returns a DataFrame; its columns are written into the copy.

    Returns:
        A new DataFrame built from ``dataframe.copy()`` with the scaled
        columns assigned over the originals.
    """
    float_part = dataframe.select_dtypes(include=[float])
    print(float_part.columns)
    rescaled = scaling_function(float_part)

    # Write into a copy so the caller's frame keeps its original values.
    result = dataframe.copy()
    result[rescaled.columns] = rescaled
    return result

def gelman_scale(dataframe):
    """Center on the mean and divide by two standard deviations.

    ``mean()`` and ``std()`` are called with their defaults, so for a
    DataFrame the statistics are computed per column.
    """
    centered = dataframe - dataframe.mean()
    two_std = 2 * dataframe.std()
    return centered / two_std

def standard_scale(dataframe):
    """Center on the mean and divide by one standard deviation.

    ``mean()`` and ``std()`` are called with their defaults, so for a
    DataFrame the statistics are computed per column.
    """
    centered = dataframe - dataframe.mean()
    spread = dataframe.std()
    return centered / spread

# Build two rescaled variants of the data. apply_scale only touches the float
# columns: gelman_scale divides the centered values by two standard
# deviations, standard_scale by one. Each call also prints the scaled columns.
df_gelman = apply_scale(df_original, gelman_scale)
df_standard = apply_scale(df_original, standard_scale)

In [None]:
# Compare every column's distribution across the three variants of the data
# on a 3 x 7 grid: row 1 is the unscaled frame, row 2 the Gelman-scaled one,
# row 3 the standard-scaled one. Each panel is clipped to the x-range [-3, 3].
fig = plt.figure(figsize=(20, 9))
frames = (df_original, df_gelman, df_standard)
for i, col in enumerate(df_original.columns):
    for row, frame in enumerate(frames):
        # Subplot positions are 1-based: 1+i, 8+i and 15+i for the three rows.
        fig.add_subplot(3, 7, 1 + 7 * row + i)
        sns.distplot(frame[col])
        plt.xlim(-3, 3)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# One independent PCA (default settings) per variant of the full frame,
# created and fitted pair by pair.
pca_original = PCA()
pca_original.fit(df_original)

pca_gelman = PCA()
pca_gelman.fit(df_gelman)

pca_standard = PCA()
pca_standard.fit(df_standard)

In [None]:
# Feature-loadings plot for the PCA fitted on the unscaled frame.
# NOTE(review): the third argument (7) presumably is the number of
# components/features to draw - confirm against lib.visuals.
original_loading_plot = vs.FeatureLoadingsPlot(df_original, pca_original,7)
original_loading_plot.display_segments()

In [None]:
# Feature-loadings plot for the PCA fitted on the Gelman-scaled frame.
# NOTE(review): the third argument (7) presumably is the number of
# components/features to draw - confirm against lib.visuals.
gelman_loading_plot = vs.FeatureLoadingsPlot(df_gelman, pca_gelman,7)
gelman_loading_plot.display_segments()

In [None]:
# Feature-loadings plot for the PCA fitted on the standard-scaled frame.
# NOTE(review): the third argument (7) presumably is the number of
# components/features to draw - confirm against lib.visuals.
standard_loading_plot = vs.FeatureLoadingsPlot(df_standard, pca_standard,7)
standard_loading_plot.display_segments()

In [None]:
# Keep only the float columns of each variant, which drops the one-hot label
# columns produced by get_dummies (assuming they are not float-typed).
df_original_numerical, df_gelman_numerical, df_standard_numerical = (
    frame.select_dtypes(include=[float])
    for frame in (df_original, df_gelman, df_standard)
)

In [None]:
# Three separate, default-configured PCA estimators for the float-only frames.
pca_original_numerical, pca_gelman_numerical, pca_standard_numerical = (
    PCA(),
    PCA(),
    PCA(),
)

In [None]:
# Fit each float-only PCA on the matching float-only frame.
pca_original_numerical.fit(df_original_numerical)
pca_gelman_numerical.fit(df_gelman_numerical)
pca_standard_numerical.fit(df_standard_numerical)

In [None]:
# Feature-loadings plot for the PCA fitted on the unscaled float-only frame.
# NOTE(review): the third argument (4) presumably is the number of
# components/features to draw - confirm against lib.visuals.
original_numerical_loadings = vs.FeatureLoadingsPlot(df_original_numerical, pca_original_numerical,4)
original_numerical_loadings.display_segments()

In [None]:
# Feature-loadings plot for the PCA fitted on the Gelman-scaled float-only
# frame.
# NOTE(review): the third argument (4) presumably is the number of
# components/features to draw - confirm against lib.visuals.
gelman_numerical_loadings = vs.FeatureLoadingsPlot(df_gelman_numerical, pca_gelman_numerical,4)
gelman_numerical_loadings.display_segments()

In [None]:
# Feature-loadings plot for the PCA fitted on the standard-scaled float-only
# frame.
# NOTE(review): the third argument (4) presumably is the number of
# components/features to draw - confirm against lib.visuals.
standard_numerical_loadings = vs.FeatureLoadingsPlot(df_standard_numerical, pca_standard_numerical,4)
standard_numerical_loadings.display_segments()

In [None]:
whos DataFrame

In [None]:
whos PCA

In [None]:
# Column labels for the projected data: four components for the float-only
# frames, seven for the full frames.
dims_4 = ['Dimension 1', 'Dimension 2', 'Dimension 3', 'Dimension 4']
dims_7 = dims_4 + ['Dimension 5', 'Dimension 6', 'Dimension 7']

# Project each frame with the PCA that was fitted on it and wrap the result in
# a labelled DataFrame (fresh default index, not the shuffled one).
df_original_pca = pd.DataFrame(
    pca_original.transform(df_original), columns=dims_7)
df_original_numerical_pca = pd.DataFrame(
    pca_original_numerical.transform(df_original_numerical), columns=dims_4)

df_gelman_pca = pd.DataFrame(
    pca_gelman.transform(df_gelman), columns=dims_7)
df_gelman_numerical_pca = pd.DataFrame(
    pca_gelman_numerical.transform(df_gelman_numerical), columns=dims_4)

df_standard_pca = pd.DataFrame(
    pca_standard.transform(df_standard), columns=dims_7)
df_standard_numerical_pca = pd.DataFrame(
    pca_standard_numerical.transform(df_standard_numerical), columns=dims_4)

In [None]:
whos DataFrame

In [None]:
# Cluster plots on the PCA projection of the Gelman-scaled full frame.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try -
# confirm against vs.Clusters in lib.visuals.
gelman_pca_clusters = vs.Clusters(df_gelman_pca, [2,3,4])
gelman_pca_clusters.cluster_plots()

In [None]:
# Cluster plots on the PCA projection of the standard-scaled full frame.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try -
# confirm against vs.Clusters in lib.visuals.
standard_pca_clusters = vs.Clusters(df_standard_pca, [2,3,4])
standard_pca_clusters.cluster_plots()

In [None]:
# Cluster plots on the PCA projection of the unscaled full frame.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try -
# confirm against vs.Clusters in lib.visuals.
original_pca_clusters = vs.Clusters(df_original_pca, [2,3,4])
original_pca_clusters.cluster_plots()

In [None]:
# Cluster plots on the PCA projection of the Gelman-scaled float-only frame.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try -
# confirm against vs.Clusters in lib.visuals.
gelman_numerical_pca_clusters = vs.Clusters(df_gelman_numerical_pca, [2,3,4])
gelman_numerical_pca_clusters.cluster_plots()

In [None]:
# Cluster plots on the PCA projection of the standard-scaled float-only frame.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try -
# confirm against vs.Clusters in lib.visuals.
standard_numerical_pca_clusters = vs.Clusters(df_standard_numerical_pca, [2,3,4])
standard_numerical_pca_clusters.cluster_plots()

In [None]:
# Cluster plots on the PCA projection of the unscaled float-only frame.
# NOTE(review): [2,3,4] is presumably the list of cluster counts to try -
# confirm against vs.Clusters in lib.visuals.
original_numerical_pca_clusters = vs.Clusters(df_original_numerical_pca, [2,3,4])
original_numerical_pca_clusters.cluster_plots()