In [None]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from housing_functions import scale_data, calc_pca, calc_loadings, calc_kmeans
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the training and test splits from the CSV files next to the notebook.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Basic info

In [None]:
# Display pandas' summary statistics for `train`; as the last expression of the
# cell, the resulting table is rendered as the cell output.
train.describe()

In [None]:
# drop_duplicates() returns a new frame and leaves `train` untouched, so the
# result must be assigned back for the de-duplication to take effect.
train = train.drop_duplicates()
# Show the (rows, columns) shape after de-duplication as the cell output.
train.shape

# Separate numeric variables from categorical variables

In [None]:
# Partition the training columns by dtype: numeric columns on one side,
# every non-numeric column on the other.
numeric_data = train.select_dtypes(include="number")
categorical_data = train.select_dtypes(exclude="number")

# Report how many columns landed in each group.
print(
    "numeric variables:", len(numeric_data.columns),
    "categorical variables:", len(categorical_data.columns),
)

In [None]:
# Scale the numeric columns with the project helper (housing_functions.scale_data),
# which returns two objects; only the DataFrame form is used below.
scaled_data, scaled_data_df = scale_data(numeric_data)

# Report whether scaling left any missing values. The original check was a bare
# mid-cell expression, so its result was neither displayed nor stored.
print("NaNs present after scaling:", scaled_data_df.isna().any().any())

# Drop incomplete rows from both the scaled and the raw numeric frames.
# NOTE(review): the two dropna() calls are independent; they keep the same rows
# only if scale_data preserves the index and the NaN positions of its input —
# confirm against housing_functions.scale_data.
scaled_data_df = scaled_data_df.dropna()
numeric_data = numeric_data.dropna()

# Principal component analysis (PCA)

In [None]:
# Build the PCA object with the project helper (housing_functions.calc_pca).
# The second argument is presumably the number of components to keep —
# confirm against the helper's signature.
pca = calc_pca(scaled_data_df, 10)

In [None]:
# Project the scaled data onto the PCA components.
reduced10 = pca.transform(scaled_data_df)

# Label the components PC1, PC2, ... up to the number of components in `pca`.
PCnames = ['PC' + str(k) for k in range(1, pca.n_components_ + 1)]

# Wrap the projection in a DataFrame that keeps the row index of the scaled
# data, so it can later be aligned with the other frames.
df_reduced = pd.DataFrame(
    data=reduced10,
    columns=PCnames,
    index=scaled_data_df.index,
)

In [None]:
# Loadings table from the project helper (housing_functions.calc_loadings).
# Its exact layout is defined by that helper; the following cell reuses its
# index and columns.
pca_loadings = calc_loadings(pca, numeric_data)

In [None]:
# Scale each component's weights by the square root of its explained variance.
loading_components = np.multiply(pca.components_.T, np.sqrt(pca.explained_variance_))

# Label the scaled weights with the same index/columns as `pca_loadings`.
loading_matrix = pd.DataFrame(
    data=loading_components,
    index=pca_loadings.index,
    columns=pca_loadings.columns,
)

# KMeans analysis

In [None]:
# Cluster the PCA-reduced array with the project helper
# (housing_functions.calc_kmeans), requesting 6 clusters.
# NOTE(review): later cells read `result.cluster`, so kmeans_res presumably
# carries a `cluster` column indexed like scaled_data_df — confirm in the helper.
model, kmeans_res = calc_kmeans(reduced10, n_clust=6, df_scaled=scaled_data_df)

In [None]:
# Join the raw numeric columns, the principal-component scores and the k-means
# output side by side; rows are aligned on the index.
result = pd.concat(
    [numeric_data, df_reduced, kmeans_res],
    axis="columns",
)

In [None]:
color_list = ['red', 'blue', 'green', 'yellow', 'cyan', 'magenta', 'gray', 'black']
# One colour per cluster; 6 matches the n_clust requested from calc_kmeans.
palette = color_list[:6]

# SalePrice against Id, coloured by cluster.
# x and y are passed as keywords: seaborn >= 0.12 rejects positional data
# vectors with a TypeError (0.11 only emitted a FutureWarning).
plt.figure(figsize=(10, 10))
sns.scatterplot(x=result['Id'], y=result['SalePrice'], hue=result['cluster'], palette=palette)

In [None]:
# SalePrice against YearBuilt, coloured by cluster.
# x and y are passed as keywords: seaborn >= 0.12 rejects positional data
# vectors with a TypeError (0.11 only emitted a FutureWarning).
plt.figure(figsize=(10, 10))
sns.scatterplot(x=result['YearBuilt'], y=result['SalePrice'], hue=result['cluster'], palette=palette)

In [None]:
# SalePrice against OverallQual, coloured by cluster.
# x and y are passed as keywords: seaborn >= 0.12 rejects positional data
# vectors with a TypeError (0.11 only emitted a FutureWarning).
plt.figure(figsize=(10, 10))
sns.scatterplot(x=result['OverallQual'], y=result['SalePrice'], hue=result['cluster'], palette=palette)

# Prediction analysis