In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
# Load data set
data = pd.read_csv('train (1).csv')
data = data.drop('Id', axis = 1)

# Remove columns that have too many missing values
data = data.drop(data.columns[data.isnull().sum() > 30], axis = 1)

# Remove missing values
data.dropna(inplace = True)

# Prep Data

In [3]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [4]:
# Define our predictor and response variable 
X = data.copy()
del X['SalePrice']
y = data['SalePrice']

# Check what kind of data types in X 
data_type = X.dtypes
data_type.value_counts()

int64      33
object     29
float64     1
dtype: int64

In [5]:
# Train/ validation/ test split
from sklearn.model_selection import train_test_split

X_train_valid, X_test, y_train_valid, y_test = train_test_split(X,y,test_size = 0.3, random_state = 1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.3, random_state = 1)


# Part 1: Dimension Reduction

## Select the best hyperparameters using Train and Valid datasets

In [6]:
# Split numerical and categorical variables in training dataset 

X_train_n = X_train.select_dtypes(include = ['int64','float64'])
X_train_c = X_train.select_dtypes(include = 'object')

# Split numerical and categorical variables in validation dataset 

X_valid_n = X_valid.select_dtypes(include = ['int64','float64'])
X_valid_c = X_valid.select_dtypes(include = 'object')

In [7]:
# Script to check categorical features have the same levels
keep = X_train_c.nunique() == X_valid_c.nunique()
X_train_c = X_train_c[X_train_c.columns[keep]]
X_valid_c = X_valid_c[X_valid_c.columns[keep]]

keep = []
for i in range(X_train_c.shape[1]):
    keep.append(all(np.sort(X_train_c.iloc[:,i].unique()) == np.sort(X_valid_c.iloc[:,i].unique())))
X_train_c = X_train_c[X_train_c.columns[keep]]
X_valid_c = X_valid_c[X_valid_c.columns[keep]]


### - Converted the data into predictor and response variables. here Salesprice is our response variable.
### - Split the data into train-validation & test set
### - Separated our categorical data from numerical data, so we can use them to train PCA & MCA
### - Check for the same number of variable features in our training and testing data.

In [8]:
# Standardize our training and validation data based on the training set
from sklearn.preprocessing import StandardScaler 

scaler = StandardScaler()
scaler.fit(X_train_n) 
X_train_n = pd.DataFrame(scaler.transform(X_train_n))
X_valid_n = pd.DataFrame(scaler.transform(X_valid_n))

In [9]:
X_train_c

Unnamed: 0,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,BldgType,MasVnrType,ExterQual,CentralAir,KitchenQual,PavedDrive
1440,Pave,IR1,Bnk,Inside,Mod,Crawfor,1Fam,,TA,Y,TA,Y
476,Pave,Reg,Lvl,Inside,Mod,CollgCr,1Fam,BrkFace,TA,Y,Gd,Y
1308,Pave,Reg,Lvl,Inside,Gtl,OldTown,1Fam,,TA,Y,Gd,Y
699,Pave,IR2,Lvl,Inside,Gtl,Somerst,TwnhsE,,Gd,Y,Gd,Y
1441,Pave,Reg,Lvl,Inside,Gtl,CollgCr,TwnhsE,BrkFace,Gd,Y,Gd,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
441,Pave,Reg,Lvl,Inside,Gtl,Edwards,Duplex,BrkFace,TA,N,Fa,Y
1300,Pave,IR1,Lvl,CulDSac,Gtl,Gilbert,1Fam,,Gd,Y,Gd,Y
280,Pave,Reg,Lvl,Inside,Gtl,SawyerW,1Fam,BrkFace,Gd,Y,Gd,Y
375,Pave,IR1,Low,Inside,Sev,Edwards,1Fam,,Fa,N,Fa,Y


In [10]:
# Create a list of all parameter combinations for our model to test
from itertools import product

pca_components = range(2,31)
mca_components = range(2,11)

param_lst= list(product(pca_components, mca_components))


In [11]:
# Create a loop to find the best parameter combination by evaluating the MSE
from sklearn.decomposition import PCA
from prince import MCA 
from sklearn import linear_model 
from sklearn import metrics
from sklearn.cluster import KMeans


mse_score=[] 


for x in param_lst:
    pca = PCA(n_components=x[0], svd_solver='randomized', random_state=1)
    X_train_n2 = pca.fit_transform(X_train_n)
    X_train_n2 = pd.DataFrame(data=X_train_n2)
   
    # MCA
    mca = MCA(n_components=x[1], random_state=1)
    X_train_c2 = mca.fit_transform(X_train_c)
    X_train_c2.reset_index(inplace=True)
    
    # Combine 
    X_train2 = pd.concat([X_train_n2,X_train_c2], axis=1)
    
    # K-Means
   
        
                
    Kmeans=KMeans(n_clusters=5, max_iter=400 ).fit(X_train2)
    
    
        #Transform and combine validation data
    X_valid_n2 = pca.transform(X_valid_n)
    X_valid_n2 = pd.DataFrame(data=X_valid_n2)
    
    X_valid_c2 = mca.transform(X_valid_c)
    X_valid_c2.reset_index(inplace=True)
    
    X_valid2 = pd.concat([X_valid_n2,X_valid_c2], axis=1)
    
    # Evaluate
    mse_score.append(metrics.mean_squared_error(Kmeans.predict(X_valid2), y_valid))

    
    
# Select best parameters 
best = np.argmin(mse_score)
best_param = param_lst[best]
best_param

(30, 4)

### - We scaled our data before feeding it for dimensionality reduction
### - We tune PCA on numerical data and MCA on categorical data
### - Then, using fit_transform() function, we fit the model with train data

## Evaluate model using combined train/valid and test datasets

In [12]:
# Split numerical and categorical variables in training dataset 

X_train_valid_n = X_train_valid.select_dtypes(include = ['int64','float64'])
X_train_valid_c = X_train_valid.select_dtypes(include = 'object')

# Split numerical and categorical variables in validation dataset 

X_test_n = X_test.select_dtypes(include = ['int64','float64'])
X_test_c = X_test.select_dtypes(include = 'object')

In [13]:
# Script to check categorical features have the same levels
keep = X_train_valid_c.nunique() == X_test_c.nunique()
X_train_valid_c = X_train_valid_c[X_train_valid_c.columns[keep]]
X_test_c = X_test_c[X_test_c.columns[keep]]

keep = []
for i in range(X_train_valid_c.shape[1]):
    keep.append(all(np.sort(X_train_valid_c.iloc[:,i].unique()) == np.sort(X_test_c.iloc[:,i].unique())))
X_train_valid_c = X_train_valid_c[X_train_valid_c.columns[keep]]
X_test_c = X_test_c[X_test_c.columns[keep]]


In [55]:
# Standardize our combinred training/validation and test data 
scaler.fit(X_train_valid_n) 
X_train_valid_n = pd.DataFrame(scaler.transform(X_train_valid_n))
X_test_n = pd.DataFrame(scaler.transform(X_test_n))

In [56]:
# Build model using best hyperparameter


#PCA
pca = PCA(n_components=best_param[0], svd_solver='randomized', random_state=1)
X_train_valid_n2 = pca.fit_transform(X_train_valid_n)
X_train_valid_n2 = pd.DataFrame(data=X_train_valid_n2)
   
# MCA
mca = MCA(n_components=best_param[1], random_state=1)
X_train_valid_c2 = mca.fit_transform(X_train_valid_c)
X_train_valid_c2.reset_index(inplace=True)
    
# Combine 
X_train_valid2 = pd.concat([X_train_valid_n2,X_train_valid_c2], axis=1)
    
# Regression 
kmeans = KMeans(n_clusters=7)
kmeans.fit(X_train_valid2)
    
#Transform and combine validation data
X_test_n2 = pca.transform(X_test_n)
X_test_n2 = pd.DataFrame(data=X_test_n2)
    
X_test_c2 = mca.transform(X_test_c)
X_test_c2.reset_index(inplace=True)
    
X_test2 = pd.concat([X_test_n2,X_test_c2], axis=1)
    
# Evaluate
np.round(metrics.mean_squared_error(kmeans.predict(X_test2), y_test))

36934854714.0

In [57]:
Kmeans_pred=pd.DataFrame(kmeans.labels_)
#kmeans.predict(X_test2)
y_labels = pd.qcut(y_train_valid,7)
y_labels.sort_index()
#normalized_mutual_info_score(y_labels,Kmeans_pred)
y_labels

1042      (175000.0, 206471.429]
861     (114929.143, 134714.286]
1215    (114929.143, 134714.286]
491     (114929.143, 134714.286]
740     (114929.143, 134714.286]
                  ...           
718         (256400.0, 755000.0]
908     (114929.143, 134714.286]
1102      (134714.286, 151000.0]
236       (175000.0, 206471.429]
1067        (151000.0, 175000.0]
Name: SalePrice, Length: 1015, dtype: category
Categories (7, interval[float64]): [(35310.999, 114929.143] < (114929.143, 134714.286] < (134714.286, 151000.0] < (151000.0, 175000.0] < (175000.0, 206471.429] < (206471.429, 256400.0] < (256400.0, 755000.0]]

In [58]:
Kmeans_pred

Unnamed: 0,0
0,1
1,4
2,1
3,3
4,2
...,...
1010,2
1011,4
1012,1
1013,0


### - We imput our best parameteres from Hyper-parameter tuning, with validation set to check the results on test set.  

In [53]:
Kmeans.labels_

array([4, 3, 4, 0, 4, 0, 4, 0, 0, 0, 3, 4, 0, 0, 3, 0, 0, 3, 3, 0, 1, 4,
       0, 1, 3, 1, 2, 0, 2, 1, 4, 2, 3, 1, 2, 4, 0, 4, 1, 4, 2, 1, 2, 2,
       2, 0, 4, 2, 3, 1, 1, 1, 1, 2, 2, 0, 4, 1, 0, 3, 2, 3, 1, 1, 1, 0,
       3, 3, 2, 1, 1, 4, 0, 3, 3, 3, 1, 1, 0, 4, 3, 2, 2, 1, 4, 0, 1, 3,
       1, 4, 2, 2, 0, 3, 4, 0, 3, 4, 3, 4, 4, 0, 2, 4, 3, 4, 3, 3, 0, 3,
       4, 0, 1, 4, 3, 2, 1, 1, 0, 4, 4, 0, 3, 4, 1, 3, 3, 0, 0, 0, 2, 3,
       3, 1, 0, 3, 4, 1, 1, 1, 0, 1, 2, 3, 4, 4, 2, 4, 2, 1, 3, 2, 3, 0,
       0, 2, 1, 2, 1, 4, 4, 2, 4, 1, 0, 4, 3, 3, 2, 4, 0, 4, 4, 1, 4, 3,
       0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 3, 3, 3, 3, 3, 1, 4, 2, 4, 3, 3, 1,
       2, 2, 2, 4, 0, 4, 2, 0, 3, 0, 1, 4, 4, 0, 2, 4, 1, 3, 0, 0, 0, 1,
       3, 2, 2, 4, 1, 1, 4, 4, 1, 0, 3, 2, 0, 3, 4, 1, 2, 4, 0, 4, 2, 3,
       3, 0, 1, 3, 1, 4, 1, 3, 1, 4, 3, 4, 3, 0, 1, 2, 3, 0, 1, 1, 2, 3,
       4, 2, 2, 0, 3, 2, 1, 3, 2, 0, 0, 1, 2, 0, 2, 1, 4, 3, 4, 4, 2, 2,
       0, 1, 2, 3, 0, 3, 0, 0, 1, 0, 2, 3, 3, 1, 2,

## Baseline model using dummy variables

In [None]:
# Convert categorical variables to dummy variables
X_train_valid_c.reset_index(inplace=True)
X_train_valid3 = pd.concat([X_train_valid_n,X_train_valid_c], axis=1)
X_train_valid4 = pd.get_dummies(X_train_valid3, drop_first=True)

X_test_c.reset_index(inplace=True)
X_test3 = pd.concat([X_test_n,X_test_c], axis=1)
X_test4 = pd.get_dummies(X_test3, drop_first=True)

In [None]:
# Fit and eval regression model
lm = linear_model.Ridge()
lm.fit(X_train_valid4, y_train_valid)
metrics.mean_squared_error(lm.predict(X_test4), y_test)

### Observation:  We observed the baseline regression model has a lower error score then the model with reduced dimensions, with the set of hyperparameters used.

# Part II: Clustering Analysis

In [47]:
#Implementing necessary clustering libraries
import gower
from sklearn.metrics.cluster import normalized_mutual_info_score
from pyclustering.cluster.kmedoids import kmedoids

In [48]:
# Find the Gower distance for or original data
gow_dist=gower.gower_matrix(X)


In [49]:
#create random centers index for clustering
center_index = np.random.randint(0, 1451, 5) # 5 clusters 
centers = gow_dist[center_index] 

In [50]:
# kmedoid model 
k_med = kmedoids(gow_dist,center_index )
k_med.process()

# get clusters 
clusters = k_med.get_clusters() 

In [21]:
# Assigning the cluster labels to the cluster points
lst = []
for label in range(len(clusters)):
    index = clusters[label]
    df = X.iloc[index]
    df['pred_label'] = label
    lst.append(df)
    
df = pd.concat([lst[0],lst[1],lst[2],lst[3],lst[4]])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,pred_label
814,50,RL,8248,Pave,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,96,0,0,0,0,7,2006,WD,Normal,0
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,0
8,50,RM,6120,Pave,Reg,Lvl,AllPub,Inside,Gtl,OldTown,...,205,0,0,0,0,4,2008,WD,Abnorml,0
9,190,RL,7420,Pave,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,...,0,0,0,0,0,1,2008,WD,Normal,0
15,45,RM,6120,Pave,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,...,0,0,0,0,0,7,2007,WD,Normal,0


In [22]:
# Divide labels 
labels = pd.qcut(y,5)

In [23]:
normalized_mutual_info_score(labels,df['pred_label'])

0.002007699568728918

### We fit the k medoid model, with distance measure as Gower distance, 
### We bin the original response variable "SalePrice" with the same number of clusters(5 here), we determined for our model.
### We use NMI(normalized_mutual_info_score) score, to identify our model performance

Repeat steps but with 3 clusters 

In [24]:
#Create random centers index for clustering
center_index = np.random.randint(0, 1451, 3) # 3 clusters 
centers = gow_dist[center_index] 

In [25]:
# kmedoid model 
k_med = kmedoids(gow_dist,center_index )
k_med.process()

# get clusters 
clusters = k_med.get_clusters() 

In [26]:
lst = []
for label in range(len(clusters)):
    index = clusters[label]
    df = X.iloc[index]
    df['pred_label_2'] = label
    lst.append(df)
    
df = pd.concat([lst[0],lst[1],lst[2]])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,pred_label_2
542,20,RL,10140,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,...,0,0,0,0,0,6,2009,WD,Normal,0
11,60,RL,11924,Pave,IR1,Lvl,AllPub,Inside,Gtl,NridgHt,...,0,0,0,0,0,7,2006,New,Partial,0
13,20,RL,10652,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,8,2007,New,Partial,0
20,60,RL,14215,Pave,IR1,Lvl,AllPub,Corner,Gtl,NridgHt,...,0,0,0,0,0,11,2006,New,Partial,0
22,20,RL,9742,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,0


In [27]:
# Divide labels 
labels = pd.qcut(y,3)

In [28]:
normalized_mutual_info_score(labels,df['pred_label_2'])

0.0015752135230510185

### Finally, we run the Clustering with 3 clusters, to check the performance.
### Our model performs better, with 5 clusters than, with 3 clusters.