Project #1 - Shraddha Upadhyay

In [1]:
import numpy as np
import pandas as pd
import itertools 
import prince
import gower
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA 
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.linear_model import Ridge
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.cluster import normalized_mutual_info_score

In [2]:
# Load data set
data = pd.read_csv('train.csv')
data = data.drop('Id', axis = 1)

# Remove columns that have too many missing values
data = data.drop(data.columns[data.isnull().sum() > 30], axis = 1)

# Remove missing values
data.dropna(inplace = True)

Your code starts here...

In [3]:
data.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500


In [4]:
# Separating X and y
X = data.drop('SalePrice', axis = 1)
y = data.SalePrice

In [5]:
#Train, validate, test and split
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=201)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.2, random_state=301)

In [6]:
print('X train valid: ',X_train_valid.shape)
print('X test       : ',X_test.shape)
print('y train valid: ',y_train_valid.shape)
print('y test       : ',y_test.shape)
print('X train      : ',X_train.shape)
print('X valid      : ',X_valid.shape)
print('y train      : ',y_train.shape)
print('y valid      : ',y_valid.shape)

X train valid:  (1160, 63)
X test       :  (291, 63)
y train valid:  (1160,)
y test       :  (291,)
X train      :  (928, 63)
X valid      :  (232, 63)
y train      :  (928,)
y valid      :  (232,)


In [7]:
# Now we will separate our features into numerical and categorical types
XTrainValidNum = X_train_valid.select_dtypes(include=[np.number]) #for PCA
XTrainValidCat = X_train_valid.select_dtypes(exclude=[np.number]) #for MCA

XTrainNum = X_train.select_dtypes(include=[np.number]) #for PCA
XTrainCat = X_train.select_dtypes(exclude=[np.number]) #for MCA

XValidNum = X_valid.select_dtypes(include=[np.number]) #for PCA
XValidCat = X_valid.select_dtypes(exclude=[np.number]) #for MCA

XTestNum = X_test.select_dtypes(include=[np.number]) #for PCA
XTestCat = X_test.select_dtypes(exclude=[np.number]) #for MCA

In [8]:
# Printing the size 
print('X train validation numerical        : ',XTrainValidNum.shape)
print('X train validation categorical      : ',XTrainValidCat.shape)
print('X train numerical                   : ',XTrainNum.shape)
print('X train categorical                 : ',XTrainCat.shape)
print('X test numerical                    : ',XTestNum.shape)
print('X test categorical                  : ',XTestCat.shape)
print('X valid numerical                   : ',XValidNum.shape)
print('X valid categorical                 : ',XValidCat.shape)

X train validation numerical        :  (1160, 34)
X train validation categorical      :  (1160, 29)
X train numerical                   :  (928, 34)
X train categorical                 :  (928, 29)
X test numerical                    :  (291, 34)
X test categorical                  :  (291, 29)
X valid numerical                   :  (232, 34)
X valid categorical                 :  (232, 29)


In [9]:
# Making sure the training feature and testing feature has same number of levels
keep = XTrainValidCat.nunique() == XTestCat.nunique() 
XTrainValidCat = XTrainValidCat[XTrainValidCat.columns[keep]]
XTestCat = XTestCat[XTestCat.columns[keep]]
XTrainCat = XTrainCat[XTrainCat.columns[keep]]
XValidCat = XValidCat[XValidCat.columns[keep]]

In [10]:
keepN = XTrainCat.nunique() == XValidCat.nunique()
XTrainValidCat = XTrainValidCat[XTrainValidCat.columns[keepN]]
XTrainCat = XTrainCat[XTrainCat.columns[keepN]]
XValidCat = XValidCat[XValidCat.columns[keepN]]
XTestCat = XTestCat[XTestCat.columns[keepN]]

In [11]:
#For categorical features that have same levels, making sure the classes are the same
keep = []
for i in range(XTrainValidCat.shape[1]):
    keep.append(all(np.sort(XTrainValidCat.iloc[:,i].unique()) == np.sort(XTestCat.iloc[:,i].unique())))
XTrainValidCat = XTrainValidCat[XTrainValidCat.columns[keep]]
XTestCat = XTestCat[XTestCat.columns[keep]]
XTrainCat = XTrainCat[XTrainCat.columns[keep]]
XValidCat = XValidCat[XValidCat.columns[keep]]

In [12]:
keep_1 = []
for j in range(XTrainCat.shape[1]):
    keep_1.append(all(np.sort(XTrainCat.iloc[:,j].unique()) == np.sort(XValidCat.iloc[:,j].unique())))
XTrainCat = XTrainCat[XTrainCat.columns[keep_1]]
XValidCat = XValidCat[XValidCat.columns[keep_1]]
XTrainValidCat = XTrainValidCat[XTrainValidCat.columns[keep_1]]
XTestCat = XTestCat[XTestCat.columns[keep_1]]

In [13]:
# Let us scale the data for training and validation sets 
scaleddata = StandardScaler()
scaleddata.fit(XTrainNum)

XTrainNumScale = pd.DataFrame(scaleddata.transform(XTrainNum))
XValidNumScale = pd.DataFrame(scaleddata.transform(XValidNum))

In [14]:
# Let us create parameter lists for PCA and MCA
PComponent = range(4, 8) #PCA
MComponent = range(4, 8) #MCA
MIter = range(4, 8)  #MCA

ParameterList = list(itertools.product(PComponent, MComponent, MIter)) #List of paramters
valid_score = [] #Validation score

In [15]:
# Let us perform PCA and MCA
for param in ParameterList:
  pca = PCA(n_components=param[0], random_state=987)
  mca = prince.MCA(n_components=param[1], n_iter=param[2], random_state=987)
 
  # below we fit, transform and combine the numerical and categorical training data
  XTrainNumT = pca.fit_transform(XTrainNumScale)    # fitting and transforming the training set
  XTrainCatT = mca.fit_transform(XTrainCat)
  XTrainComb = np.concatenate([XTrainNumT, XTrainCatT], axis=1) # Combining the training set

  # fitting Knn model
  knn = KNeighborsRegressor()
  knn.fit(XTrainComb, y_train)

  # below we fit, transform and combine the numerical and categorical validation data
  XValidNumT = pca.transform(XValidNumScale)  # here we are transforming the validation set. we are not fitting it here
  XValidCatT = mca.transform(XValidCat)
  XValidCombined = np.concatenate([XValidNumT, XValidCatT], axis=1) # Combining the validation set

  # lets us append the validation score for the hyperparameters
  valid_score.append(mean_squared_error(knn.predict(XValidCombined), y_valid))

In [16]:
# let us try to find the best hyperparameters
best_params = ParameterList[np.argmin(valid_score)]
print("PCA components : ", best_params[0])
print("MCA components : ", best_params[1])
print("MCA iterations : ", best_params[2])

PCA components :  5
MCA components :  4
MCA iterations :  6


In [17]:
# let us scale our training,testing and validation data again for prediction
scaler_1 = StandardScaler()
scaler_1.fit(XTrainValidNum)

XTrainValidNumScale = pd.DataFrame(scaleddata.transform(XTrainValidNum))
XTestNumScale = pd.DataFrame(scaleddata.transform(XTestNum))

In [18]:
# let's print sizes of the sets
print('X train validation numerical        : ',XTrainValidNum.shape)
print('X train validation categorical      : ',XTrainValidCat.shape)
print('X train numerical                   : ',XTrainNum.shape)
print('X train categorical                 : ',XTrainCat.shape)
print('X test numerical                    : ',XTestNum.shape)
print('X test categorical                  : ',XTestCat.shape)
print('X valid numerical                   : ',XValidNum.shape)
print('X valid categorical                 : ',XValidCat.shape)

X train validation numerical        :  (1160, 34)
X train validation categorical      :  (1160, 9)
X train numerical                   :  (928, 34)
X train categorical                 :  (928, 9)
X test numerical                    :  (291, 34)
X test categorical                  :  (291, 9)
X valid numerical                   :  (232, 34)
X valid categorical                 :  (232, 9)


In [19]:
# let us perform PCA and MCA 
pca1 = PCA(n_components=best_params[0], random_state=657)
mca1 = prince.MCA(n_components=best_params[1], n_iter=best_params[2], random_state=657)
 
# below we fit, transform and combine the numerical and categorical training data
XTrainValidNumT = pca1.fit_transform(XTrainValidNum)  #Now We fit_transform & train+valid set
XTrainValidCatT = mca1.fit_transform(XTrainValidCat)
XTrainValidCombined = np.concatenate([XTrainValidNumT, XTrainValidCatT], axis=1)

# let's use knn regressor
knn = KNeighborsRegressor()
knn.fit(XTrainValidCombined, y_train_valid)

# below we transform and combine the numerical and categorical validation data
XTestNumT = pca1.transform(XTestNumScale) 
XTestCatT = mca1.transform(XTestCat)
XTestCombined = np.concatenate([XTestNumT, XTestCatT], axis=1) 

In [20]:
# printing error on the test set
print("Error on the test set is: ",metrics.mean_squared_error(knn.predict(XTestCombined), y_test))

Error on the test set is:  13159326099.05842


### Lets us apply Ridge Regression to predict the score on the test set

In [21]:
# for the regression, we need to create dummy variables
numer = data.select_dtypes(include=[np.number]) #for PCA
categ = data.select_dtypes(exclude=[np.number]) #for MCA

In [22]:
rid_cat= pd.get_dummies(data = categ, columns = categ.columns , drop_first = True)
rid_cat.head(3)

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [23]:
merged=numer.merge(rid_cat,left_index=True,right_index=True)

In [24]:
# separating X and y 
X = merged.drop(['SalePrice'], axis=1)
y = merged['SalePrice'].values

In [25]:
# let us separate train, test and validation sets
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size = 0.2, random_state = 111)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.2, random_state = 598)

In [26]:
# scaling the data
scaleddata = StandardScaler() # Instantiate
scaleddata.fit(X_train_valid) # fit
X_train_valid = pd.DataFrame(scaleddata.transform(X_train_valid)) # transform on train_valid set
X_test = pd.DataFrame(scaleddata.transform(X_test)) # Transform on test set
X_train_valid.columns = X.columns.values
X_test.columns = X.columns.values

In [27]:
# Setting up lambda values 
alphas = np.logspace(-10,5,10)

score2 = []
for a in alphas:
    newmodel = Ridge(alpha=a)
    newmodel.fit(X_train, y_train) # Fit
    score2.append(mean_squared_error(newmodel.predict(X_valid), y_valid)) # Evaluate the model

  return linalg.solve(A, Xy, sym_pos=True,


In [28]:
# calculating the validation error and lambda value
min(score2)
alphas[np.argmin(score2)]

1.0

In [29]:
# scaling the data
scaleddata.fit(X_train_valid) 
X_train_valid = pd.DataFrame(scaleddata.transform(X_train_valid)) 
X_test = pd.DataFrame(scaleddata.transform(X_test)) 

In [30]:
model3 = Ridge(alpha=alphas[np.argmin(score2)])
model3.fit(X_train_valid, y_train_valid)
print(pd.DataFrame(zip(model3.coef_,X.columns)))
print("The prediction error on the testing set is", np.sqrt(mean_squared_error(model3.predict(X_test), y_test)))

                0                      1
0    -4830.915822             MSSubClass
1     6112.598070                LotArea
2    11012.352414            OverallQual
3     5663.572065            OverallCond
4    10141.568018              YearBuilt
..            ...                    ...
191    645.373013  SaleCondition_AdjLand
192    378.029812   SaleCondition_Alloca
193   -667.087882   SaleCondition_Family
194   2417.043797   SaleCondition_Normal
195  -3887.442124  SaleCondition_Partial

[196 rows x 2 columns]
The prediction error on the testing set is 27246.55704656617


## Clustering

In [31]:
# lets compute the gower distance 
X = data.drop(['SalePrice'], axis=1) 
gower = gower.gower_matrix(X)

In [32]:
# finding k-medoids
med = KMedoids(n_clusters=5, random_state = 300)

In [33]:
cls = med.fit(gower)

In [34]:
kmedLabel = med.labels_   

In [35]:
# putting variables into equal-sized buckets
data['Cluster'] = pd.qcut(data.SalePrice,q=5,labels=[1,2,3,4,5])

In [36]:
# calculating and printing the NMI (Normalized Mutual Information) score
print("The NMI score is: ",normalized_mutual_info_score(kmedLabel,data['Cluster'] )) 

The NMI score is:  0.2149850057771022


We can see that the NMI score is approximately 0.2 which is very close to 0 on a scale of 0 to 1, which means that there is very less correlation between our clusters. To improve these results, let us try different cluster sizes.

In [37]:
# let us check with the cluster value of 10
from sklearn_extra.cluster import KMedoids
med2 = KMedoids(n_clusters=10, random_state = 431)
cls2 = med2.fit(gower)
kmedLabel2 = med2.labels_
data['Cluster'] = pd.qcut(data.SalePrice,q=10,labels=[1,2,3,4,5,6,7,8,9,10])

print("The NMI score is: ",normalized_mutual_info_score(kmedLabel2,data['Cluster'] ))

The NMI score is:  0.2122413966443631


In [38]:
# let us check with the cluster value of 20
med3 = KMedoids(n_clusters=20, random_state = 345)
cls3 = med3.fit(gower)
kmedLabel3 = med3.labels_
data['Cluster'] = pd.qcut(data.SalePrice,q=20,labels=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])

print("The NMI score is: ",normalized_mutual_info_score(kmedLabel3,data['Cluster'] ))

The NMI score is:  0.2280597037976478


We can observe that as we keep increasing the value of clusters, the NMI score keeps increasing. 