# Capstone project 1: House prices advanced regression

## Machine Learning - Polynomial Model

In [1]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

In [2]:
# Load test and train data
Xtrain = pd.read_csv('./dummy_data/X_train.csv',index_col = 'Id')
Xtest = pd.read_csv('./dummy_data/X_test.csv',index_col = 'Id')
ytrain = pd.read_csv('./dummy_data/y_train.csv',header=None,index_col=0)
ytest = pd.read_csv('./dummy_data/y_test.csv',header=None,index_col=0)

### Full model

In [3]:
num_cols = Xtrain.columns[:36].drop('SoldTime')
cat_cols = Xtrain.columns[36:]

In [4]:
# transform to polynomial
transPoly = PolynomialFeatures(2)

In [5]:
# generate order=2 polynomial features for numerical columns 
Xtrain_cat = Xtrain[cat_cols]
Xtrain_num = transPoly.fit_transform(Xtrain[num_cols])
Xtest_cat = Xtest[cat_cols]
Xtest_num = transPoly.transform(Xtest[num_cols])

In [6]:
Xtrain_cat.shape,Xtrain_num.shape

((1022, 283), (1022, 666))

In [7]:
Xtest_cat.shape,Xtest_num.shape

((438, 283), (438, 666))

In [8]:
# combine numerical and categorical data together
X_train = [list(Xtrain_cat.iloc[i].values) + list(Xtrain_num[i]) for i in range(1022)]


In [9]:
X_test = [list(Xtest_cat.iloc[i].values) + list(Xtest_num[i]) for i in range(438)]

In [10]:
# linear regression
reg = LinearRegression().fit(X_train, ytrain)

In [25]:
X_train.shape,ytrain.shape

((1022, 105), (1022, 1))

In [11]:
# predicting testing results
predict = reg.predict(X_test)

In [12]:
predict.shape,ytest.shape

((438, 1), (438, 1))

In [13]:
# testing score
r2_score(predict,ytest) 

-0.0020996226672027607

In [14]:
# training score
r2_score(reg.predict(X_train),ytrain) 

0.9891409875982647

### Selected features

In [15]:
# According to feature EDA, sellect only the interesting features

interesting_columns = ['RoofMatl', 'Exterior1st','Neighborhood','LotFrontage', 'LotArea','OverallQual', \
                     'OverallCond', 'YearBuilt', 'YearRemodAdd','BsmtFinSF1','ExterCond','BsmtQual',\
                     'HeatingQC','CentralAir','KitchenQual','FireplaceQu','SaleType', 'SaleCondition', \
                     'MSSubClass','TotalBsmtSF', '1stFlrSF','2ndFlrSF','GrLivArea','GarageCars', 'GarageArea']

In [16]:
Features = []

In [17]:
# choose the columns that is in training data 
#(which is only numerical because categorical collumns are transformed to dummy tables with different column names)
for col in interesting_columns:
    if col in Xtrain.columns:
        Features.append(col)

In [18]:
print(Features)

['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageCars', 'GarageArea']


In [19]:
Xtrain = Xtrain[Features]
Xtest = Xtest[Features]

In [20]:
# transform those numerical collumns to polynomial features
X_train = transPoly.fit_transform(Xtrain[Features])

X_test = transPoly.transform(Xtest[Features])

In [26]:
X_train.shape,X_test.shape,ytrain.shape,ytest.shape

((1022, 105), (438, 105), (1022, 1), (438, 1))

In [27]:
# 4-fold cross validation
cv = []
model = []
kf = KFold(n_splits=4)
for train_index, test_index in kf.split(X_train):
    Xtrain, Xtest = X_train[train_index], X_train[test_index]
    ytr, yte = ytrain.values[train_index], ytrain.values[test_index]
    reg = LinearRegression().fit(Xtrain, ytr) 
    model.append(reg)
    cv.append(r2_score(reg.predict(Xtest),yte))

In [28]:
cv

[0.2903768396325155,
 0.42388878466125146,
 0.8367384990692768,
 0.8412627027026077]

In [30]:
predict3 = model[3].predict(X_test) # predict testing data

In [31]:
r2_score(predict3,ytest) # testing score of model3

0.7263420274193868

In [32]:
predict2 = model[2].predict(X_test)# testing score of model2
r2_score(predict2,ytest)

0.7645308566369952

In [33]:
predict1 = model[1].predict(X_test)# testing score of model1
r2_score(predict1,ytest)

0.4149995140497805

In [34]:
predict = model[0].predict(X_test)# testing score of model0
r2_score(predict,ytest)

0.18856306567728798