In [2]:
import pandas as pd
import numpy as np
import scipy
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV

from sklearn import metrics

%matplotlib inline

In [6]:
# Read the raw test split from the project's datasets folder.
test = pd.read_csv(filepath_or_buffer='datasets/test.csv')

In [12]:
# 'Id' is only an observation number, so it is removed from the frame.
test = test.drop(columns='Id', errors='ignore')

# Partition the columns by dtype: everything that is not `object` is treated
# as numerical, `object` columns are treated as categorical.
numerical_test = test.select_dtypes(exclude='object')
categorical_test = test.select_dtypes(include='object')


In [8]:
# Names of the ordinal (ranked) categorical columns that are later converted
# to integer codes. The order of this list is preserved from the original.
ord_features = [
    'Lot Shape', 'Utilities', 'Land Slope',
    'Exter Qual', 'Exter Cond',
    'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
    'BsmtFin Type 1', 'BsmtFin Type 2',
    'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional',
    'Fireplace Qu',
    'Garage Finish', 'Garage Qual', 'Garage Cond',
    'Paved Drive', 'Pool QC', 'Fence',
]

In [9]:
# Convert ordinal features to integer codes.
#
# Each column has its own label -> code mapping. Labels that do not appear in
# a column's mapping are left unchanged by `Series.replace`.
#
# Note: `pd.read_csv` parses the literal string 'NA' as NaN by default, so with
# a default read the 'NA' entries below never match; those rows receive 0 from
# the `fillna(0)` at the end of this cell instead.
quality_codes = {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5}
quality_codes_with_na = {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, 'NA': 6}
bsmt_finish_codes = {'GLQ': 1, 'ALQ': 2, 'BLQ': 3, 'Rec': 4, 'LwQ': 5, 'Unf': 6, 'NA': 7}

ordinal_codes = {
    'Lot Shape': {'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4},
    'Utilities': {'AllPub': 1, 'NoSewr': 2, 'NoSeWa': 3, 'ELO': 4},
    'Land Slope': {'Gtl': 1, 'Mod': 2, 'Sev': 3},
    'Exter Qual': quality_codes,
    'Exter Cond': quality_codes,
    'Bsmt Qual': quality_codes_with_na,
    'Bsmt Cond': quality_codes_with_na,
    'Bsmt Exposure': {'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5},
    'BsmtFin Type 1': bsmt_finish_codes,
    'BsmtFin Type 2': bsmt_finish_codes,
    'Heating QC': quality_codes,
    'Electrical': {'SBrkr': 1, 'FuseA': 2, 'FuseF': 3, 'FuseP': 4, 'Mix': 5},
    'Kitchen Qual': quality_codes_with_na,
    'Functional': {'Typ': 1, 'Min1': 2, 'Min2': 3, 'Mod': 4, 'Maj1': 5, 'Maj2': 6, 'Sev': 7, 'Sal': 8},
    'Fireplace Qu': quality_codes_with_na,
    'Garage Finish': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4},
    'Garage Qual': quality_codes_with_na,
    'Garage Cond': quality_codes_with_na,
    'Paved Drive': {'Y': 1, 'P': 2, 'N': 3},
    'Pool QC': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'NA': 5},
    'Fence': {'GdPrv': 1, 'MnPrv': 2, 'GdWo': 3, 'MnWw': 4, 'NA': 5},
}

# Assign the result back to the column instead of calling
# `test[col].replace(..., inplace=True)`: the in-place form operates on a
# temporary Series and does not update `test` under pandas Copy-on-Write.
for column, codes in ordinal_codes.items():
    test[column] = test[column].replace(codes)

# Fill in 0 for missing values. This must target `test` (the frame encoded
# above); the previous version wrote to `train`, which this notebook never defines.
test[ord_features] = test[ord_features].fillna(0)


In [14]:
# Build the combined list of numerical feature names: the non-object columns
# followed by the ordinal columns.
numerical_feature_test = [*numerical_test.columns, *ord_features]

In [15]:
# Nominal features = categorical columns that are not in the ordinal list.
# np.setdiff1d returns the sorted, unique difference as a NumPy array.
categorical_names = categorical_test.columns.tolist()
nom_features = np.setdiff1d(categorical_names, ord_features)

In [16]:
# Nominal categorical columns: replace missing values with the string 'None'.
nominal_filled = test[nom_features].fillna(value='None')
test[nom_features] = nominal_filled

# Display the remaining null count per nominal column.
test[nom_features].isna().sum()

Alley           0
Bldg Type       0
Central Air     0
Condition 1     0
Condition 2     0
Exterior 1st    0
Exterior 2nd    0
Foundation      0
Garage Type     0
Heating         0
House Style     0
Land Contour    0
Lot Config      0
MS Zoning       0
Mas Vnr Type    0
Misc Feature    0
Neighborhood    0
Roof Matl       0
Roof Style      0
Sale Type       0
Street          0
dtype: int64

In [17]:
# Count missing values per numerical column to see which ones need imputing
# (this cell only inspects the data; it does not fill anything).
numerical_test.isna().sum()

PID                  0
MS SubClass          0
Lot Frontage       160
Lot Area             0
Lot Shape            0
Utilities            0
Land Slope           0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Mas Vnr Area         1
Exter Qual           0
Exter Cond           0
Bsmt Qual            0
Bsmt Cond            0
Bsmt Exposure        0
BsmtFin Type 1       0
BsmtFin SF 1         0
BsmtFin Type 2       0
BsmtFin SF 2         0
Bsmt Unf SF          0
Total Bsmt SF        0
Heating QC           0
Electrical           0
1st Flr SF           0
2nd Flr SF           0
Low Qual Fin SF      0
Gr Liv Area          0
Bsmt Full Bath       0
Bsmt Half Bath       0
Full Bath            0
Half Bath            0
Bedroom AbvGr        0
Kitchen AbvGr        0
Kitchen Qual         0
TotRms AbvGrd        0
Functional           0
Fireplaces           0
Fireplace Qu         0
Garage Yr Blt       45
Garage Finish        0
Garage Cars          0
Garage Area

In [18]:
#Lot Frontage
# Fill missing values with the median frontage of the same neighborhood.
# This operates on `test`; the previous version read and wrote `train`, which
# is not defined in this notebook, so the test frame was never imputed.
overall_frontage_median = test['Lot Frontage'].median()
neighborhood_frontage_median = test.groupby('Neighborhood')['Lot Frontage'].transform('median')
test['Lot Frontage'] = test['Lot Frontage'].fillna(neighborhood_frontage_median)

# A neighborhood with no observed frontage has a NaN median; fall back to the
# overall median so that no missing values survive this step.
test['Lot Frontage'] = test['Lot Frontage'].fillna(overall_frontage_median)

# Display how many values are still missing.
test['Lot Frontage'].isnull().sum()

0

In [20]:
# Missing masonry veneer area is set to 0. Original rationale: the affected
# rows are all 'None' (presumably their 'Mas Vnr Type' -- TODO confirm).
mas_vnr_area_filled = test['Mas Vnr Area'].fillna(value=0)
test['Mas Vnr Area'] = mas_vnr_area_filled

In [21]:
# Missing garage construction years are replaced with 0.
garage_year_filled = test['Garage Yr Blt'].fillna(value=0)
test['Garage Yr Blt'] = garage_year_filled

In [22]:
# Total number of missing cells left anywhere in the test frame.
test.isna().sum().sum()

0

## Feature Engineering

In [23]:
# Add three derived columns to the test frame.

# Total_Area: basement square footage plus above-ground living area.
test['Total_Area'] = test['Total Bsmt SF'].add(test['Gr Liv Area'])

# Total_Bathroom: full baths count as 1, half baths count as 0.5.
test['Total_Bathroom'] = (
    test['Bsmt Full Bath']
    + test['Bsmt Half Bath'] * 0.5
    + test['Full Bath']
    + test['Half Bath'] * 0.5
)

# House_age: years between construction and sale.
test['House_age'] = test['Yr Sold'].sub(test['Year Built'])

In [24]:
# Columns selected as model inputs, in the order they appear in the matrix.
features = [
    'Total_Area', 'Overall Qual', 'Exter Qual', 'Kitchen Qual',
    'Garage Area', 'Total_Bathroom', 'Year Remod/Add', 'Mas Vnr Area',
    'Bsmt Qual', 'Fireplaces', 'Heating QC',
    'Neighborhood', 'Foundation', 'Garage Type', 'Mas Vnr Type',
    'Exterior 1st', 'Exterior 2nd', 'Sale Type', 'MS Zoning',
]

# Restrict the test frame to the selected columns.
X_t = test.loc[:, features]


In [25]:
# One-hot encode the nominal predictors; drop_first=True omits the first level
# of each encoded column.
# NOTE(review): the dummy columns depend on which categories occur in the test
# data -- confirm they line up with the columns of the training matrix.
dummy_columns = [
    'Neighborhood', 'Foundation', 'Garage Type', 'Mas Vnr Type',
    'Exterior 1st', 'Exterior 2nd', 'Sale Type', 'MS Zoning',
]
X_t_with_dummies = pd.get_dummies(data=X_t, columns=dummy_columns, drop_first=True)

In [26]:
# Display (rows, columns) of the one-hot encoded test matrix.
X_t_with_dummies.shape

(879, 92)

In [6]:
# NOTE(review): this re-reads the raw CSV into `test`. When the cells run top
# to bottom it replaces the frame produced by the preprocessing cells before
# it -- confirm whether this cell is still needed.
test = pd.read_csv('datasets/test.csv')

In [None]:
# `X_with_dummies` is not defined anywhere in the visible notebook, so the
# original expression raises NameError on a fresh kernel. Inspect the encoded
# test matrix that this notebook actually builds.
X_t_with_dummies.shape