# House prices



In [343]:
# import libraries
import matplotlib.pyplot as plt                      # data visualization
import pandas as pd                                  # data science essentials
from sklearn.model_selection import train_test_split # train-test split
import sklearn.linear_model as linear_model          # linear modeling in scikit-learn
import sklearn.ensemble as ensemble                  # tree regressor in scikit-learn
import numpy as np                                   # numpy library for math functions and arrays
from sklearn.metrics import mean_squared_error

In [344]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [345]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [346]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

## Data cleaning and exploration

In [347]:
# columns_with_null = train.isna().sum().index
# drop null values to start with, might need to manage null values to avoud losing data
train.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

## Cleaning numeric features

In [348]:
numeric_features = train.select_dtypes(include=np.number).columns

In [349]:
#filling missing values for numeric values
for column in numeric_features:
    if train[column].isna().sum() > 0:
        column_median = train[column].median()
        train[column] = train[column].fillna(column_median)[0]

In [350]:
train[numeric_features].isna().sum()

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

## Cleaning categorical features

In [351]:
categorical_features = set(train.columns) - set(numeric_features)
categorical_features

{'Alley',
 'BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Foundation',
 'Functional',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSZoning',
 'MasVnrType',
 'MiscFeature',
 'Neighborhood',
 'PavedDrive',
 'PoolQC',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities'}

In [352]:
# clean categorical features using mode
electrical_mode = train['Electrical'].mode()
train['Electrical'] = train['Electrical'] = train['Electrical'].fillna(electrical_mode)[0]
fence_mode = train['Fence'].mode()
train['Fence'] = train['Fence'].fillna(fence_mode)[0]
masvnrtype_mode = train['MasVnrType'].mode()
train['MasVnrType'] = train['MasVnrType'].fillna(masvnrtype_mode)[0]

In [353]:
# clean other categorical features using "NA" where it's an option
train["FireplaceQu"].fillna(value = "NA", inplace = True)
train["GarageCond"].fillna(value = "NA", inplace = True)
train["GarageYrBlt"].fillna(value = "NA", inplace = True)
train["GarageType"].fillna(value = "NA", inplace = True)
train["GarageFinish"].fillna(value = "NA", inplace = True)
train["PoolQC"].fillna(value = "NA", inplace = True)
train["BsmtQual"].fillna(value = "NA", inplace = True)
train["GarageQual"].fillna(value = "NA", inplace = True)
train["Alley"].fillna(value = "NA", inplace = True)
train["BsmtFinType1"].fillna(value = "NA", inplace = True)
train["BsmtFinType2"].fillna(value = "NA", inplace = True)
train["BsmtCond"].fillna(value = "NA", inplace = True)
train["MiscFeature"].fillna(value = "NA", inplace = True)

train.loc[(train["BsmtExposure"].isna()) & (train["TotalBsmtSF"]==936), "BsmtExposure"] = train["BsmtExposure"].mode()[0]
train["BsmtExposure"].fillna(value = "NA", inplace = True)

train_filtered = train.loc[train["BsmtFinType2"] != 'Unf']
train.loc[(train["BsmtFinType2"].isna()) & (train["BsmtFinSF2"]!=0), "BsmtFinType2"] = train_filtered["BsmtFinType2"].mode()[0]
train["BsmtFinType2"].fillna(value = "NA", inplace = True)

In [354]:
train[list(categorical_features)].isna().sum()

Electrical       0
ExterCond        0
Fence            0
HeatingQC        0
LotConfig        0
MasVnrType       0
Alley            0
LotShape         0
Condition2       0
Foundation       0
MSZoning         0
Neighborhood     0
Heating          0
Exterior1st      0
SaleCondition    0
BsmtFinType1     0
KitchenQual      0
Utilities        0
Street           0
BsmtExposure     0
LandSlope        0
HouseStyle       0
PavedDrive       0
GarageType       0
RoofStyle        0
RoofMatl         0
Condition1       0
SaleType         0
Functional       0
FireplaceQu      0
MiscFeature      0
CentralAir       0
BsmtQual         0
ExterQual        0
GarageQual       0
Exterior2nd      0
BsmtCond         0
BldgType         0
LandContour      0
GarageFinish     0
BsmtFinType2     0
PoolQC           0
GarageCond       0
dtype: int64

In [355]:
# one hot encoding for categorical features
for cat_col in categorical_features:
    train = pd.concat([train, pd.get_dummies(train[cat_col])], axis = 1)
    train = train.drop(cat_col, axis = 1)

In [356]:
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Ex,Fa,Gd,NA,Ex.1,Fa.1,Gd.1,NA.1,Po,TA
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,0,1
1,2,20,65.0,9600,6,8,1976,1976,196.0,978,...,0,0,0,1,0,0,0,0,0,1
2,3,60,65.0,11250,7,5,2001,2002,196.0,486,...,0,0,0,1,0,0,0,0,0,1
3,4,70,65.0,9550,7,5,1915,1970,196.0,216,...,0,0,0,1,0,0,0,0,0,1
4,5,60,65.0,14260,8,5,2000,2000,196.0,655,...,0,0,0,1,0,0,0,0,0,1


In [357]:
x_labels = [x for x in list(train.columns) if x != 'SalePrice']
y_labels = ['SalePrice']

In [358]:
# split the dataset into train and test using the seed 219 to make sure the results are replicable
x_train, x_test, y_train, y_test = train_test_split(
            train[x_labels], # x-variables (can change this)
            train[y_labels], # y-variable  (can change this)
            test_size    = 0.25,
            random_state = 219)

#### Linear Regression

In [361]:
# train linear regression model using log Y
reg = linear_model.LinearRegression().fit(x_train, np.log(y_train))

In [362]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), reg.predict(x_test))

561323291.222871

#### Lasso

In [363]:
# train lasso using log Y
lasso = linear_model.Lasso(alpha=0.04).fit(x_train, np.log(y_train))

In [364]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), lasso.predict(x_test))

0.021896143141649296

#### Random Forest Regressor

In [365]:
# train random forest regressor using log Y
rf = ensemble.RandomForestRegressor(n_estimators=100).fit(x_train, np.log(y_train))

  rf = ensemble.RandomForestRegressor(n_estimators=100).fit(x_train, np.log(y_train))


In [366]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), rf.predict(x_test))

0.019130468561877896

#### Gradient boosting regressor

In [371]:
gbr = ensemble.GradientBoostingRegressor(n_estimators = 70, criterion='squared_error', learning_rate = 0.05, random_state = 42).fit(x_train, np.log(y_train))

  y = column_or_1d(y, warn=True)


In [372]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), gbr.predict(x_test))

0.021269341263232495