# House prices



In [205]:
# import libraries
import matplotlib.pyplot as plt                      # data visualization
import pandas as pd                                  # data science essentials
from sklearn.model_selection import train_test_split # train-test split
import sklearn.linear_model as linear_model          # linear modeling in scikit-learn
import sklearn.ensemble as ensemble                  # tree regressor in scikit-learn
import numpy as np                                   # numpy library for math functions and arrays
from sklearn.metrics import accuracy_score, make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline             import Pipeline
from sklearn.tree import DecisionTreeRegressor
import time
from sklearn.model_selection import GridSearchCV
import datetime

In [206]:
train = pd.read_csv('__datasets/train.csv')
train = train.set_index('Id')
test = pd.read_csv('__datasets/test.csv')
test = test.set_index('Id')
train['is_train'] = True
test['is_train'] = False

all_data = pd.concat([train,test], axis=0)
# all_data =all_data.set_index('Id')

In [207]:
all_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,is_train
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,,,,0,2,2008,WD,Normal,208500.0,True
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,,,,0,5,2007,WD,Normal,181500.0,True
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,,,,0,9,2008,WD,Normal,223500.0,True
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,,,,0,2,2006,WD,Abnorml,140000.0,True
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,,,,0,12,2008,WD,Normal,250000.0,True


In [208]:
all_data.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## Data cleaning and exploration

In [209]:
# columns_with_null = train.isna().sum().index
# drop null values to start with, might need to manage null values to avoud losing data
all_data.isna().sum()

MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
Street              0
                 ... 
YrSold              0
SaleType            1
SaleCondition       0
SalePrice        1459
is_train            0
Length: 81, dtype: int64

## Cleaning numeric features

In [210]:
numeric_features = all_data.select_dtypes(include=np.number).columns
# numeric_features_test = test.select_dtypes(include=np.number).columns

In [211]:
#filling missing values for numeric values
for column in numeric_features:
    if all_data[column].isna().sum() > 0:
        column_median = all_data[column].median()
        all_data[column] = all_data[column].fillna(column_median)

In [212]:
all_data[numeric_features].isna().sum()

MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

## Cleaning categorical features

In [213]:
categorical_features = set(all_data.columns) - set(numeric_features) - set(['is_train'])
categorical_features

{'Alley',
 'BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Foundation',
 'Functional',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSZoning',
 'MasVnrType',
 'MiscFeature',
 'Neighborhood',
 'PavedDrive',
 'PoolQC',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities'}

In [214]:
def fill_with_mode(df, columns):
    for col in columns:
        mode = df[col].mode()[0]
        df[col] = df[col].fillna(mode)
        
    return df
        

In [215]:

# clean categorical features using mode for train data
all_data = fill_with_mode(all_data, ['Electrical', 'Fence', 'MasVnrType', 'Functional'])

# STUFF I ADDED LATER
all_data = fill_with_mode(all_data, ['MSZoning', 'KitchenQual', 'Utilities', 'Exterior1st','Exterior2nd','SaleType'])


# electrical_mode = all_data['Electrical'].mode()[0]
# all_data['Electrical'] = all_data['Electrical'].fillna(electrical_mode)
# fence_mode = all_data['Fence'].mode()[0]
# all_data['Fence'] = all_data['Fence'].fillna(fence_mode)
# masvnrtype_mode = all_data['MasVnrType'].mode()[0]
# all_data['MasVnrType'] = all_data['MasVnrType'].fillna(masvnrtype_mode)
# functional_mode = all_data.Functional.mode()[0]

# all_data['Functional'] = all_data.Functional.fillna(functional_mode)





# cleaning categorical features using mode for test data
# electrical_mode = test['Electrical'].mode()
# test['Electrical'] = test['Electrical'] = test['Electrical'].fillna(electrical_mode)[0]
# fence_mode = test['Fence'].mode()
# test['Fence'] = test['Fence'].fillna(fence_mode)[0]
# masvnrtype_mode = test['MasVnrType'].mode()
# test['MasVnrType'] = test['MasVnrType'].fillna(masvnrtype_mode)[0]

In [216]:
# clean other categorical features using "NA" where it's an option for train set
all_data["FireplaceQu"].fillna(value = "NA", inplace = True)
all_data["GarageCond"].fillna(value = "NA", inplace = True)
all_data["GarageYrBlt"].fillna(value = "NA", inplace = True)
all_data["GarageType"].fillna(value = "NA", inplace = True)
all_data["GarageFinish"].fillna(value = "NA", inplace = True)
all_data["PoolQC"].fillna(value = "NA", inplace = True)
all_data["BsmtQual"].fillna(value = "NA", inplace = True)
all_data["GarageQual"].fillna(value = "NA", inplace = True)
all_data["Alley"].fillna(value = "NA", inplace = True)
all_data["BsmtFinType1"].fillna(value = "NA", inplace = True)
all_data["BsmtFinType2"].fillna(value = "NA", inplace = True)
all_data["BsmtCond"].fillna(value = "NA", inplace = True)
all_data["MiscFeature"].fillna(value = "NA", inplace = True)

# clean other categorical features using "NA" where it's an option for test set
# test["FireplaceQu"].fillna(value = "NA", inplace = True)
# test["GarageCond"].fillna(value = "NA", inplace = True)
# test["GarageYrBlt"].fillna(value = "NA", inplace = True)
# test["GarageType"].fillna(value = "NA", inplace = True)
# test["GarageFinish"].fillna(value = "NA", inplace = True)
# test["PoolQC"].fillna(value = "NA", inplace = True)
# test["BsmtQual"].fillna(value = "NA", inplace = True)
# test["GarageQual"].fillna(value = "NA", inplace = True)
# test["Alley"].fillna(value = "NA", inplace = True)
# test["BsmtFinType1"].fillna(value = "NA", inplace = True)
# test["BsmtFinType2"].fillna(value = "NA", inplace = True)
# test["BsmtCond"].fillna(value = "NA", inplace = True)
# test["MiscFeature"].fillna(value = "NA", inplace = True)

# cleaning for train set
all_data.loc[(all_data["BsmtExposure"].isna()) & (all_data["TotalBsmtSF"]==936), "BsmtExposure"] = all_data["BsmtExposure"].mode()[0]
all_data["BsmtExposure"].fillna(value = "NA", inplace = True)

all_data_filtered = all_data.loc[all_data["BsmtFinType2"] != 'Unf']
all_data.loc[(all_data["BsmtFinType2"].isna()) & (all_data["BsmtFinSF2"]!=0), "BsmtFinType2"] = all_data_filtered["BsmtFinType2"].mode()[0]
all_data["BsmtFinType2"].fillna(value = "NA", inplace = True)

# cleaning for test set
# test.loc[(test["BsmtExposure"].isna()) & (test["TotalBsmtSF"]==936), "BsmtExposure"] = test["BsmtExposure"].mode()[0]
# test["BsmtExposure"].fillna(value = "NA", inplace = True)

# test_filtered = train.loc[train["BsmtFinType2"] != 'Unf']
# test.loc[(test["BsmtFinType2"].isna()) & (test["BsmtFinSF2"]!=0), "BsmtFinType2"] = test_filtered["BsmtFinType2"].mode()[0]
# test["BsmtFinType2"].fillna(value = "NA", inplace = True)

In [217]:
all_data[list(categorical_features)].isna().sum()

Neighborhood     0
GarageCond       0
Condition2       0
SaleType         0
BsmtFinType1     0
KitchenQual      0
RoofMatl         0
HouseStyle       0
Electrical       0
ExterQual        0
BsmtFinType2     0
Condition1       0
FireplaceQu      0
Alley            0
MiscFeature      0
Street           0
GarageType       0
GarageFinish     0
SaleCondition    0
BldgType         0
Utilities        0
Exterior2nd      0
LotShape         0
Foundation       0
MSZoning         0
PoolQC           0
Functional       0
ExterCond        0
BsmtCond         0
Heating          0
Fence            0
MasVnrType       0
GarageQual       0
BsmtExposure     0
Exterior1st      0
BsmtQual         0
RoofStyle        0
LandSlope        0
LotConfig        0
CentralAir       0
HeatingQC        0
PavedDrive       0
LandContour      0
dtype: int64

In [218]:
all_data['tot_SF'] = 1

for index, row in all_data.iterrows():
    val = row['GarageFinish']
    # print(val)
    if val != 'Unf':
        all_data.loc[index,'tot_SF'] = all_data.loc[index,'GrLivArea'] + all_data.loc[index,'TotalBsmtSF'] - all_data.loc[index,'BsmtUnfSF'] + all_data.loc[index,'GarageArea']
        # print(row['tot_SF'])
    else:
        all_data.loc[index,'tot_SF'] = all_data.loc[index,'GrLivArea'] + all_data.loc[index,'TotalBsmtSF'] - all_data.loc[index,'BsmtUnfSF']
        # print(row['tot_SF'])

for index, row in all_data.iterrows():        
    if row['BedroomAbvGr'] > 0:
        all_data.loc[index,'BathPerBedroom'] = (all_data.loc[index,'FullBath'] +  all_data.loc[index,'HalfBath']) / all_data.loc[index,'BedroomAbvGr']
        # if all_data.loc[index,'BathPerBedroom'] == None:
        #     print(all_data.loc[index,'FullBath'])
    else:
        all_data.loc[index,'BathPerBedroom'] = 0

all_data['log_tot_SF'] = np.log(all_data['tot_SF']+0.001)

# total porch area 
all_data['total_porch'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']

# has a feature 
all_data['has_garage'] = 0
all_data['has_masvnr'] = 0
all_data['has_pool']   = 0
all_data['has_porch']  = 0

for index, row in all_data.iterrows():
    # there is a garage area
    if all_data.loc[index,'GarageArea'] > 0:
        all_data.loc[index,'has_garage'] = 1
        # all_data.loc[index, 'has_garage'] = 1
    # there is a masvnr area
    if all_data.loc[index,'MasVnrArea'] > 0:
        all_data.loc[index,'has_masvnr'] = 1 
    # there is a pool area
    if all_data.loc[index,'PoolArea'] > 0:
        all_data.loc[index,'has_pool'] = 1
    # there is a pool area
    if all_data.loc[index,'total_porch'] > 0:
        all_data.loc[index,'has_porch'] = 1
    
        
all_data['TotBath'] = all_data['FullBath'] +  all_data['HalfBath'] +  all_data['BsmtFullBath'] + all_data['BsmtHalfBath']
all_data['FireplaceFreq'] =  all_data['Fireplaces'] / all_data['TotRmsAbvGrd']
all_data['OutdoorAmenitiesArea'] = all_data['WoodDeckSF'] + all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch'] + all_data['PoolArea'] 
all_data['log_OutdoorAmenitiesArea'] = np.log(all_data['OutdoorAmenitiesArea']+0.001)
all_data['log_FireplaceFreq'] = np.log(all_data['FireplaceFreq']+0.001)


# all_data['BathPerBedroom'] = (all_data['FullBath'] +  all_data['HalfBath']) / all_data['BedroomAbvGr']


all_data['house_age'] = datetime.datetime.now().year - all_data['YearBuilt']
all_data['remodel_age'] = datetime.datetime.now().year - all_data['YearRemodAdd']

In [219]:
# ['tot_SF', 'total_porch', 'FireplaceFreq', 'OutdoorAmenitiesArea']
all_data['BathPerBedroom'].unique()

array([1.        , 0.66666667, 0.33333333, 0.75      , 2.        ,
       0.5       , 0.        , 1.33333333, 1.5       , 0.6       ,
       0.25      , 0.2       , 0.4       , 0.8       , 1.66666667])

In [220]:
####################### SKEWNESS ############################
# set a threshold for skewness
skew_threshold = 0.5

# calculate the skewness of each column
skewness = all_data.skew()

# create a list of column names where the absolute value of skewness is greater than the threshold
skewed_columns = list(skewness[abs(skewness) > skew_threshold].index)

####################### LOG TRANSFORMATIONS ############################
# list of skewed columns
# removed year built, year remodeled
skewed_columns = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1', 
                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                  'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 
                  'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 
                  'ScreenPorch', 'PoolArea', 'MiscVal']
# log skewed clolumns and create a new column for them 
for col in skewed_columns:
    if col in all_data.columns:
        all_data['log_' + col] = np.log(all_data[col] + 0.001)

########################### DATA DROP ############################## 
# dropping skewed columns, but keep the logged ones
all_data = all_data.drop(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1', 
                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
                  'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 
                  'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 
                  'ScreenPorch', 'PoolArea', 'MiscVal', 
                  'tot_SF', 'total_porch', 'FireplaceFreq', 'OutdoorAmenitiesArea'],
                axis = 1)

  skewness = all_data.skew()


In [221]:
# ####################### SKEWNESS for test set ############################
# # set a threshold for skewness
# skew_threshold_test = 0.5

# # calculate the skewness of each column
# skewness_test = test[numeric_features_test].skew()

# # create a list of column names where the absolute value of skewness is greater than the threshold
# skewed_columns_test = list(skewness_test[abs(skewness_test) > skew_threshold_test].index)

# # log skewed clolumns and create a new column for them 
# for col in skewed_columns_test:
#     if col in numeric_features:
#         test['log_' + col] = np.log(test[col] + 0.001)   

# test.drop(skewed_columns_test, axis =1)

In [222]:
# all_data['MSSubClass'] = all_data['MSSubClass'].replace([150], [160])
# all_data['HouseStyle'] = all_data['HouseStyle'].replace(["2.5Fin"], ["2Story"])
# all_data['Exterior1st'] = all_data['Exterior1st'].replace(["ImStucc","Stone"], ["CemntBd","CemntBd"])
# all_data['Exterior2nd'] = all_data['Exterior2nd'].replace(["Other"], ["ImStucc"])
# all_data['FullBath'] = all_data['FullBath'].replace(["4"], ["3"])
# all_data['BedroomAbvGr'] = all_data['BedroomAbvGr'].replace(["8"], ["6"])
# all_data['TotRmsAbvGrd'] = all_data['TotRmsAbvGrd'].replace(["14",'2'], ["12",'3'])
# all_data['Fireplaces'] = all_data['Fireplaces'].replace(["4"], ["3"])
# all_data['GarageCars'] = all_data['GarageCars'].replace(["5.0"], ["4.0"])
# all_data['GarageQual'] = all_data['GarageQual'].replace(["Ex"], ["Gd"])
# all_data['MiscFeature'] = all_data['MiscFeature'].replace(["TenC"], ["Gar2"])

In [223]:
# all_data["BathPerBedroom"].fillna(value = "NA", inplace = True)

# One hot encoding

In [224]:
list(all_data.columns)

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'FullBath',
 'BedroomAbvGr',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition',
 'SalePrice',
 'is_train',
 'BathPerBedroom',
 'log_tot_SF',
 'has_garage',
 'has_masvnr',
 'has_pool',
 'has_porch',
 'TotBath',
 'log_OutdoorAmenitiesArea',
 'log_FireplaceFreq',
 'house_age',
 'remodel_age',
 'log_MSSubClass',
 'l

In [225]:

# one hot encoding for categorical features for train set
for cat_col in categorical_features:
    print(cat_col)
    one_hot_encoded_features = pd.get_dummies(all_data[cat_col])
    one_hot_encoded_features.columns = [cat_col + col for col in one_hot_encoded_features.columns]
    all_data = pd.concat([all_data, one_hot_encoded_features], axis = 1)
    all_data = all_data.drop(cat_col, axis = 1)
    

Neighborhood
GarageCond
Condition2
SaleType
BsmtFinType1
KitchenQual
RoofMatl
HouseStyle
Electrical
ExterQual
BsmtFinType2
Condition1
FireplaceQu
Alley
MiscFeature
Street
GarageType
GarageFinish
SaleCondition
BldgType
Utilities
Exterior2nd
LotShape
Foundation
MSZoning
PoolQC
Functional
ExterCond
BsmtCond
Heating
Fence
MasVnrType
GarageQual
BsmtExposure
Exterior1st
BsmtQual
RoofStyle
LandSlope
LotConfig
CentralAir
HeatingQC
PavedDrive
LandContour


In [226]:
all_data.isna().any(axis=0).any()


False

In [227]:
train = all_data[all_data.is_train==True]
test = all_data[all_data.is_train==False]

train = train.drop('is_train', axis = 1)
test = test.drop('is_train', axis = 1)





In [228]:
train.head()

Unnamed: 0_level_0,OverallQual,YearBuilt,YearRemodAdd,FullBath,BedroomAbvGr,GarageYrBlt,GarageCars,GarageArea,MoSold,YrSold,...,HeatingQCGd,HeatingQCPo,HeatingQCTA,PavedDriveN,PavedDriveP,PavedDriveY,LandContourBnk,LandContourHLS,LandContourLow,LandContourLvl
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,2003,2003,2,3,2003.0,2.0,548.0,2,2008,...,0,0,0,0,0,1,0,0,0,1
2,6,1976,1976,2,3,1976.0,2.0,460.0,5,2007,...,0,0,0,0,0,1,0,0,0,1
3,7,2001,2002,2,3,2001.0,2.0,608.0,9,2008,...,0,0,0,0,0,1,0,0,0,1
4,7,1915,1970,1,3,1998.0,3.0,642.0,2,2006,...,1,0,0,0,0,1,0,0,0,1
5,8,2000,2000,2,4,2000.0,3.0,836.0,12,2008,...,0,0,0,0,0,1,0,0,0,1


In [229]:
test.head()

Unnamed: 0_level_0,OverallQual,YearBuilt,YearRemodAdd,FullBath,BedroomAbvGr,GarageYrBlt,GarageCars,GarageArea,MoSold,YrSold,...,HeatingQCGd,HeatingQCPo,HeatingQCTA,PavedDriveN,PavedDriveP,PavedDriveY,LandContourBnk,LandContourHLS,LandContourLow,LandContourLvl
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,5,1961,1961,1,2,1961.0,1.0,730.0,6,2010,...,0,0,1,0,0,1,0,0,0,1
1462,6,1958,1958,1,3,1958.0,1.0,312.0,6,2010,...,0,0,1,0,0,1,0,0,0,1
1463,5,1997,1998,2,3,1997.0,2.0,482.0,3,2010,...,1,0,0,0,0,1,0,0,0,1
1464,6,1998,1998,2,3,1998.0,2.0,470.0,6,2010,...,0,0,0,0,0,1,0,0,0,1
1465,8,1992,1992,2,2,1992.0,2.0,506.0,1,2010,...,0,0,0,0,0,1,0,1,0,0


In [230]:
x_labels = [x for x in list(train.columns) if x != 'SalePrice' and x!= 'Id']
y_labels = ['SalePrice']

In [231]:
# split the dataset into train and test using the seed 219 to make sure the results are replicable
x_train, x_test, y_train, y_test = train_test_split(
            train[x_labels], # x-variables (can change this)
            train[y_labels], # y-variable  (can change this)
            test_size    = 0.25,
            random_state = 219)

In [232]:
# for column in all_data.columns:
#     if all_data[column].isna().sum() > 0:
#         print(column)
#         print(all_data[column])

# all_data["BathPerBedroom"].fillna(value = "NA", inplace = True)

#### Linear Regression

In [235]:
# train linear regression model using log Y
reg = linear_model.LinearRegression().fit(x_train, np.log(y_train))

In [236]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), reg.predict(x_test))

1522824502985.7283

#### Lasso

In [237]:
# train lasso using log Y
lasso = linear_model.Lasso(alpha=0.04).fit(x_train, np.log(y_train))

In [238]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), lasso.predict(x_test))

0.030228827096328063

#### Ridge

In [239]:
ridge = linear_model.Ridge(alpha=0.001).fit(x_train, np.log(y_train))
mean_squared_error(np.log(y_test), ridge.predict(x_test))

0.026840916780526095

#### Random Forest Regressor

In [240]:
# train random forest regressor using log Y
rf = ensemble.RandomForestRegressor(n_estimators=148, random_state = 42, max_depth = 400).fit(x_train, np.log(np.ravel(y_train)))

In [241]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), rf.predict(x_test))

0.017251348920486364

#### Gradient boosting regressor

In [242]:
gbr = ensemble.GradientBoostingRegressor(n_estimators = 70, criterion='squared_error', learning_rate = 0.05, random_state = 42).fit(x_train, np.log(np.ravel(y_train)))

In [243]:
# calculate the score of the linear regression model using the test set
mean_squared_error(np.log(y_test), gbr.predict(x_test))

0.018666737755624663

### Hyperparameter tunning

In [246]:
from tpot                         import TPOTRegressor
# offspring_size = 20,
def tpot_rmse(x_train, y_train, x_test, y_test, generations= 15,  population_size = 40):
    # Define custom scoring function as RMSE
    rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    
    # Define TPOTRegressor with custom scoring function
    tpot = TPOTRegressor(generations     = generations, 
                         population_size = population_size,
                         #offspring_size  = offspring_size,
                         scoring         = "neg_root_mean_squared_error",
                         verbosity       = 2,
                         random_state    = 42,
                         n_jobs          = -1)
    
    # Fit the TPOT Regressor to the training data
    tpot.fit(x_train, y_train)
    
    # Calculate predictions on test set
    y_pred = tpot.predict(x_test)
    
    # Calculate RMSE on test set
    rmse = mean_squared_error(y_test, y_pred)
    
    return tpot, rmse

tpot, rmse = tpot_rmse(x_train, np.log(np.ravel(y_train)), x_test, np.log(np.ravel(y_test)), generations=10, population_size=75)

Optimization Progress:   0%|          | 0/825 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.13169253606273906

Generation 2 - Current best internal CV score: -0.1316457681655923

Generation 3 - Current best internal CV score: -0.13088761399045154

Generation 4 - Current best internal CV score: -0.13088761399045154

Generation 5 - Current best internal CV score: -0.12828717873322373

Generation 6 - Current best internal CV score: -0.12755314102621765

Generation 7 - Current best internal CV score: -0.12755314102621765

Generation 8 - Current best internal CV score: -0.12447715417761518

Generation 9 - Current best internal CV score: -0.12447715417761518

Generation 10 - Current best internal CV score: -0.12447715417761518

Best pipeline: RandomForestRegressor(LassoLarsCV(input_matrix, normalize=False), bootstrap=False, max_features=0.35000000000000003, min_samples_leaf=2, min_samples_split=9, n_estimators=100)




In [249]:
np.sqrt(rmse)

0.10894657481923675

In [250]:
# # from sklearn.model_selection import GridSearchCV

# parameters = {'max_depth': range(50, 1051,100), 'n_estimators': [20, 50, 200, 400, 600, 800], 'min_samples_split': [2, 10, 20, 3,4,5], 'min_samples_leaf': [1, 2,3,4]}

# # rfc_ht = ensemble.RandomForestRegressor(random_state=42)
# clf = GridSearchCV(rfc_ht, parameters, scoring='neg_mean_squared_error', cv=5)
# clf.fit(x_train, np.log(np.ravel(y_train)))

# best_params = clf.best_params_
# print(f"Best parameters: {best_params}")

# best_rfc_ht = ensemble.RandomForestRegressor(random_state=42, **best_params)
# best_rfc_ht.fit(x_train, np.log(np.ravel(y_train)))

# mse = mean_squared_error(np.log(y_test), best_rfc_ht.predict(x_test))
# print(f"Mean squared error: {mse}")


## Saving Y_test for submission

In [251]:
test[x_labels]

Unnamed: 0_level_0,OverallQual,YearBuilt,YearRemodAdd,FullBath,BedroomAbvGr,GarageYrBlt,GarageCars,GarageArea,MoSold,YrSold,...,HeatingQCGd,HeatingQCPo,HeatingQCTA,PavedDriveN,PavedDriveP,PavedDriveY,LandContourBnk,LandContourHLS,LandContourLow,LandContourLvl
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,5,1961,1961,1,2,1961.0,1.0,730.0,6,2010,...,0,0,1,0,0,1,0,0,0,1
1462,6,1958,1958,1,3,1958.0,1.0,312.0,6,2010,...,0,0,1,0,0,1,0,0,0,1
1463,5,1997,1998,2,3,1997.0,2.0,482.0,3,2010,...,1,0,0,0,0,1,0,0,0,1
1464,6,1998,1998,2,3,1998.0,2.0,470.0,6,2010,...,0,0,0,0,0,1,0,0,0,1
1465,8,1992,1992,2,2,1992.0,2.0,506.0,1,2010,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,4,1970,1970,1,3,1979.0,0.0,0.0,6,2006,...,1,0,0,0,0,1,0,0,0,1
2916,4,1970,1970,1,3,1970.0,1.0,286.0,4,2006,...,0,0,1,0,0,1,0,0,0,1
2917,5,1960,1996,1,4,1960.0,2.0,576.0,9,2006,...,0,0,0,0,0,1,0,0,0,1
2918,5,1992,1992,1,3,1979.0,0.0,0.0,7,2006,...,0,0,1,0,0,1,0,0,0,1


In [252]:
y_prediction = tpot.predict(test[x_labels])
np.exp(y_prediction)



array([124363.34336931, 155310.38120913, 173669.55922707, ...,
       163387.19844411, 117494.06213411, 233335.84151178])

In [253]:
submission = pd.DataFrame()
submission['Id'] = test.index
submission['SalePrice'] = np.exp(y_prediction)

submission = submission.set_index('Id')

In [254]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,124363.343369
1462,155310.381209
1463,173669.559227
1464,191476.37238
1465,193498.697738


In [255]:
submission.to_csv('submission_team10_tpot.csv')