In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Prepare Data

In [2]:
from sklearn.model_selection import train_test_split

## Overview of data

In [3]:
dataset = pd.read_csv('data/train.csv').fillna(method='ffill')
dataset.shape

(1460, 81)

In [4]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
dataset.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.104795,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.492466,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,23.846996,9981.264932,1.382997,1.112799,30.202904,20.645407,180.795612,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,70.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,165.25,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


## Feature Engineering

In [6]:
cols = []

# get only for numeric and non-null columns
temp_dataset = dataset.select_dtypes(exclude=['object'])
for col in temp_dataset.columns:
    if not dataset[col].isnull().any():
        cols.append(col)

In [7]:
# column overrided
# cols = ['LotArea','OverallQual', 'GarageArea', 'YearBuilt', 'EnclosedPorch', 'Id', 'SalePrice']

In [8]:
X = dataset[cols].drop(['Id', 'SalePrice'], axis=1).values
y = dataset['SalePrice'].values

## Outlier Elimination

In [9]:
Q1 = dataset['SalePrice'].quantile(0.25)
Q3 = dataset['SalePrice'].quantile(0.75)
IQR = Q3 - Q1
print(IQR)

84025.0


In [10]:
less_than_IQR = dataset['SalePrice'] < (Q1 - 1.5 * IQR)
greater_than_IQR = dataset['SalePrice'] > (Q3 + 1.5 * IQR)

print(less_than_IQR[less_than_IQR])
print(greater_than_IQR[greater_than_IQR])

Series([], Name: SalePrice, dtype: bool)
11      True
53      True
58      True
112     True
151     True
        ... 
1268    True
1353    True
1373    True
1388    True
1437    True
Name: SalePrice, Length: 61, dtype: bool


In [11]:
outlier_eliminated_dataset = dataset.copy()

outlier_eliminated_dataset = outlier_eliminated_dataset[~outlier_eliminated_dataset['Id'].isin(less_than_IQR[less_than_IQR].index)]
outlier_eliminated_dataset = outlier_eliminated_dataset[~outlier_eliminated_dataset['Id'].isin(greater_than_IQR[greater_than_IQR].index)]

In [12]:
print(dataset.shape)
print(outlier_eliminated_dataset.shape)

(1460, 81)
(1399, 81)


In [13]:
X_outlier_eliminated = outlier_eliminated_dataset[cols].drop(['Id', 'SalePrice'], axis=1).values
y_outlier_eliminated = outlier_eliminated_dataset['SalePrice'].values

## Encode Categorical Features

In [14]:
def encode_column(encoded_dataset):
    encoded_dataset['MSZoning'] = encoded_dataset['MSZoning'].replace({'A': 0, 'C': 1, 'C (all)': 1, 'FV': 2, 'I': 3, 'RH': 4, 'RL': 5, 'RP': 6, 'RM': 7 }).astype(int)
    encoded_dataset['Street'] = encoded_dataset['Street'].replace({'Grvl': 0, 'Pave': 1 }).astype(int)
    encoded_dataset['Alley'] = encoded_dataset['Alley'].replace({'Grvl': 0, 'Pave': 1, 'NA': 2 }).astype(int)
    encoded_dataset['LotShape'] = encoded_dataset['LotShape'].replace({'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 3 }).astype(int)
    encoded_dataset['LandContour'] = encoded_dataset['LandContour'].replace({'Lvl': 0, 'Bnk': 1, 'HLS': 2, 'Low': 3 }).astype(int)
    encoded_dataset['Utilities'] = encoded_dataset['Utilities'].replace({'AllPub': 0, 'NoSewr': 1, 'NoSeWa': 2, 'ELO': 3 }).astype(int)
    encoded_dataset['LotConfig'] = encoded_dataset['LotConfig'].replace({'Inside': 0, 'Corner': 1, 'CulDSac': 2, 'FR2': 3, 'FR3': 4 }).astype(int)
    encoded_dataset['LandSlope'] = encoded_dataset['LandSlope'].replace({'Gtl': 0, 'Mod': 1, 'Sev': 2 }).astype(int)
    encoded_dataset['Neighborhood'] = encoded_dataset['Neighborhood'].replace({'Blmngtn': 0, 'Blueste': 1, 'BrDale': 2, 'BrkSide': 3, 'ClearCr': 4, 'CollgCr': 5, 'Crawfor': 6, 'Edwards': 7, 'Gilbert': 8, 'IDOTRR': 9, 'MeadowV': 10, 'Mitchel': 11, 'Names': 12, 'NAmes': 12, 'NoRidge': 13, 'NPkVill': 14, 'NridgHt': 15, 'NWAmes': 16, 'OldTown': 17, 'SWISU': 18, 'Sawyer': 19, 'SawyerW': 20, 'Somerst': 21, 'StoneBr': 22, 'Timber': 23, 'Veenker': 24 }).astype(int)
    encoded_dataset['Condition1'] = encoded_dataset['Condition1'].replace({'Artery': 0, 'Feedr': 1, 'Norm': 2, 'RRNn': 3, 'RRAn': 4, 'PosN': 5, 'PosA': 6, 'RRNe': 7, 'RRAe': 8 }).astype(int)
    encoded_dataset['Condition2'] = encoded_dataset['Condition2'].replace({'Artery': 0, 'Feedr': 1, 'Norm': 2, 'RRNn': 3, 'RRAn': 4, 'PosN': 5, 'PosA': 6, 'RRNe': 7, 'RRAe': 8 }).astype(int)
    encoded_dataset['BldgType'] = encoded_dataset['BldgType'].replace({'1Fam': 0, '2FmCon': 1, '2fmCon': 1, 'Duplx': 2, 'Duplex': 2, 'TwnhsE': 3, 'Twnhs': 3, 'TwnhsI': 4 }).astype(int)
    encoded_dataset['HouseStyle'] = encoded_dataset['HouseStyle'].replace({'1Story': 0, '1.5Fin': 1, '1.5Unf': 2, '2Story': 3, '2.5Fin': 4, '2.5Unf': 5, 'SFoyer': 6, 'SLvl': 7 }).astype(int)
    encoded_dataset['RoofStyle'] = encoded_dataset['RoofStyle'].replace({'Flat': 0, 'Gable': 1, 'Gambrel': 2, 'Hip': 3, 'Mansard': 4, 'Shed': 5 }).astype(int)
    encoded_dataset['RoofMatl'] = encoded_dataset['RoofMatl'].replace({'ClyTile': 0, 'CompShg': 1, 'Membran': 2, 'Metal': 3, 'Roll': 4, 'Tar&Grv': 5, 'WdShake': 6, 'WdShngl': 7 }).astype(int)
    encoded_dataset['Exterior1st'] = encoded_dataset['Exterior1st'].replace({'AsbShng': 0, 'AsphShn': 1, 'BrkComm': 2, 'Brk Cmn': 2, 'BrkFace': 3, 'CBlock': 4, 'CemntBd': 5, 'CmentBd': 5, 'HdBoard': 6, 'ImStucc': 7, 'MetalSd': 8, 'Other': 9, 'Plywood': 10, 'PreCase': 11, 'Stone': 12, 'Stucco': 13, 'VinylSd': 14, 'Wd Sdng': 15, 'WdShing': 16, 'Wd Shng': 17 }).astype(int)
    encoded_dataset['Exterior2nd'] = encoded_dataset['Exterior2nd'].replace({'AsbShng': 0, 'AsphShn': 1, 'BrkComm': 2, 'Brk Cmn': 2, 'BrkFace': 3, 'CBlock': 4, 'CemntBd': 5, 'CmentBd': 5, 'HdBoard': 6, 'ImStucc': 7, 'MetalSd': 8, 'Other': 9, 'Plywood': 10, 'PreCase': 11, 'Stone': 12, 'Stucco': 13, 'VinylSd': 14, 'Wd Sdng': 15, 'WdShing': 16, 'Wd Shng': 17 }).astype(int)
    encoded_dataset['MasVnrType'] = encoded_dataset['MasVnrType'].replace({'BrkCmn': 0, 'BrkFace': 1, 'CBlock': 2, 'None': 3, 'Stone': 4 }).astype(int)
    encoded_dataset['ExterQual'] = encoded_dataset['ExterQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['ExterCond'] = encoded_dataset['ExterCond'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['Foundation'] = encoded_dataset['Foundation'].replace({'BrkTil': 0, 'CBlock': 1, 'PConc': 2, 'Slab': 3, 'Stone': 4, 'Wood': 5 }).astype(int)
    encoded_dataset['BsmtQual'] = encoded_dataset['BsmtQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['BsmtCond'] = encoded_dataset['BsmtCond'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['BsmtExposure'] = encoded_dataset['BsmtExposure'].replace({'Gd': 0, 'Av': 1, 'Mn': 2, 'No': 3, 'NA': 5 }).astype(int)
    encoded_dataset['BsmtFinType1'] = encoded_dataset['BsmtFinType1'].replace({'GLQ': 0, 'ALQ': 1, 'BLQ': 2, 'Rec': 3, 'LwQ': 4, 'Unf': 5, 'NA': 6 }).astype(int)
    encoded_dataset['BsmtFinType2'] = encoded_dataset['BsmtFinType2'].replace({'GLQ': 0, 'ALQ': 1, 'BLQ': 2, 'Rec': 3, 'LwQ': 4, 'Unf': 5, 'NA': 6 }).astype(int)
    encoded_dataset['Heating'] = encoded_dataset['Heating'].replace({'Floor': 0, 'GasA': 1, 'GasW': 2, 'Grav': 3, 'OthW': 4, 'Wall': 5 }).astype(int)
    encoded_dataset['HeatingQC'] = encoded_dataset['HeatingQC'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['CentralAir'] = encoded_dataset['CentralAir'].replace({'N': 0, 'Y': 1 }).astype(int)
    encoded_dataset['Electrical'] = encoded_dataset['Electrical'].replace({'SBrkr': 0, 'FuseA': 1, 'FuseF': 2, 'FuseP': 3, 'Mix': 4 }).astype(int)
    encoded_dataset['KitchenQual'] = encoded_dataset['KitchenQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4 }).astype(int)
    encoded_dataset['Functional'] = encoded_dataset['Functional'].replace({'Typ': 0, 'Min1': 1, 'Min2': 2, 'Mod': 3, 'Maj1': 4, 'Maj2': 5, 'Sev': 6, 'Sal': 7 }).astype(int) 
    encoded_dataset['FireplaceQu'] = encoded_dataset['FireplaceQu'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['GarageType'] = encoded_dataset['GarageType'].replace({'2Types': 0, 'Attchd': 1, 'Basment': 2, 'BuiltIn': 3, 'CarPort': 4, 'Detchd': 5, 'NA': 6 }).astype(int)
    encoded_dataset['GarageFinish'] = encoded_dataset['GarageFinish'].replace({'Fin': 0, 'RFn': 1, 'Unf': 2, 'NA': 3 }).astype(int)
    encoded_dataset['GarageQual'] = encoded_dataset['GarageQual'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['GarageCond'] = encoded_dataset['GarageCond'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['PavedDrive'] = encoded_dataset['PavedDrive'].replace({'Y': 0, 'P': 1, 'N': 2 }).astype(int)
    encoded_dataset['PoolQC'] = encoded_dataset['PoolQC'].replace({'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'Po': 4, 'NA': 5 }).astype(int)
    encoded_dataset['Fence'] = encoded_dataset['Fence'].replace({'GdPrv': 0, 'MnPrv': 1, 'GdWo': 2, 'MnWw': 3, 'NA': 4 }).astype(int)
    encoded_dataset['MiscFeature'] = encoded_dataset['MiscFeature'].replace({'Elev': 0, 'Gar2': 1, 'Othr': 2, 'Shed': 3, 'TenC': 4, 'NA': 5 }).astype(int)
    encoded_dataset['SaleType'] = encoded_dataset['SaleType'].replace({'WD': 0, 'CWD': 1, 'VWD': 2, 'New': 3, 'COD': 4, 'Con': 5, 'ConLw': 6, 'ConLI': 7, 'ConLD': 8, 'Oth': 9 }).astype(int)
    encoded_dataset['SaleCondition'] = encoded_dataset['SaleCondition'].replace({'Normal': 0, 'Abnorml': 1, 'AdjLand': 2, 'Alloca': 3, 'Family': 4, 'Partial': 5 }).astype(int)
    return encoded_dataset

In [15]:
encoded_dataset = dataset.copy()

encoded_dataset = encode_column(encoded_dataset.fillna(0))
encoded_dataset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,5,65.0,8450,1,0,0,0,0,...,0,0,0,0,0,2,2008,0,0,208500
1,2,20,5,80.0,9600,1,0,0,0,0,...,0,0,0,0,0,5,2007,0,0,181500
2,3,60,5,68.0,11250,1,0,1,0,0,...,0,0,0,0,0,9,2008,0,0,223500
3,4,70,5,60.0,9550,1,0,1,0,0,...,0,0,0,0,0,2,2006,0,1,140000
4,5,60,5,84.0,14260,1,0,1,0,0,...,0,0,0,0,0,12,2008,0,0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,5,62.0,7917,1,1,0,0,0,...,0,1,2,4,0,8,2007,0,0,175000
1456,1457,20,5,85.0,13175,1,1,0,0,0,...,0,1,1,4,0,2,2010,0,0,210000
1457,1458,70,5,66.0,9042,1,1,0,0,0,...,0,1,0,3,2500,5,2010,0,0,266500
1458,1459,20,5,68.0,9717,1,1,0,0,0,...,0,1,0,3,0,4,2010,0,0,142125


In [16]:
X_encoded = encoded_dataset.drop(['Id', 'SalePrice'], axis=1).values
y_encoded = encoded_dataset['SalePrice'].values

## Eliminate Outlier of Encoded Dataset

In [17]:
def eliminate_outliers(dataset):
    for col in dataset.columns:
        Q1 = dataset[col].quantile(0.25)
        Q3 = dataset[col].quantile(0.75)
        IQR = Q3 - Q1
        less_than_IQR = dataset[col] < (Q1 - 1.5 * IQR)
        greater_than_IQR = dataset[col] > (Q3 + 1.5 * IQR)
        dataset = dataset[~dataset['Id'].isin(less_than_IQR[less_than_IQR].index)]
        dataset = dataset[~dataset['Id'].isin(greater_than_IQR[greater_than_IQR].index)]
    return dataset
        

In [18]:
perfect_dataset = encoded_dataset.copy()

perfect_dataset = eliminate_outliers(perfect_dataset)

In [19]:
print(dataset.shape)
print(perfect_dataset.shape)

(1460, 81)
(571, 81)


In [20]:
X_perfect = perfect_dataset.drop(['Id', 'SalePrice'], axis=1).values
y_perfect = perfect_dataset['SalePrice'].values

## Final Preprocessing

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [22]:
X_outlier_eliminated_train, X_outlier_eliminated_test, y_outlier_eliminated_train, y_outlier_eliminated_test = train_test_split(X_outlier_eliminated, y_outlier_eliminated, test_size=0.2, random_state=0)

In [23]:
X_encoded_train, X_encoded_test, y_encoded_train, y_encoded_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=0)

In [24]:
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_perfect, y_perfect, test_size=0.2, random_state=0)

### Test Dataset

In [25]:
test_dataset = pd.read_csv('data/test.csv').fillna(method='ffill')
test = test_dataset[cols[1:-1]].values

In [26]:
encoded_test_dataset = test_dataset.copy()
encoded_test_dataset = encode_column(encoded_test_dataset.fillna(0)).drop(['Id'], axis=1)

encoded_test_dataset

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,4,80.0,11622,1,0,0,0,0,0,...,120,0,0,1,0,0,6,2010,0,0
1,20,5,81.0,14267,1,0,1,0,0,1,...,0,0,0,1,1,12500,6,2010,0,0
2,60,5,74.0,13830,1,0,1,0,0,0,...,0,0,0,1,1,0,3,2010,0,0
3,60,5,78.0,9978,1,0,1,0,0,0,...,0,0,0,1,1,0,6,2010,0,0
4,120,5,43.0,5005,1,0,1,2,0,0,...,144,0,0,1,1,0,1,2010,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,7,21.0,1936,1,0,0,0,0,0,...,0,0,1,0,3,0,6,2006,0,0
1455,160,7,21.0,1894,1,0,0,0,0,0,...,0,0,1,0,3,0,4,2006,0,1
1456,20,5,160.0,20000,1,0,0,0,0,0,...,0,0,1,0,3,0,9,2006,0,1
1457,85,5,62.0,10441,1,0,0,0,0,0,...,0,0,1,1,3,700,7,2006,0,0


# Linear Regression

## Normal Dataset

In [27]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [28]:
print(regressor.intercept_)

1909.5445893417927


In [29]:
print(regressor.coef_)

[-1.28127404e+02  8.87330539e+01  4.41286875e-01  1.54681261e+04
  4.58114432e+03  2.58701465e+02  1.85171726e+02  3.20460117e+01
  1.84493905e+01  2.57696148e+00 -6.25660228e-01  2.04006918e+01
  1.69175472e+01  1.93859260e+01 -4.43232607e+00  3.18711471e+01
  2.56218079e+03  5.27307851e+02  3.31952779e+03 -4.30272727e+02
 -1.12879360e+04 -1.61207346e+04  5.37489489e+03  3.70423116e+03
  7.59907827e+01  2.25988997e+03  2.24917855e+01  1.54396911e+01
  5.20468326e+00  1.15101208e+00  2.43954964e+01  2.27630834e+01
  2.30850030e+01 -6.95311592e-01 -3.49209926e+02 -5.42089871e+02]


In [30]:
y_pred = regressor.predict(X_test)

In [31]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,251258.700134
1,133000,152622.384850
2,110000,98405.489090
3,192000,228751.384123
4,88000,104352.066536
...,...,...
287,324000,272841.538187
288,555000,424507.990372
289,136000,185578.147471
290,82500,62103.654306


In [32]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 24009.701100958413
Mean Squared Error: 2500505189.7061496
Root Mean Squared Error: 50005.05164187064


In [33]:
test_pred = regressor.predict(test)

In [34]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': test_pred})
df.to_csv('linear-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [35]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression(normalize=True)
regressor.fit(X_train, y_train)

LinearRegression(normalize=True)

In [36]:
print(regressor.intercept_)

-45847.101027397264


In [37]:
print(regressor.coef_)

[-1.28656117e+02  8.71878116e+01  4.33970354e-01  1.55388842e+04
  4.62831220e+03  2.59386388e+02  1.83219710e+02  3.19809502e+01
  1.76671781e+14  1.76671781e+14  1.76671781e+14 -1.76671781e+14
 -2.08078644e+14 -2.08078644e+14 -2.08078644e+14  2.08078644e+14
  2.57428590e+03  6.36761129e+02  3.36617337e+03 -4.21076602e+02
 -1.13163943e+04 -1.62962236e+04  5.40233293e+03  3.67475461e+03
  7.50874404e+01  2.39744671e+03  2.20626996e+01  1.54320088e+01
  5.83132209e+00  1.44436267e+00  2.23696611e+01  2.27526994e+01
  2.31773165e+01 -6.62452455e-01 -3.32435146e+02 -5.16427779e+02]


In [38]:
y_pred = regressor.predict(X_test)

In [39]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,251048.898973
1,133000,152744.898973
2,110000,98536.898973
3,192000,228840.898973
4,88000,104392.898973
...,...,...
287,324000,273088.898973
288,555000,424936.898973
289,136000,186088.898973
290,82500,62312.898973


In [40]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 24028.572574591857
Mean Squared Error: 2496235671.4245434
Root Mean Squared Error: 49962.342533397525


In [41]:
test_pred = regressor.predict(test)

In [42]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': test_pred})
df.to_csv('linear-all-column-non-null-normalized.csv', index=False)  

## Outlier Eliminated Dataset

In [43]:
outlier_eliminated_regressor = LinearRegression()
outlier_eliminated_regressor.fit(X_outlier_eliminated_train, y_outlier_eliminated_train)

LinearRegression()

In [44]:
print(outlier_eliminated_regressor.intercept_)

115334.49927224556


In [45]:
print(outlier_eliminated_regressor.coef_)

[-1.88878629e+02 -1.29055536e+02  4.89124414e-01  1.77924902e+04
  4.12633462e+03  2.83944995e+02  1.10130540e+02  2.81961606e+01
  6.69679689e+00  1.61435201e+00 -1.72273732e+00  6.58841158e+00
  1.78614844e+01  1.49785493e+01 -9.76369189e+00  2.30763418e+01
  1.11185141e+04 -4.97119438e+02  4.75291906e+03  1.41434162e+03
 -9.76904999e+03 -1.64737166e+04  7.70906616e+03  2.94603811e+03
  1.16421775e+02  1.28541163e+04 -6.32048992e+00  2.55984652e+01
 -2.59490717e+00  1.86447181e+01  4.99337777e+01  7.63713407e+01
  8.21450350e+00 -1.28931539e+00 -1.38094197e+02 -5.85479683e+02]


In [46]:
y_outlier_eliminated_pred = outlier_eliminated_regressor.predict(X_outlier_eliminated_test)

In [47]:
df = pd.DataFrame({'Actual': y_outlier_eliminated_test, 'Predicted': y_outlier_eliminated_pred})
df

Unnamed: 0,Actual,Predicted
0,185000,193212.053863
1,87000,104321.489960
2,224000,216726.805901
3,158000,195238.667716
4,205000,226245.595308
...,...,...
275,146000,152794.308786
276,165600,178034.937189
277,234000,223895.554217
278,289000,263725.941165


In [48]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, y_outlier_eliminated_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, y_outlier_eliminated_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, y_outlier_eliminated_pred)))

Mean Absolute Error: 23762.844305497052
Mean Squared Error: 1226941775.6255224
Root Mean Squared Error: 35027.7286678072


In [49]:
test_pred = outlier_eliminated_regressor.predict(test)

In [50]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': test_pred})
df.to_csv('linear-all-column-non-null-outlier-eliminated.csv', index=False)  

## Encoded Dataset

In [51]:
encoded_regressor = LinearRegression()
encoded_regressor.fit(X_encoded_train, y_encoded_train)

LinearRegression()

In [52]:
print(encoded_regressor.intercept_)

427487.2388052783


In [53]:
print(encoded_regressor.coef_)

[-8.79067793e+01 -1.98189127e+02  3.59667460e+01  4.32777313e-01
  2.12324408e+04  1.77856375e+02  2.23419631e+03  3.18120862e+02
 -2.26456129e+04 -2.40051637e+02 -4.28490221e+03  7.49800759e+01
 -2.64977299e+03 -2.51253360e+04 -1.49796059e+03 -1.18671819e+03
  1.02343202e+04  5.39191785e+03  9.22993023e+01 -6.62325159e+00
  2.46079231e+03  2.30832668e+03 -8.73806594e+02  5.58734807e+02
  4.02439397e+03  4.03199089e+01 -1.02615556e+04  1.38198227e+03
  2.16574619e+03 -9.72294306e+03  2.00491415e+03 -5.11898758e+03
 -1.19989345e+03  1.25065038e+01  3.32467787e+02  7.85392519e+00
 -1.44187910e+00  1.89185499e+01 -1.26070603e+03 -2.06392111e+03
 -2.66224403e+03  3.09531766e+03  1.94142286e+01  1.99232451e+01
 -1.20691611e+01  2.72683127e+01  7.58516011e+02 -5.79480876e+01
  4.88379308e+03  6.82038183e+03 -6.31015926e+03 -1.70967908e+04
 -6.83987961e+03  3.75908151e+03 -3.63430949e+03  4.76019013e+03
 -7.50794634e+02  1.28339209e+03 -3.78800497e+01 -1.42796748e+03
 -1.05812815e+03  2.80850

In [54]:
y_encoded_pred = encoded_regressor.predict(X_encoded_test)

In [55]:
df = pd.DataFrame({'Actual': y_encoded_test, 'Predicted': y_encoded_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,233587.013774
1,133000,147638.218305
2,110000,98494.609688
3,192000,221324.638300
4,88000,107147.855460
...,...,...
287,324000,279512.776846
288,555000,429601.681027
289,136000,222378.857035
290,82500,53546.785431


In [56]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_encoded_test, y_encoded_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_encoded_test, y_encoded_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_encoded_test, y_encoded_pred)))

Mean Absolute Error: 22626.793021104906
Mean Squared Error: 2366034263.097895
Root Mean Squared Error: 48641.898226712896


In [57]:
test_pred = encoded_regressor.predict(encoded_test_dataset)

In [58]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': test_pred})
df.to_csv('linear-all-column-non-null-encoded.csv', index=False)  

# Random Forest Regression

## Normal Dataset

In [59]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)

RandomForestRegressor()

In [60]:
rf_y_pred = rf_regressor.predict(X_test)

In [61]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': rf_y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,228210.20
1,133000,152813.84
2,110000,104292.26
3,192000,225331.90
4,88000,87305.44
...,...,...
287,324000,306272.76
288,555000,446660.64
289,136000,171992.96
290,82500,75719.94


In [62]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rf_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_y_pred)))

Mean Absolute Error: 17962.66842465753
Mean Squared Error: 1121799418.366615
Root Mean Squared Error: 33493.274225829504


In [63]:
rf_test_pred = rf_regressor.predict(test)

In [64]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': rf_test_pred})
df.to_csv('random-forest-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [65]:
# No need for random forest

## Outlier Eliminated Dataset

In [66]:
outlier_eliminated_rf_regressor = RandomForestRegressor()
outlier_eliminated_rf_regressor.fit(X_outlier_eliminated_train, y_outlier_eliminated_train)

RandomForestRegressor()

In [67]:
y_outlier_eliminated_pred = outlier_eliminated_rf_regressor.predict(X_outlier_eliminated_test)

In [68]:
df = pd.DataFrame({'Actual': y_outlier_eliminated_test, 'Predicted': y_outlier_eliminated_pred})
df

Unnamed: 0,Actual,Predicted
0,185000,178396.32
1,87000,99494.12
2,224000,215173.87
3,158000,195179.24
4,205000,211289.19
...,...,...
275,146000,151155.87
276,165600,182096.65
277,234000,227385.89
278,289000,261639.62


In [69]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, y_outlier_eliminated_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, y_outlier_eliminated_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, y_outlier_eliminated_pred)))

Mean Absolute Error: 18663.783
Mean Squared Error: 734831957.0235686
Root Mean Squared Error: 27107.784067008663


In [70]:
rf_test_pred = outlier_eliminated_regressor.predict(test)

In [71]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': rf_test_pred})
df.to_csv('random-forest-all-column-non-null-outlier-eliminated.csv', index=False)  

## Encoded Dataset

In [72]:
rf_encoded_regressor = RandomForestRegressor()
rf_encoded_regressor.fit(X_encoded_train, y_encoded_train)

RandomForestRegressor()

In [73]:
rf_y_encoded_pred = rf_encoded_regressor.predict(X_encoded_test)

In [74]:
df = pd.DataFrame({'Actual': y_encoded_test, 'Predicted': rf_y_encoded_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,215102.50
1,133000,147624.50
2,110000,104995.74
3,192000,219606.20
4,88000,92233.00
...,...,...
287,324000,306341.57
288,555000,440990.07
289,136000,169370.63
290,82500,87050.00


In [75]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_encoded_test, rf_y_encoded_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_encoded_test, rf_y_encoded_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_encoded_test, rf_y_encoded_pred)))

Mean Absolute Error: 16729.834178082194
Mean Squared Error: 921125382.4523108
Root Mean Squared Error: 30350.047486821346


In [76]:
rf_test_encoded_pred = rf_encoded_regressor.predict(encoded_test_dataset)

In [77]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': rf_test_encoded_pred})
df.to_csv('random-forest-all-column-non-null-encoded.csv', index=False)  

## Encoded Outlier Eliminated Dataset

In [78]:
rf_p_regressor = RandomForestRegressor()
rf_p_regressor.fit(X_p_train, y_p_train)

RandomForestRegressor()

In [79]:
rf_y_p_pred = rf_p_regressor.predict(X_p_test)

In [80]:
df = pd.DataFrame({'Actual': y_p_test, 'Predicted': rf_y_p_pred})
df

Unnamed: 0,Actual,Predicted
0,260000,283654.30
1,250580,195067.91
2,223500,171195.10
3,235000,235961.42
4,201000,239535.23
...,...,...
110,90000,102028.43
111,252678,221196.92
112,122900,120469.25
113,177000,158605.25


In [81]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_p_test, rf_y_p_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_p_test, rf_y_p_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_p_test, rf_y_p_pred)))

Mean Absolute Error: 25293.392
Mean Squared Error: 2396266666.00325
Root Mean Squared Error: 48951.67684567353


In [82]:
rf_test_p_pred = rf_p_regressor.predict(encoded_test_dataset)

In [83]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': rf_test_p_pred})
df.to_csv('random-forest-all-column-non-null-encoded-outlier-eliminated.csv', index=False)  

# XGBoost

## Normal Dataset

In [84]:
# !pip3 install xgboost

In [85]:
import xgboost as xgb

xgboost = xgb.XGBRegressor()
xgboost.fit(X_train, y_train, verbose=False)
xg_y_pred = xgboost.predict(X_test)

In [86]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, xg_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, xg_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, xg_y_pred)))

Mean Absolute Error: 18176.17833636558
Mean Squared Error: 1169868260.7119222
Root Mean Squared Error: 34203.336982112174


In [87]:
xgb_test_pred = xgboost.predict(test)

In [88]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': xgb_test_pred})
df.to_csv('xgboost-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [89]:
# No need for XGBoost (Decision Tree)

## Outlier Eliminated Dataset

In [90]:
xgboost = xgb.XGBRegressor()
xgboost.fit(X_outlier_eliminated_train, y_outlier_eliminated_train, verbose=False)
xg_y_pred = xgboost.predict(X_outlier_eliminated_test)

In [91]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, xg_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, xg_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, xg_y_pred)))

Mean Absolute Error: 18816.78618861607
Mean Squared Error: 891954760.0328864
Root Mean Squared Error: 29865.611663464828


In [92]:
xgb_test_pred = xgboost.predict(test)

In [93]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': xgb_test_pred})
df.to_csv('xgboost-all-column-non-null-outlier-eliminated.csv', index=False)  

# Decision Tree Regression

## Normal Dataset

In [94]:
from sklearn.tree import DecisionTreeRegressor
dt_regressor = DecisionTreeRegressor()
dt_regressor.fit(X_train, y_train)

DecisionTreeRegressor()

In [95]:
dt_y_pred = dt_regressor.predict(X_test)

In [96]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': dt_y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,237500.0
1,133000,115000.0
2,110000,135000.0
3,192000,215000.0
4,88000,89500.0
...,...,...
287,324000,297000.0
288,555000,380000.0
289,136000,117500.0
290,82500,113000.0


In [97]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, dt_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, dt_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, dt_y_pred)))

Mean Absolute Error: 25226.11301369863
Mean Squared Error: 1423118948.4965754
Root Mean Squared Error: 37724.24881288659


In [98]:
dt_test_pred = dt_regressor.predict(test)

In [99]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': dt_test_pred})
df.to_csv('decision-tree-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [100]:
# No need for Decision Tree (Decision Tree)

## Outlier Eliminated Dataset

In [101]:
dt_regressor = DecisionTreeRegressor()
dt_regressor.fit(X_outlier_eliminated_train, y_outlier_eliminated_train)

DecisionTreeRegressor()

In [102]:
dt_y_outlier_eliminated_pred = dt_regressor.predict(X_outlier_eliminated_test)

In [103]:
df = pd.DataFrame({'Actual': y_outlier_eliminated_test, 'Predicted': dt_y_outlier_eliminated_pred})
df

Unnamed: 0,Actual,Predicted
0,185000,136500.0
1,87000,141000.0
2,224000,207500.0
3,158000,200000.0
4,205000,205000.0
...,...,...
275,146000,148500.0
276,165600,187500.0
277,234000,265000.0
278,289000,275000.0


In [104]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, dt_y_outlier_eliminated_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, dt_y_outlier_eliminated_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, dt_y_outlier_eliminated_pred)))

Mean Absolute Error: 26252.157142857144
Mean Squared Error: 1500498461.3428571
Root Mean Squared Error: 38736.26803581957


In [105]:
dt_outlier_eliminated_test_pred = dt_regressor.predict(test)

In [106]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': dt_outlier_eliminated_test_pred})
df.to_csv('decision-tree-all-column-non-null-outlier-eliminated.csv', index=False)  

# Extra Tree Regression

## Normal Dataset

In [107]:
from sklearn.tree import ExtraTreeRegressor
et_regressor = ExtraTreeRegressor()
et_regressor.fit(X_train, y_train)

ExtraTreeRegressor()

In [108]:
et_y_pred = et_regressor.predict(X_test)

In [109]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': et_y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,256000.0
1,133000,153000.0
2,110000,137000.0
3,192000,226000.0
4,88000,84500.0
...,...,...
287,324000,286000.0
288,555000,501837.0
289,136000,256000.0
290,82500,55000.0


In [110]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, et_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, et_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, et_y_pred)))

Mean Absolute Error: 25218.88698630137
Mean Squared Error: 1564910098.8253424
Root Mean Squared Error: 39558.94461212713


In [111]:
et_test_pred = et_regressor.predict(test)

In [112]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': et_test_pred})
df.to_csv('extra-tree-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [113]:
# No need for Extra Tree

## Outlier Eliminated Dataset

In [114]:
from sklearn.tree import ExtraTreeRegressor
et_regressor = ExtraTreeRegressor()
et_regressor.fit(X_outlier_eliminated_train, y_outlier_eliminated_train)

ExtraTreeRegressor()

In [115]:
et_y_outlier_eliminated_pred = et_regressor.predict(X_outlier_eliminated_test)

In [116]:
df = pd.DataFrame({'Actual': y_outlier_eliminated_test, 'Predicted': et_y_outlier_eliminated_pred})
df

Unnamed: 0,Actual,Predicted
0,185000,187000.0
1,87000,130000.0
2,224000,207500.0
3,158000,190000.0
4,205000,226000.0
...,...,...
275,146000,158000.0
276,165600,183200.0
277,234000,260000.0
278,289000,244000.0


In [117]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, et_y_outlier_eliminated_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, et_y_outlier_eliminated_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, et_y_outlier_eliminated_pred)))

Mean Absolute Error: 30032.332142857143
Mean Squared Error: 2904526331.082143
Root Mean Squared Error: 53893.65761462236


In [118]:
et_test_outlier_eliminated_pred = et_regressor.predict(test)

In [119]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': et_test_outlier_eliminated_pred})
df.to_csv('extra-tree-all-column-non-null-outlier-eliminated.csv', index=False)  

# Ada Boosting Regression

## Normal Dataset

In [120]:
from sklearn.ensemble import AdaBoostRegressor
ab_regressor = AdaBoostRegressor()
ab_regressor.fit(X_train, y_train)

AdaBoostRegressor()

In [121]:
ab_y_pred = ab_regressor.predict(X_test)

In [122]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': ab_y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,265911.081633
1,133000,143962.321622
2,110000,124500.767857
3,192000,235387.097143
4,88000,124500.767857
...,...,...
287,324000,291199.569892
288,555000,453554.926893
289,136000,195610.496454
290,82500,116905.328947


In [123]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, ab_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, ab_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, ab_y_pred)))

Mean Absolute Error: 23995.27342754723
Mean Squared Error: 1537497383.6179984
Root Mean Squared Error: 39210.93449049638


In [124]:
ab_test_pred = ab_regressor.predict(test)

In [125]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': ab_test_pred})
df.to_csv('ada-boosting-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [126]:
# No need for Ada Boosting

## Outlier Eliminated Dataset

In [127]:
from sklearn.ensemble import AdaBoostRegressor
ab_regressor = AdaBoostRegressor()
ab_regressor.fit(X_outlier_eliminated_train, y_outlier_eliminated_train)

AdaBoostRegressor()

In [128]:
ab_y_outlier_eliminated_pred = ab_regressor.predict(X_outlier_eliminated_test)

In [129]:
df = pd.DataFrame({'Actual': y_outlier_eliminated_test, 'Predicted': ab_y_outlier_eliminated_pred})
df

Unnamed: 0,Actual,Predicted
0,185000,171471.663866
1,87000,118696.723636
2,224000,205583.033613
3,158000,185589.156334
4,205000,208284.240260
...,...,...
275,146000,155050.153374
276,165600,186149.537634
277,234000,215198.595308
278,289000,267472.839506


In [130]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, ab_y_outlier_eliminated_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, ab_y_outlier_eliminated_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, ab_y_outlier_eliminated_pred)))

Mean Absolute Error: 24351.662024593097
Mean Squared Error: 1141511306.639843
Root Mean Squared Error: 33786.25913947626


In [131]:
ab_test_outlier_eliminated_pred = ab_regressor.predict(test)

In [132]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': ab_test_outlier_eliminated_pred})
df.to_csv('ada-boosting-all-column-non-null-outlier-eliminated.csv', index=False)  

# Bagging Regression

## Normal Dataset

In [133]:
from sklearn.ensemble import BaggingRegressor
bg_regressor = BaggingRegressor()
bg_regressor.fit(X_train, y_train)

BaggingRegressor()

In [134]:
bg_y_pred = bg_regressor.predict(X_test)

In [135]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': bg_y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,218140.0
1,133000,149550.0
2,110000,98568.3
3,192000,236179.0
4,88000,85430.0
...,...,...
287,324000,317890.2
288,555000,412053.1
289,136000,174083.4
290,82500,80810.0


In [136]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, bg_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, bg_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, bg_y_pred)))

Mean Absolute Error: 18105.838698630134
Mean Squared Error: 956439427.8616095
Root Mean Squared Error: 30926.354907450852


In [137]:
bg_test_pred = bg_regressor.predict(test)

In [138]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': bg_test_pred})
df.to_csv('bagging-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [139]:
# No need for Bagging

## Outlier Eliminated Dataset

In [140]:
from sklearn.ensemble import BaggingRegressor
bg_regressor = BaggingRegressor()
bg_regressor.fit(X_outlier_eliminated_train, y_outlier_eliminated_train)

BaggingRegressor()

In [141]:
bg_y_outlier_eliminated_pred = bg_regressor.predict(X_outlier_eliminated_test)

In [142]:
df = pd.DataFrame({'Actual': y_outlier_eliminated_test, 'Predicted': bg_y_outlier_eliminated_pred})
df

Unnamed: 0,Actual,Predicted
0,185000,187430.0
1,87000,88340.0
2,224000,226022.0
3,158000,194840.0
4,205000,198365.0
...,...,...
275,146000,151850.0
276,165600,182190.0
277,234000,239990.0
278,289000,253400.0


In [143]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, bg_y_outlier_eliminated_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, bg_y_outlier_eliminated_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, bg_y_outlier_eliminated_pred)))

Mean Absolute Error: 19170.38464285714
Mean Squared Error: 888387209.7204642
Root Mean Squared Error: 29805.82509712597


In [144]:
bg_test_outlier_eliminated_pred = bg_regressor.predict(test)

In [145]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': bg_test_outlier_eliminated_pred})
df.to_csv('bagging-all-column-non-null-outlier-eliminated.csv', index=False)  

# Gradient Boosting Regression

## Normal Dataset

In [146]:
from sklearn.ensemble import GradientBoostingRegressor
gb_regressor = GradientBoostingRegressor()
gb_regressor.fit(X_train, y_train)

GradientBoostingRegressor()

In [147]:
gb_y_pred = gb_regressor.predict(X_test)

In [148]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': gb_y_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,215237.003159
1,133000,152463.214610
2,110000,97671.225544
3,192000,209209.915473
4,88000,89928.015269
...,...,...
287,324000,277938.740741
288,555000,487685.823122
289,136000,162411.186422
290,82500,76445.140248


In [149]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, gb_y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, gb_y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, gb_y_pred)))

Mean Absolute Error: 16999.384668628616
Mean Squared Error: 767799231.670037
Root Mean Squared Error: 27709.19038279605


In [150]:
gb_test_pred = gb_regressor.predict(test)

In [151]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': gb_test_pred})
df.to_csv('gradient-boosting-all-column-non-null.csv', index=False)  

## Normalized Dataset

In [152]:
# No need for Gradient Boosting

## Outlier Eliminated Dataset

In [153]:
gb_regressor = GradientBoostingRegressor()
gb_regressor.fit(X_outlier_eliminated_train, y_outlier_eliminated_train)

GradientBoostingRegressor()

In [154]:
gb_y_outlier_eliminated_pred = gb_regressor.predict(X_outlier_eliminated_test)

In [155]:
df = pd.DataFrame({'Actual': y_outlier_eliminated_test, 'Predicted': gb_y_outlier_eliminated_pred})
df

Unnamed: 0,Actual,Predicted
0,185000,173351.356406
1,87000,91721.616021
2,224000,211181.579697
3,158000,183645.352793
4,205000,205881.023214
...,...,...
275,146000,141829.310541
276,165600,180111.622330
277,234000,235552.568327
278,289000,254087.708048


In [156]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_outlier_eliminated_test, gb_y_outlier_eliminated_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_outlier_eliminated_test, gb_y_outlier_eliminated_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_outlier_eliminated_test, gb_y_outlier_eliminated_pred)))

Mean Absolute Error: 16951.06163901519
Mean Squared Error: 630998995.8969493
Root Mean Squared Error: 25119.69338779734


In [157]:
gb_test_outlier_eliminated_pred = gb_regressor.predict(test)

In [158]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': gb_test_outlier_eliminated_pred})
df.to_csv('gradient-boosting-all-column-non-null-outlier-eliminated.csv', index=False)  

## Encoded Dataset

In [159]:
gb_regressor = GradientBoostingRegressor()
gb_regressor.fit(X_encoded_train, y_encoded_train)

GradientBoostingRegressor()

In [160]:
gb_y_encoded_pred = gb_regressor.predict(X_encoded_test)

In [161]:
df = pd.DataFrame({'Actual': y_encoded_test, 'Predicted': gb_y_encoded_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,224966.252556
1,133000,147737.962687
2,110000,108093.065922
3,192000,207174.762552
4,88000,90159.953254
...,...,...
287,324000,278159.344858
288,555000,467079.237805
289,136000,156990.334511
290,82500,78523.882206


In [162]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_encoded_test, gb_y_encoded_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_encoded_test, gb_y_encoded_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_encoded_test, gb_y_encoded_pred)))

Mean Absolute Error: 16667.51061104376
Mean Squared Error: 863212472.1148864
Root Mean Squared Error: 29380.47773803017


In [163]:
gb_test_encoded_pred = gb_regressor.predict(encoded_test_dataset)

In [164]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': gb_test_encoded_pred})
df.to_csv('gradient-boosting-all-column-non-null-outlier-eliminated-encoded.csv', index=False)  

## Encoded Dataset -- Parameter Tuning

In [165]:
params = {'learning_rate': 0.01, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 1000, 'subsample': 0.8}

gb_regressor = GradientBoostingRegressor(**params)
gb_regressor.fit(X_encoded_train, y_encoded_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=5, max_features='sqrt',
                          min_samples_split=10, n_estimators=1000,
                          subsample=0.8)

In [166]:
gb_y_encoded_pred = gb_regressor.predict(X_encoded_test)

In [167]:
df = pd.DataFrame({'Actual': y_encoded_test, 'Predicted': gb_y_encoded_pred})
df

Unnamed: 0,Actual,Predicted
0,200624,241012.992384
1,133000,144551.505268
2,110000,116730.647217
3,192000,201391.011933
4,88000,92921.399733
...,...,...
287,324000,290254.376220
288,555000,466917.400022
289,136000,179398.981791
290,82500,80717.869609


In [168]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_encoded_test, gb_y_encoded_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_encoded_test, gb_y_encoded_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_encoded_test, gb_y_encoded_pred)))

Mean Absolute Error: 15672.607302551824
Mean Squared Error: 862618779.4683863
Root Mean Squared Error: 29370.372477522076


In [169]:
gb_test_encoded_pred = gb_regressor.predict(encoded_test_dataset)

In [170]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': gb_test_encoded_pred})
df.to_csv('gradient-boosting-all-column-non-null-encoded-tuned.csv', index=False)  

## Encoded Dataset -- Parameter Tuned & Outlier Eliminated

In [171]:
params = {'learning_rate': 0.05, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 5000, 'subsample': 0.9}

gb_regressor = GradientBoostingRegressor(**params)
gb_regressor.fit(X_p_train, y_p_train)

GradientBoostingRegressor(learning_rate=0.05, max_depth=5, max_features='sqrt',
                          min_samples_split=10, n_estimators=5000,
                          subsample=0.9)

In [172]:
gb_y_p_pred = gb_regressor.predict(X_p_test)

In [173]:
df = pd.DataFrame({'Actual': y_p_test, 'Predicted': gb_y_p_pred})
df

Unnamed: 0,Actual,Predicted
0,260000,312111.287722
1,250580,190912.526502
2,223500,193366.222416
3,235000,243624.646255
4,201000,238294.309372
...,...,...
110,90000,92017.143970
111,252678,234806.835188
112,122900,122380.538175
113,177000,172400.487412


In [174]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_p_test, gb_y_p_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_p_test, gb_y_p_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_p_test, gb_y_p_pred)))

Mean Absolute Error: 22406.197945832642
Mean Squared Error: 2526406564.5784006
Root Mean Squared Error: 50263.371997692324


In [175]:
gb_test_p_pred = gb_regressor.predict(encoded_test_dataset)

In [176]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': gb_test_p_pred})
df.to_csv('gradient-boosting-all-column-non-null-outlier-eliminated-encoded-tuned-outlier-eliminated.csv', index=False)  

# Grid Searching for Gradient Boosting

In [383]:
from sklearn.model_selection import GridSearchCV

parameters = {
    "learning_rate": [0.001, 0.01, 0.05, 0.1],
    "min_samples_split": [5, 10, 30, 50],
    "max_depth": [3, 5, 7],
    "max_features": ["log2", "sqrt"],
    "subsample": [0.8, 0.9, 0.95, 1.0],
    "n_estimators": [500, 1000, 5000]
    }

clf = GridSearchCV(GradientBoostingRegressor(), parameters, cv=3, n_jobs=-1, verbose=5)

clf.fit(X_encoded_train, y_encoded_train)
print(clf.score(X_encoded_train, y_encoded_train))
print(clf.best_params_)

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 3026 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 3456 out of 3456 | elapsed: 26.8min finished


0.9879487171100673
{'learning_rate': 0.01, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 1000, 'subsample': 0.8}


# Grid Searching for Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = { 'bootstrap': [True, False],
              'max_depth': [50, 100, None],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2],
              'min_samples_split': [2, 5],
              'n_estimators': [100, 500, 10000]}

clf = GridSearchCV(RandomForestRegressor(), parameters, cv=3, n_jobs=-1, verbose=5)

clf.fit(X_encoded_train, y_encoded_train)
print(clf.score(X_encoded_train, y_encoded_train))
print(clf.best_params_)

# Grid Searching for Gradient Boosting -- Outlier eliminated

In [338]:
from sklearn.model_selection import GridSearchCV

parameters = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "min_samples_split": [3, 5, 10],
    "max_depth": [3, 4, 5],
    "max_features": ["log2", "sqrt", 'auto'],
    "subsample": [0.9, 0.95, 1.0],
    "n_estimators": [100, 500, 1000, 5000]
    }

clf = GridSearchCV(GradientBoostingRegressor(), parameters, cv=5, n_jobs=-1, verbose=10)

clf.fit(X_p_train, y_p_train)
print(clf.score(X_p_train, y_p_train))
print(clf.best_params_)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

0.9978385371120169
{'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 5000, 'subsample': 1.0}


## Encoded Dataset -- Parameter Tuned & Outlier Eliminated

In [177]:
params = {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 5000, 'subsample': 1.0}

gb_regressor = GradientBoostingRegressor(**params)
gb_regressor.fit(X_p_train, y_p_train)

GradientBoostingRegressor(learning_rate=0.01, max_features='log2',
                          min_samples_split=10, n_estimators=5000)

In [178]:
gb_y_p_pred = gb_regressor.predict(X_p_test)

In [179]:
df = pd.DataFrame({'Actual': y_p_test, 'Predicted': gb_y_p_pred})
df

Unnamed: 0,Actual,Predicted
0,260000,306708.694682
1,250580,198339.247942
2,223500,201931.218859
3,235000,246412.133328
4,201000,233639.369053
...,...,...
110,90000,94330.629892
111,252678,243148.551642
112,122900,122252.679594
113,177000,184595.010480


In [180]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_p_test, gb_y_p_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_p_test, gb_y_p_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_p_test, gb_y_p_pred)))

Mean Absolute Error: 20890.710254953305
Mean Squared Error: 2539123558.878338
Root Mean Squared Error: 50389.71679696501


In [181]:
gb_test_p_pred = gb_regressor.predict(encoded_test_dataset)

In [182]:
df = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': gb_test_p_pred})
df.to_csv('gradient-boosting-all-column-non-null-outlier-eliminated-encoded-tuned-outlier-eliminated.csv', index=False)  