## Imports

In [1]:
import pandas as pd
import matplotlib as mpl
import seaborn as sebo
import numpy as numpy

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the training CSV (path is relative to the notebook's working directory).
cleaning_train_df = pd.read_csv(filepath_or_buffer='train.csv')

## Previewing the loaded dataframe

In [3]:
# Preview the first five rows of the loaded frame.
cleaning_train_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Display the column labels of the loaded frame.
cleaning_train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

## Building quality and structural integrity experiment

In [5]:
# Columns used for the building-quality / structural-integrity experiment.
# The order of this list is the column order of any frame selected with it.
building_columns = [
    # overall rating and build / remodel years
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    # exterior and foundation
    'ExterQual',
    'ExterCond',
    'Foundation',
    # basement
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinSF1',
    'BsmtFinType2',
    'BsmtFinSF2',
    'TotalBsmtSF',
    # heating / central air
    'HeatingQC',
    'CentralAir',
    # garage
    'GarageType',
    'GarageYrBlt',
    'GarageFinish',
    'GarageCars',
    'GarageArea',
    'GarageQual',
    'GarageCond',
    # sale price (presumably the modelling target -- confirm)
    'SalePrice',
]

In [6]:
# Select the experiment's columns into an explicit, independent copy.
# Without .copy(), pandas tracks the result as a selection derived from
# cleaning_train_df, and any later assignment into it (for example filling
# the missing basement/garage values) raises SettingWithCopyWarning.
building_struct_integ_df = cleaning_train_df[building_columns].copy()

In [7]:
# Preview the first five rows of the column subset.
building_struct_integ_df.head(n=5)

Unnamed: 0,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,...,HeatingQC,CentralAir,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,SalePrice
0,7,5,2003,2003,Gd,TA,PConc,Gd,TA,No,...,Ex,Y,Attchd,2003.0,RFn,2,548,TA,TA,208500
1,6,8,1976,1976,TA,TA,CBlock,Gd,TA,Gd,...,Ex,Y,Attchd,1976.0,RFn,2,460,TA,TA,181500
2,7,5,2001,2002,Gd,TA,PConc,Gd,TA,Mn,...,Ex,Y,Attchd,2001.0,RFn,2,608,TA,TA,223500
3,7,5,1915,1970,TA,TA,BrkTil,TA,Gd,No,...,Gd,Y,Detchd,1998.0,Unf,3,642,TA,TA,140000
4,8,5,2000,2000,Gd,TA,PConc,Gd,TA,Av,...,Ex,Y,Attchd,2000.0,RFn,3,836,TA,TA,250000


In [8]:
# Count the missing values in each column of the subset.
building_struct_integ_df.isna().sum(axis=0)

OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual        37
BsmtCond        37
BsmtExposure    38
BsmtFinType1    37
BsmtFinSF1       0
BsmtFinType2    38
BsmtFinSF2       0
TotalBsmtSF      0
HeatingQC        0
CentralAir       0
GarageType      81
GarageYrBlt     81
GarageFinish    81
GarageCars       0
GarageArea       0
GarageQual      81
GarageCond      81
SalePrice        0
dtype: int64

In [9]:
# Summary statistics for the subset. Called with default arguments, so only
# the numeric columns are described; the text-valued columns (e.g. ExterQual,
# GarageType) do not appear in the output.
building_struct_integ_df.describe()

Unnamed: 0,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,GarageYrBlt,GarageCars,GarageArea,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1379.0,1460.0,1460.0,1460.0
mean,6.099315,5.575342,1971.267808,1984.865753,443.639726,46.549315,1057.429452,1978.506164,1.767123,472.980137,180921.19589
std,1.382997,1.112799,30.202904,20.645407,456.098091,161.319273,438.705324,24.689725,0.747315,213.804841,79442.502883
min,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,1900.0,0.0,0.0,34900.0
25%,5.0,5.0,1954.0,1967.0,0.0,0.0,795.75,1961.0,1.0,334.5,129975.0
50%,6.0,5.0,1973.0,1994.0,383.5,0.0,991.5,1980.0,2.0,480.0,163000.0
75%,7.0,6.0,2000.0,2004.0,712.25,0.0,1298.25,2002.0,2.0,576.0,214000.0
max,10.0,9.0,2010.0,2010.0,5644.0,1474.0,6110.0,2010.0,4.0,1418.0,755000.0
