# Data Cleaning

In [99]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [100]:
# Load the raw training data from the working directory into the notebook-wide frame.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [101]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [102]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1460, 81)

## Missing Values

In [103]:
# Missing-value count per column, first 60 columns (explicit positional slice).
df.isna().sum().iloc[:60]

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
Alley           1369
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
MasVnrType         8
MasVnrArea         8
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinSF1         0
BsmtFinType2      38
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
Heating            0
HeatingQC          0
CentralAir         0
Electrical         1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath 

In [104]:
# Missing-value count per column, from the 61st column to the end.
df.isna().sum().iloc[60:]

GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
PoolQC           1453
Fence            1179
MiscFeature      1406
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
SalePrice           0
dtype: int64

## LotFrontage

This cannot really be missing, since every house has some linear feet of street connected to the property, so we'll replace the missing values with the column mean.

In [105]:
# Impute missing LotFrontage values with the column mean (mean() skips NaN).
# fillna is used instead of replace(np.NaN, ...): the `np.NaN` alias was
# removed in NumPy 2.0, so the old spelling raises AttributeError there.
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

In [106]:
# Confirm that no missing LotFrontage values remain (expected: 0).
df['LotFrontage'].isnull().sum()

0

## Alley

This column should not really be empty: the dataset uses the code `NA` as a genuine value, but on loading that `NA` was turned into `NaN`, so it now looks like missing data.

## BsmtQual

Evaluates the height of the basement; every basement should have a height.

## BsmtCond

This evaluates the general condition of the basement.

## BsmtExposure

Refers to walkout or garden-level walls.

## BsmtFinType1

Rating of the basement finished area. Homes with no basement should have this value as `NA`, not `NaN`.

## Simplification

Since most of the columns containing NaN share the same issue, it's better to handle them all together in one step: every column where `NA` was read as NaN gets the same replacement value.

In [107]:
# Columns whose NaN entries are treated as "feature not present" rather than
# as unrecorded data (see the per-column notes in the markdown above).
columns_NA = ['BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
              'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
              'BsmtFinType1', 'BsmtExposure', 'BsmtCond', 'BsmtQual', 'Alley']

# Label every gap in those columns with the explicit category "absent" in one
# vectorized call. fillna avoids the `np.NaN` alias, which was removed in
# NumPy 2.0 and would raise AttributeError there.
df[columns_NA] = df[columns_NA].fillna("absent")

In [108]:
# Re-check missing values in the first 60 columns after the "absent" fill.
df.isna().sum().iloc[:60]

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       8
MasVnrArea       8
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       1
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr

In [109]:
# Re-check missing values in the remaining columns after the "absent" fill.
df.isna().sum().iloc[60:]

GarageFinish     0
GarageCars       0
GarageArea       0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
PoolQC           0
Fence            0
MiscFeature      0
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
dtype: int64

## GarageYrBlt

We need to replace the missing values with the average year: since each garage has a year in which it was built, the value cannot truly be empty; it simply was not recorded.

In [110]:
# Fill missing garage construction years with the mean of the recorded years.
# fillna replaces replace(np.NaN, ...): the `np.NaN` alias no longer exists
# in NumPy 2.0+, so the old spelling fails with AttributeError there.
# NOTE(review): the mean is generally not a whole year, and the gaps may
# belong to houses without a garage — confirm this imputation strategy.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].mean())

## Electrical

For `Electrical` we'll just fill the missing entry with the next valid value in the column (backward fill).

In [111]:
# Backward-fill: a missing Electrical entry takes the next valid value below it.
# .bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
# The trailing .fillna(0) only applies when no valid value follows (gap at the
# end of the column).
# NOTE(review): 0 is a numeric placeholder — confirm it suits this column's values.
df['Electrical'] = df['Electrical'].bfill().fillna(0)

## MasVnrType and MasVnrArea

We'll just replace each missing value with the next valid value after it (backward fill).


In [112]:
# Backward-fill both masonry-veneer columns: each gap takes the next valid
# value below it. .bfill() replaces fillna(method='bfill'), which is
# deprecated since pandas 2.1.
# The trailing .fillna(0) only applies to gaps at the very end of a column.
# NOTE(review): 0 is a numeric placeholder — confirm it suits MasVnrType's values.
df['MasVnrType'] = df['MasVnrType'].bfill().fillna(0)
df['MasVnrArea'] = df['MasVnrArea'].bfill().fillna(0)

In [113]:
# Final check of the first 60 columns: every count should now be 0.
df.isna().sum().iloc[:60]

Id              0
MSSubClass      0
MSZoning        0
LotFrontage     0
LotArea         0
Street          0
Alley           0
LotShape        0
LandContour     0
Utilities       0
LotConfig       0
LandSlope       0
Neighborhood    0
Condition1      0
Condition2      0
BldgType        0
HouseStyle      0
OverallQual     0
OverallCond     0
YearBuilt       0
YearRemodAdd    0
RoofStyle       0
RoofMatl        0
Exterior1st     0
Exterior2nd     0
MasVnrType      0
MasVnrArea      0
ExterQual       0
ExterCond       0
Foundation      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinSF1      0
BsmtFinType2    0
BsmtFinSF2      0
BsmtUnfSF       0
TotalBsmtSF     0
Heating         0
HeatingQC       0
CentralAir      0
Electrical      0
1stFlrSF        0
2ndFlrSF        0
LowQualFinSF    0
GrLivArea       0
BsmtFullBath    0
BsmtHalfBath    0
FullBath        0
HalfBath        0
BedroomAbvGr    0
KitchenAbvGr    0
KitchenQual     0
TotRmsAbvGrd    0
Functional

In [114]:
# Final check of the remaining columns: every count should now be 0.
df.isna().sum().iloc[60:]

GarageFinish     0
GarageCars       0
GarageArea       0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
PoolQC           0
Fence            0
MiscFeature      0
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
dtype: int64

In [115]:
# Show the cleaned frame for a last visual check before export
# (the notebook rendering truncates the middle rows and columns).
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,absent,Reg,Lvl,AllPub,...,0,absent,absent,absent,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,absent,Reg,Lvl,AllPub,...,0,absent,absent,absent,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,absent,IR1,Lvl,AllPub,...,0,absent,absent,absent,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,absent,IR1,Lvl,AllPub,...,0,absent,absent,absent,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,absent,IR1,Lvl,AllPub,...,0,absent,absent,absent,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,absent,Reg,Lvl,AllPub,...,0,absent,absent,absent,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,absent,Reg,Lvl,AllPub,...,0,absent,MnPrv,absent,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,absent,Reg,Lvl,AllPub,...,0,absent,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,absent,Reg,Lvl,AllPub,...,0,absent,absent,absent,0,4,2010,WD,Normal,142125


## Export file

In [116]:
# Write the cleaned data next to the notebook.
# index=False: the row index is just the default 0..n-1 counter (Id is already
# a column); writing it would add an unnamed first column that comes back as
# "Unnamed: 0" when the file is read again with a plain pd.read_csv.
# NOTE(review): a reader that uses index_col=0 on this file must be updated.
df.to_csv("train_cleaned.csv", index=False, encoding='utf-8')