In [1]:
# Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

##### Here I am just loading the downloaded .csv files, but it may be better to load them directly from Kaggle.  
Thoughts?

In [2]:
# Load data sets
dfTrain = pd.read_csv("train.csv")
dfTest = pd.read_csv("test.csv")

print("Shape of training data set:", dfTrain.shape)
print("Shape of testing data set: ", dfTest.shape)

Shape of training data set: (1460, 81)
Shape of testing data set:  (1459, 80)


##### It might be a good idea to combine training and testing data sets for EDA purposes, in order to capture all of the odd cases.

In [3]:
# Combine data sets
dfBoth = pd.concat([dfTrain, dfTest], keys=['train', 'test'], names=['dataSet', 'index'])

print("Shape of combined data set:", dfBoth.shape)

Shape of combined data set: (2919, 81)


##### I think it's a good idea to first deal with *Null* values.

In [4]:
# Get a sorted list of the numbers of Null values
missingVals = dfBoth.isnull().sum()
missingVals = missingVals[missingVals > 0]
missingVals.sort_values()

Electrical         1
GarageArea         1
GarageCars         1
Exterior1st        1
Exterior2nd        1
KitchenQual        1
SaleType           1
TotalBsmtSF        1
BsmtFinSF1         1
BsmtUnfSF          1
BsmtFinSF2         1
Utilities          2
Functional         2
BsmtHalfBath       2
BsmtFullBath       2
MSZoning           4
MasVnrArea        23
MasVnrType        24
BsmtFinType1      79
BsmtFinType2      80
BsmtQual          81
BsmtExposure      82
BsmtCond          82
GarageType       157
GarageCond       159
GarageQual       159
GarageFinish     159
GarageYrBlt      159
LotFrontage      486
FireplaceQu     1420
SalePrice       1459
Fence           2348
Alley           2721
MiscFeature     2814
PoolQC          2909
dtype: int64

##### I was tempted to just remove any rows that have Null values in any of the columns with 5 or fewer Null values (Electrical through MSZoning), but I think we can make sense of some of them. For instance, the NaNs in `GarageArea` and `GarageCars` were probably just no garage and can reasonably be replaced with a zero.

In [5]:
# Replace null values with zero
dfBoth.GarageArea.fillna(0, inplace=True)
dfBoth.GarageCars.fillna(0, inplace=True)

In [6]:
# Get the counts for each garage size (in cars)
dfBoth.GarageCars.value_counts()

2.0    1594
1.0     776
3.0     374
0.0     158
4.0      16
5.0       1
Name: GarageCars, dtype: int64

##### Note the 157 zeroes (158 now) indicating no garage. The other Garage-type variables all have 157-159 Null values. We can set up a rule to change those NaNs to `None` for rows where `GarageCars == 0`. 

In [20]:
garageVars = ["GarageType", "GarageCond", "GarageQual", "GarageFinish", "GarageYrBlt"]
for i in garageVars:
    df.loc[df['GarageCars'] == 0.0, i] = 'None'

##### Similarly, Null values in Basement-related variables likely indicate no basement and can also be replaced with a zero.

In [7]:
# # Replace null values with zero
# dfBoth.TotalBsmtSF.fillna(0, inplace=True)
# dfBoth.BsmtFinSF1.fillna(0, inplace=True)
# dfBoth.BsmtFinSF2.fillna(0, inplace=True)
# dfBoth.BsmtUnfSF.fillna(0, inplace=True)
# dfBoth.BsmtHalfBath.fillna(0, inplace=True)
# dfBoth.BsmtFullBath.fillna(0, inplace=True)

In [8]:
# Replace null values with zero
bsmtVars = ["TotalBsmtSF", "BsmtFinSF1", "BsmtFinSF2", 
            "BsmtUnfSF", "BsmtHalfBath", "BsmtFullBath"]
for i in bsmtVars:
    dfBoth[i].fillna(0, inplace=True)

In [9]:
# Get the counts for each garage size (in cars)
dfBoth.BsmtFinType1.value_counts()

Unf    851
GLQ    849
ALQ    429
Rec    288
BLQ    269
LwQ    154
Name: BsmtFinType1, dtype: int64

In [11]:
df = dfBoth.copy()

In [13]:
df[df.GarageType.isnull()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
dataSet,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
train,39,40,90,RL,65.0,6040,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,AdjLand,82000.0
train,48,49,190,RM,33.0,4456,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2009,New,Partial,113000.0
train,78,79,90,RL,72.0,10778,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,136500.0
train,88,89,50,C (all),105.0,8470,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,10,2009,ConLD,Abnorml,85000.0
train,89,90,20,RL,60.0,8070,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,123600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
test,1433,2894,50,C (all),60.0,8520,Grvl,,Reg,Bnk,AllPub,...,0,,,,0,4,2006,WD,Normal,
test,1449,2910,180,RM,21.0,1470,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Normal,
test,1453,2914,160,RM,21.0,1526,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,6,2006,WD,Normal,
test,1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,


In [15]:
df.loc[df['GarageCars'] == 0.0, 'GarageType'] = 'None'

In [16]:
df.loc[df['GarageCars'] == 0.0, 'GarageType']

dataSet  index
train    39       None
         48       None
         78       None
         88       None
         89       None
                  ... 
test     1433     None
         1449     None
         1453     None
         1454     None
         1457     None
Name: GarageType, Length: 158, dtype: object

In [17]:
df[df.GarageType.isnull()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
dataSet,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1


In [21]:
# Get a sorted list of the numbers of Null values
missingVals = dfBoth.isnull().sum()
missingVals = missingVals[missingVals > 0]
missingVals.sort_values()

Electrical         1
KitchenQual        1
Exterior1st        1
Exterior2nd        1
SaleType           1
Functional         2
Utilities          2
MSZoning           4
MasVnrArea        23
MasVnrType        24
BsmtFinType1      79
BsmtFinType2      80
BsmtQual          81
BsmtCond          82
BsmtExposure      82
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageQual       159
GarageCond       159
LotFrontage      486
FireplaceQu     1420
SalePrice       1459
Fence           2348
Alley           2721
MiscFeature     2814
PoolQC          2909
dtype: int64