# Project 2 - Ames Housing Data and Kaggle Challenge - Cleaning
## Matt Reed / DSI-124


In [572]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [573]:
def dummy_corr(column_name, dataframe, target_name):
    return dataframe[column_name].str.get_dummies().join(dataframe[target_name]).corr()[target_name].sort_values(ascending=False)[1:]

In [574]:
# Approach found at https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
def flatten(t):
    return [item for sublist in t for item in sublist]

In [575]:
df_train = pd.read_csv('../datasets/train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [576]:
# MS SubClass uses numbers for labels; should be treated as str
df_train['MS SubClass'] = df_train['MS SubClass'].astype(str)

In [577]:
# Looking at null values
df_train.isnull().sum().sort_values(ascending=False)

Pool QC         2042
Misc Feature    1986
Alley           1911
Fence           1651
Fireplace Qu    1000
                ... 
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
SalePrice          0
Length: 81, dtype: int64

In [578]:
# Realized that Year/Month Sold was obfuscated by being treated as an integer
df_train['Yr Sold'] = df_train['Yr Sold'].astype(str)
df_train['Mo Sold'] = df_train['Mo Sold'].astype(str)

In [579]:
df_train.select_dtypes(include=['object']).describe().transpose()

Unnamed: 0,count,unique,top,freq
MS SubClass,2051,16,20,770
MS Zoning,2051,7,RL,1598
Street,2051,2,Pave,2044
Alley,140,2,Grvl,85
Lot Shape,2051,4,Reg,1295
Land Contour,2051,4,Lvl,1843
Utilities,2051,3,AllPub,2049
Lot Config,2051,5,Inside,1503
Land Slope,2051,3,Gtl,1953
Neighborhood,2051,28,NAmes,310


### Section Added Following Model Performance Analysis
##### Inspecting Outliers prior to cleaning

In [606]:
outliers = df_train.iloc[[1022, 1198, 607, 342, 1548, 1587, 1134, 196],:]

In [607]:
low_resid = df_train.iloc[[1980, 353, 824, 874, 462, 1040, 1619, 1202, 300, 1433],:]

In [608]:
outlier_data = outliers.append(low_resid)

In [609]:
pd.set_option("display.max_columns", None)

In [610]:
pd.DataFrame(outlier_data, columns=df_train.columns)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
1022,2703,904301410,30,RL,55.0,8250,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,1Fam,1Story,5,7,1935,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,Slab,,,,,0.0,,0.0,0.0,0.0,GasA,TA,N,SBrkr,1032,0,0,1032,0.0,0.0,1,0,2,1,TA,6,Typ,1,TA,Detchd,1939.0,Unf,1.0,260.0,TA,TA,Y,0,0,121,0,0,0,,,,0,6,2006,WD,125000
1198,2079,905426010,20,RL,84.0,12615,Pave,,Reg,Lvl,AllPub,Corner,Gtl,Edwards,Norm,Norm,1Fam,1Story,6,7,1950,2001,Gable,CompShg,WdShing,Wd Shng,,0.0,TA,TA,CBlock,TA,Gd,Av,ALQ,477.0,Unf,0.0,725.0,1202.0,GasA,TA,Y,SBrkr,2158,0,0,2158,1.0,0.0,2,0,4,1,Gd,7,Typ,1,Gd,Attchd,1950.0,Unf,2.0,576.0,TA,TA,Y,0,29,39,0,0,0,,MnPrv,,0,6,2007,WD,243000
607,1473,907405020,20,RL,70.0,9135,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,1Story,7,5,2002,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,113.0,Gd,TA,PConc,Gd,TA,Av,GLQ,810.0,Unf,0.0,726.0,1536.0,GasA,Ex,Y,SBrkr,1536,0,0,1536,1.0,0.0,2,0,3,1,Gd,7,Typ,0,,Attchd,2002.0,RFn,2.0,532.0,TA,TA,Y,192,74,0,0,0,0,,,,0,12,2008,WD,214000
342,1045,527451400,160,RM,21.0,1680,Pave,,Reg,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,TwnhsE,2Story,6,3,1971,1971,Gable,CompShg,HdBoard,HdBoard,BrkFace,604.0,TA,TA,CBlock,TA,TA,No,ALQ,358.0,Unf,0.0,125.0,483.0,GasA,TA,Y,SBrkr,483,504,0,987,0.0,0.0,1,1,2,1,TA,5,Typ,0,,Detchd,1971.0,Unf,1.0,264.0,TA,TA,Y,0,0,0,0,0,0,,,,0,8,2008,WD,89500
1548,2227,909452102,20,RL,,17871,Pave,,IR2,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,1Story,4,5,1995,1996,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,1680.0,1680.0,GasA,Gd,Y,SBrkr,1680,0,0,1680,0.0,0.0,2,0,4,1,Gd,7,Typ,0,,Attchd,1996.0,Unf,2.0,628.0,TA,TA,Y,152,0,0,0,0,0,,,,0,6,2007,WD,170000
1587,113,534152050,20,RL,,10603,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,7,1977,2001,Gable,CompShg,Plywood,Plywood,BrkFace,28.0,TA,TA,PConc,TA,TA,Mn,ALQ,1200.0,Unf,0.0,410.0,1610.0,GasA,Gd,Y,SBrkr,1610,0,0,1610,1.0,0.0,2,0,3,1,Gd,6,Typ,2,TA,Attchd,1977.0,RFn,2.0,480.0,TA,TA,Y,168,68,0,0,0,0,,,,0,2,2010,WD,205000
1134,1998,902330090,75,RM,75.0,13500,Pave,Grvl,Reg,Lvl,AllPub,Corner,Gtl,OldTown,Norm,Norm,1Fam,2.5Unf,7,8,1879,1987,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,Gd,PConc,TA,TA,No,Unf,0.0,Unf,0.0,819.0,819.0,GasA,TA,Y,FuseA,1312,1142,0,2454,0.0,0.0,2,0,3,1,TA,8,Typ,1,Gd,Attchd,1950.0,Unf,2.0,576.0,TA,TA,N,0,148,150,0,0,0,,MnPrv,,0,2,2007,WD,185000
196,543,531376090,20,RL,61.0,7328,Pave,,Reg,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,Norm,1Fam,1Story,7,5,2008,2009,Gable,CompShg,VinylSd,VinylSd,BrkFace,140.0,Gd,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,1450.0,1450.0,GasA,Ex,Y,SBrkr,1450,0,0,1450,0.0,0.0,2,0,2,1,Gd,6,Typ,0,,Attchd,2008.0,RFn,3.0,788.0,TA,TA,Y,0,93,0,0,0,0,,,,0,2,2009,New,224243
1980,477,528235200,80,RL,59.0,9434,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,SLvl,7,5,2004,2005,Gable,CompShg,WdShing,Wd Shng,,0.0,Gd,TA,PConc,Gd,TA,Mn,Unf,0.0,Unf,0.0,384.0,384.0,GasA,Ex,Y,SBrkr,744,630,0,1374,0.0,0.0,2,1,3,1,Gd,6,Typ,1,Gd,BuiltIn,2004.0,Fin,2.0,400.0,TA,TA,Y,100,0,0,0,0,0,,,,0,8,2009,WD,170000
353,158,535353190,50,RL,78.0,17503,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Artery,Norm,1Fam,1.5Fin,6,5,1948,1950,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,912.0,912.0,GasA,TA,Y,SBrkr,912,546,0,1458,0.0,1.0,1,0,3,1,TA,6,Typ,1,Gd,Attchd,1948.0,Unf,1.0,330.0,TA,TA,Y,192,0,0,0,0,0,,,,0,1,2010,WD,97500


In [533]:
df_train['Yr Sold'].describe()

count     2051
unique       5
top       2007
freq       498
Name: Yr Sold, dtype: object

In [534]:
dummy_corr('MS SubClass', df_train, 'SalePrice').max()

0.3554214620632621

In [535]:
df_train.select_dtypes(include=['object']).columns

Index(['MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Lot Shape',
       'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood',
       'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating',
       'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Mo Sold', 'Yr Sold', 'Sale Type'],
      dtype='object')

In [536]:
# Creating a list of the highest correlations based on dummifying categorical variables in my dataframe
categorical_corr = [(column_name, 
                     dummy_corr(column_name, df_train, 'SalePrice').max(), 
                     dummy_corr(column_name, df_train, 'SalePrice').min(), 
                     dummy_corr(column_name, df_train, 'SalePrice').max() - dummy_corr(column_name, df_train, 'SalePrice').min()) 
 for column_name 
 in df_train.select_dtypes(include=['object']).columns]

In [537]:
# Sort approach found on https://pythonguides.com/python-sort-list-of-tuples/
cat_corr_diff = categorical_corr.copy()

cat_corr_diff.sort(reverse=True, key= lambda x: x[1])
cat_corr_diff[:]

[('Bsmt Qual', 0.586497229460346, -0.45697898944878185, 1.0434762189091278),
 ('Kitchen Qual', 0.5512844949973896, -0.540860057110634, 1.0921445521080235),
 ('Foundation', 0.5290468529844157, -0.35553521520011966, 0.8845820681845353),
 ('Exter Qual', 0.49386101668262256, -0.6003620438785177, 1.0942230605611403),
 ('BsmtFin Type 1',
  0.4635488990371277,
  -0.15091467927234825,
  0.614463578309476),
 ('Heating QC', 0.4532553529286872, -0.3435415579170743, 0.7967969108457615),
 ('Neighborhood', 0.4486468134029907, -0.2083710127541118, 0.6570178261571025),
 ('Garage Finish',
  0.4229363608708119,
  -0.43222009547586315,
  0.855156456346675),
 ('Fireplace Qu',
  0.38473214276053636,
  -0.06598356800263537,
  0.45071571076317174),
 ('Bsmt Exposure',
  0.3770317587162424,
  -0.2931059611409868,
  0.6701377198572291),
 ('Sale Type', 0.35810196911735687, -0.21254174769346018, 0.570643716810817),
 ('Garage Type', 0.35787902815727923, -0.3703444386576489, 0.7282234668149281),
 ('MS SubClass', 0.

In [538]:
cat_corr_diff.sort(reverse=False, key= lambda x: x[2])
cat_corr_diff[:]

[('Exter Qual', 0.49386101668262256, -0.6003620438785177, 1.0942230605611403),
 ('Kitchen Qual', 0.5512844949973896, -0.540860057110634, 1.0921445521080235),
 ('Bsmt Qual', 0.586497229460346, -0.45697898944878185, 1.0434762189091278),
 ('Garage Finish',
  0.4229363608708119,
  -0.43222009547586315,
  0.855156456346675),
 ('Mas Vnr Type', 0.31002603603789, -0.4240875482859905, 0.7341135843238804),
 ('Garage Type', 0.35787902815727923, -0.3703444386576489, 0.7282234668149281),
 ('Foundation', 0.5290468529844157, -0.35553521520011966, 0.8845820681845353),
 ('Heating QC', 0.4532553529286872, -0.3435415579170743, 0.7967969108457615),
 ('Lot Shape', 0.2735744444208477, -0.306290826001016, 0.5798652704218636),
 ('Bsmt Exposure',
  0.3770317587162424,
  -0.2931059611409868,
  0.6701377198572291),
 ('MS Zoning', 0.23146818258809063, -0.2819514318924289, 0.5134196144805195),
 ('Central Air',
  0.27737780614516555,
  -0.27737780614516583,
  0.5547556122903314),
 ('Paved Drive', 0.2892096014539491

In [539]:
corr_price = df_train.corr()['SalePrice']
corr_price_sorted = corr_price[:-1].sort_values(ascending=False)
corr_price_sorted

Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
Garage Yr Blt      0.533922
Mas Vnr Area       0.512230
TotRms AbvGrd      0.504014
Fireplaces         0.471093
BsmtFin SF 1       0.423519
Lot Frontage       0.341842
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Area           0.296566
Bsmt Full Bath     0.283662
Half Bath          0.283001
2nd Flr SF         0.248452
Bsmt Unf SF        0.190210
Bedroom AbvGr      0.137067
Screen Porch       0.134581
3Ssn Porch         0.048732
Pool Area          0.023106
BsmtFin SF 2       0.016255
Misc Val          -0.007375
Low Qual Fin SF   -0.041594
Bsmt Half Bath    -0.045328
Id                -0.051398
Overall Cond      -0.097019
Kitchen AbvGr     -0.125444
Enclosed Porch    -0.135656
PID               -0.255052
Name: SalePrice, dty

Features of Interest:

In [540]:
corr_top = list(corr_price_sorted[corr_price_sorted > .5].index.values)
corr_top

['Overall Qual',
 'Gr Liv Area',
 'Garage Area',
 'Garage Cars',
 'Total Bsmt SF',
 '1st Flr SF',
 'Year Built',
 'Year Remod/Add',
 'Full Bath',
 'Garage Yr Blt',
 'Mas Vnr Area',
 'TotRms AbvGrd']

In [541]:
cat_corr_top = [variable for variable, high, low, diff in categorical_corr if (high or abs(low)) > .3]
cat_corr_top

['MS SubClass',
 'Neighborhood',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Exter Qual',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'Heating QC',
 'Kitchen Qual',
 'Fireplace Qu',
 'Garage Type',
 'Garage Finish',
 'Sale Type']

In [542]:
list_of_lists = [['Id'], corr_top, cat_corr_top, ['Mo Sold', 'Yr Sold'], ['SalePrice']]
columns_of_interest = flatten(list_of_lists)

In [543]:
columns_of_interest

['Id',
 'Overall Qual',
 'Gr Liv Area',
 'Garage Area',
 'Garage Cars',
 'Total Bsmt SF',
 '1st Flr SF',
 'Year Built',
 'Year Remod/Add',
 'Full Bath',
 'Garage Yr Blt',
 'Mas Vnr Area',
 'TotRms AbvGrd',
 'MS SubClass',
 'Neighborhood',
 'Exterior 1st',
 'Exterior 2nd',
 'Mas Vnr Type',
 'Exter Qual',
 'Foundation',
 'Bsmt Qual',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'Heating QC',
 'Kitchen Qual',
 'Fireplace Qu',
 'Garage Type',
 'Garage Finish',
 'Sale Type',
 'Mo Sold',
 'Yr Sold',
 'SalePrice']

In [544]:
df_reduced = df_train[columns_of_interest]
df_reduced.head()

Unnamed: 0,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,...,BsmtFin Type 1,Heating QC,Kitchen Qual,Fireplace Qu,Garage Type,Garage Finish,Sale Type,Mo Sold,Yr Sold,SalePrice
0,109,6,1479,475.0,2.0,725.0,725,1976,2005,2,...,GLQ,Ex,Gd,,Attchd,RFn,WD,3,2010,130500
1,544,7,2122,559.0,2.0,913.0,913,1996,1997,2,...,GLQ,Ex,Gd,TA,Attchd,RFn,WD,4,2009,220000
2,153,5,1057,246.0,1.0,1057.0,1057,1953,2007,1,...,GLQ,TA,Gd,,Detchd,Unf,WD,1,2010,109000
3,318,5,1444,400.0,2.0,384.0,744,2006,2007,2,...,Unf,Gd,TA,,BuiltIn,Fin,WD,4,2010,174000
4,255,6,1445,484.0,2.0,676.0,831,1900,1993,2,...,Unf,TA,TA,,Detchd,Unf,WD,3,2010,138500


In [545]:
df_kaggle = pd.read_csv('../datasets/test.csv')
df_kaggle_reduced = df_test[columns_of_interest[:-1]]
df_kaggle_reduced.head()

Unnamed: 0,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,...,Bsmt Exposure,BsmtFin Type 1,Heating QC,Kitchen Qual,Fireplace Qu,Garage Type,Garage Finish,Sale Type,Mo Sold,Yr Sold
0,2658,6,1928,440,1,1020,908,1910,1950,2,...,No,Unf,Gd,Fa,,Detchd,Unf,WD,4,2006
1,2718,5,1967,580,2,1967,1967,1977,1977,2,...,No,Unf,TA,TA,,Attchd,Fin,WD,8,2006
2,2414,7,1496,426,2,654,664,2006,2006,2,...,Av,GLQ,Ex,Gd,Gd,Attchd,RFn,New,9,2006
3,1989,5,968,480,2,968,968,1923,2006,1,...,No,Unf,TA,TA,,Detchd,Unf,WD,7,2007
4,625,6,1394,514,2,1394,1394,1963,1963,1,...,No,BLQ,Gd,TA,Gd,Attchd,RFn,WD,7,2009


In [546]:
dummy_columns = df_reduced[columns_of_interest].select_dtypes(include=['object']).columns

In [547]:
# Dummify categorical variables
df_reduced = pd.get_dummies(df_reduced, columns=dummy_columns, drop_first=True, )

In [548]:
# Dummify test dataframe identically to main dataframe
df_test_reduced = pd.get_dummies(df_test_reduced, columns=dummy_columns, drop_first=True)

In [549]:
df_reduced.head()

Unnamed: 0,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,...,Mo Sold_4,Mo Sold_5,Mo Sold_6,Mo Sold_7,Mo Sold_8,Mo Sold_9,Yr Sold_2007,Yr Sold_2008,Yr Sold_2009,Yr Sold_2010
0,109,6,1479,475.0,2.0,725.0,725,1976,2005,2,...,0,0,0,0,0,0,0,0,0,1
1,544,7,2122,559.0,2.0,913.0,913,1996,1997,2,...,1,0,0,0,0,0,0,0,1,0
2,153,5,1057,246.0,1.0,1057.0,1057,1953,2007,1,...,0,0,0,0,0,0,0,0,0,1
3,318,5,1444,400.0,2.0,384.0,744,2006,2007,2,...,1,0,0,0,0,0,0,0,0,1
4,255,6,1445,484.0,2.0,676.0,831,1900,1993,2,...,0,0,0,0,0,0,0,0,0,1


In [550]:
# Approach found at https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data

# Get missing columns in the training test
missing_cols = set(df_reduced.columns) - set(df_test_reduced.columns)
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    df_test_reduced[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
df_test_reduced = df_test_reduced[df_reduced.columns]

In [551]:
df_test_reduced

Unnamed: 0,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,...,Mo Sold_4,Mo Sold_5,Mo Sold_6,Mo Sold_7,Mo Sold_8,Mo Sold_9,Yr Sold_2007,Yr Sold_2008,Yr Sold_2009,Yr Sold_2010
0,2658,6,1928,440,1,1020,908,1910,1950,2,...,1,0,0,0,0,0,0,0,0,0
1,2718,5,1967,580,2,1967,1967,1977,1977,2,...,0,0,0,0,1,0,0,0,0,0
2,2414,7,1496,426,2,654,664,2006,2006,2,...,0,0,0,0,0,1,0,0,0,0
3,1989,5,968,480,2,968,968,1923,2006,1,...,0,0,0,1,0,0,1,0,0,0
4,625,6,1394,514,2,1394,1394,1963,1963,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,6,1877,488,2,1084,1084,1974,1974,2,...,0,0,0,0,0,0,1,0,0,0
874,1234,6,1988,480,2,1104,1104,1966,1999,2,...,0,0,0,0,1,0,0,1,0,0
875,1373,5,1211,322,1,952,1211,1968,1968,1,...,0,0,0,0,1,0,0,1,0,0
876,1672,4,864,528,2,864,864,1971,1971,1,...,0,1,0,0,0,0,1,0,0,0


In [555]:
df_reduced.isnull().sum().sort_values(ascending=False)

Garage Yr Blt           114
Mas Vnr Area             22
Garage Area               1
Garage Cars               1
Total Bsmt SF             1
                       ... 
Neighborhood_StoneBr      0
Neighborhood_Timber       0
Neighborhood_Veenker      0
Exterior 1st_AsphShn      0
Yr Sold_2010              0
Length: 148, dtype: int64

In [558]:
df_reduced[df_reduced['Garage Yr Blt'].isnull()]

Unnamed: 0,Id,Overall Qual,Gr Liv Area,Garage Area,Garage Cars,Total Bsmt SF,1st Flr SF,Year Built,Year Remod/Add,Full Bath,...,Mo Sold_4,Mo Sold_5,Mo Sold_6,Mo Sold_7,Mo Sold_8,Mo Sold_9,Yr Sold_2007,Yr Sold_2008,Yr Sold_2009,Yr Sold_2010
28,2243,5,1991,0.0,0.0,957.0,1034,1895,2006,2,...,0,0,1,0,0,0,1,0,0,0
53,330,4,1092,0.0,0.0,546.0,546,1970,1970,1,...,0,0,0,0,0,0,0,0,0,1
65,2278,5,1120,0.0,0.0,1120.0,1120,2007,2007,1,...,0,0,0,0,0,0,1,0,0,0
79,2235,5,1601,0.0,0.0,936.0,936,1925,2003,2,...,0,0,0,0,0,1,1,0,0,0
101,2084,4,605,0.0,0.0,528.0,605,1920,2002,1,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,325,6,1824,0.0,0.0,912.0,912,1971,1971,2,...,1,0,0,0,0,0,0,0,0,1
2010,2880,3,729,0.0,0.0,0.0,729,1945,1950,1,...,0,0,0,0,0,0,0,0,0,0
2027,2628,5,1556,0.0,0.0,1556.0,1556,1960,1960,2,...,0,0,0,0,0,1,0,0,0,0
2039,2288,4,1092,0.0,0.0,546.0,546,1970,1970,1,...,0,1,0,0,0,0,1,0,0,0


In [562]:
# From https://stackoverflow.com/questions/35530640/pandas-use-value-if-not-null-else-use-value-from-next-column
df_reduced['Garage Yr Blt'] = df_reduced['Garage Yr Blt'].fillna(df_reduced['Year Built'])

In [565]:
df_reduced.isnull().sum().sort_values(ascending=False)

Mas Vnr Area            22
Garage Area              1
Garage Cars              1
Total Bsmt SF            1
BsmtFin Type 1_GLQ       0
                        ..
Neighborhood_Somerst     0
Neighborhood_StoneBr     0
Neighborhood_Timber      0
Neighborhood_Veenker     0
Yr Sold_2010             0
Length: 148, dtype: int64

In [566]:
# Relatively small number of null values
df_reduced.isnull().sum().sum()/len(df_reduced.index)

0.01218917601170161

In [567]:
df_reduced.dropna(inplace=True)

In [568]:
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2027 entries, 0 to 2050
Columns: 148 entries, Id to Yr Sold_2010
dtypes: float64(5), int64(9), uint8(134)
memory usage: 502.8 KB


In [569]:
df_reduced.to_csv('../datasets/filtered_dataset.csv', index=False)

In [570]:
df_test_reduced.to_csv('../datasets/filtered_test_dataset.csv', index=False)