In [1]:
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

In [91]:
#Load the Ames training set from the project data folder
ames = pd.read_csv('../data/train.csv')

#Lift the row cap so long outputs (e.g. per-column null counts) print in full
pd.options.display.max_rows = None

In [92]:
#Preview the first five rows of the raw frame
ames.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [124]:
#Count the missing values in every column; the result decides which columns
#get filled and which get dropped
ames.isna().sum()

Id                 0
PID                0
MS SubClass        0
MS Zoning          0
Lot Frontage       0
Lot Area           0
Street             0
Alley              0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
Land Slope         0
Neighborhood       0
Condition 1        0
Condition 2        0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Mas Vnr Type       0
Mas Vnr Area       0
Exter Qual         0
Exter Cond         0
Foundation         0
Bsmt Qual          0
Bsmt Cond          0
Bsmt Exposure      0
BsmtFin Type 1     0
BsmtFin SF 1       0
BsmtFin Type 2     0
BsmtFin SF 2       0
Bsmt Unf SF        0
Total Bsmt SF      0
Heating            0
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area  

In [114]:
#Lot Frontage nulls
#Most rows have a value, so the gaps are imputed with the column mean
#(rounded to a whole number) instead of dropping the column or the rows.
#The discarded .head(200) preview was removed: only the last expression of a
#cell is displayed, so it never produced any output.
lf_mean = ames['Lot Frontage'].mean()  # mean() skips NaN by default
ames['Lot Frontage'] = ames['Lot Frontage'].fillna(round(lf_mean))

In [94]:
#Alley nulls
#Many homes have no alley next to them, so a missing value is kept as its own
#'None' category (the column is filled here, not dropped)
ames['Alley'] = ames['Alley'].fillna(value='None')

In [95]:
#Mas Vnr Type nulls
#A missing veneer type is recorded as the string 'None'.
#(The discarded .head(100) preview was removed; it was never displayed.)
ames['Mas Vnr Type'] = ames['Mas Vnr Type'].fillna('None')


In [96]:
#Mas Vnr Area nulls
#Missing veneer type is filled with 'None', so the matching area becomes 0.
#(The discarded .head(100) preview was removed; it was never displayed.)
ames['Mas Vnr Area'] = ames['Mas Vnr Area'].fillna(0)

In [97]:
#Bsmt Qual nulls
#NA is taken to mean "no basement": fill with 'None' now, map to numbers later.
#(The discarded .head(100) preview was removed; it was never displayed.)
ames['Bsmt Qual'] = ames['Bsmt Qual'].fillna("None")

In [98]:
#Bsmt Cond nulls
#Same null count as Bsmt Qual, so it gets the same treatment:
#fill with 'None' now and map to numbers later
ames['Bsmt Cond'] = ames['Bsmt Cond'].fillna(value='None')

In [99]:
#Bsmt Exposure nulls
#Nulls are filled with 'No'.
#NOTE(review): if 'No' is already a real category in this column, rows that were
#missing become indistinguishable from it - confirm this is intended.
#(The discarded .head(100) preview was removed; it was never displayed.)
ames['Bsmt Exposure'] = ames['Bsmt Exposure'].fillna('No')

In [100]:
#BsmtFin Type 1 nulls
#Filled with 'Unf' (presumably "unfinished") on the reasoning that a missing
#value has no finish at all.
#(The discarded .head(100) preview was removed; it was never displayed.)
ames['BsmtFin Type 1'] = ames['BsmtFin Type 1'].fillna("Unf")


In [101]:
#BsmtFin Type 2 nulls
#Same treatment as BsmtFin Type 1: a missing value is filled with 'Unf'.
#(The discarded .head(10) preview was removed; it was never displayed.)
ames['BsmtFin Type 2'] = ames['BsmtFin Type 2'].fillna("Unf")

In [102]:
#BsmtFin SF 1 nulls
#A missing finished-basement area is filled with 0.
#(The discarded .head() preview was removed; it was never displayed.)
ames['BsmtFin SF 1'] = ames['BsmtFin SF 1'].fillna(0)


In [103]:
#BsmtFin SF 2 nulls
#A missing value is filled with 0, matching BsmtFin SF 1.
#(The discarded .head() preview was removed; it was never displayed.)
ames['BsmtFin SF 2'] = ames['BsmtFin SF 2'].fillna(0)

In [104]:
#Bsmt Unf SF nulls
#A missing unfinished-basement area is filled with 0.
#(The discarded .head(100) preview was removed; it was never displayed.)
ames['Bsmt Unf SF'] = ames['Bsmt Unf SF'].fillna(0)

In [105]:
#Total Bsmt SF nulls
#A missing total basement area is filled with 0.
#(The discarded .head() preview was removed; it was never displayed.)
ames['Total Bsmt SF'] = ames['Total Bsmt SF'].fillna(0)

In [106]:
#Bsmt Full Bath nulls
#A missing count of basement full baths is filled with 0.
#(The discarded .head() preview was removed; it was never displayed.)
ames['Bsmt Full Bath'] = ames['Bsmt Full Bath'].fillna(0)

In [107]:
#Bsmt Half Bath nulls
#A missing count of basement half baths is filled with 0.
#(The discarded .head() preview was removed; it was never displayed.)
ames['Bsmt Half Bath'] = ames['Bsmt Half Bath'].fillna(0)

In [108]:
#Skipping to the last of the columns with only a few nulls:
#garage car count and garage area.
#A missing value is taken to mean there is no garage, so both become 0.
ames[['Garage Cars', 'Garage Area']] = ames[['Garage Cars', 'Garage Area']].fillna(0)

In [109]:
#Fireplace Qu nulls
#NaN marks a house without a fireplace; store 'No' for now and map it later
ames['Fireplace Qu'] = ames['Fireplace Qu'].fillna(value='No')

In [110]:
#Pool QC nulls
#Same convention as Fireplace Qu: NaN means there is no pool, stored as 'No'
ames['Pool QC'] = ames['Pool QC'].fillna(value='No')

In [111]:
#Fence nulls: a missing value is stored as 'No'
ames['Fence'] = ames['Fence'].fillna(value='No')

In [112]:
#Misc Feature nulls: a missing value is stored as 'No'
ames['Misc Feature'] = ames['Misc Feature'].fillna(value='No')

In [121]:
#Garage nulls. A null in any garage column is taken to mean "no garage".
#Categorical/ordinal columns are filled with 'None'; counts and areas with 0.
for garage_col in ['Garage Finish', 'Garage Qual', 'Garage Cond', 'Garage Type']:
    #Each column is filled from itself. 'Garage Qual' used to be assigned the
    #filled 'Garage Finish' series, which replaced every quality rating with
    #finish values.
    ames[garage_col] = ames[garage_col].fillna('None')

for garage_col in ['Garage Cars', 'Garage Area']:
    ames[garage_col] = ames[garage_col].fillna(0)

#Garage Yr Blt: instead of dropping rows, impute with the rounded mean of the
#known garage build years (the mean of 'Garage Yr Blt' itself, not 'Year Built')
y_built = ames['Garage Yr Blt'].mean()  # mean() skips NaN by default
ames['Garage Yr Blt'] = ames['Garage Yr Blt'].fillna(round(y_built))



In [17]:
#List the column labels that remain after cleaning
ames.keys()

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Area', 'Street',
       'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond',
       'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Garage Type',
       'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area',
       'Garage Qual

In [125]:
#Preview the first five rows after the null handling above
ames.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,69.0,13517,Pave,,IR1,Lvl,...,0,0,No,No,No,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,No,No,No,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,No,No,No,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,No,No,No,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,No,No,No,0,3,2010,WD,138500


In [126]:
#Write the cleaned frame to disk; the row index is written as the first column.
#NOTE(review): recent pandas versions parse the literal string 'None' as NaN
#when reading a CSV with default settings - confirm whichever notebook loads
#this file keeps the 'None' placeholders (e.g. keep_default_na=False).
ames.to_csv(path_or_buf='../data/clean_ames.csv', index=True)