### _Data Cleaning:_

In [None]:
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

In [4]:
# Read in data
# Setting display.max_rows to None removes pandas' row limit, so long outputs
# (such as the per-column null counts further down) are printed in full.
pd.set_option('display.max_rows', None)

# Load the raw training data from the repo-relative data directory.
ames = pd.read_csv('../data/train.csv')

In [5]:
# Preview the first rows of the raw frame (head() defaults to 5 rows).
ames.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
# (rows, columns) of the raw frame.
ames.shape

(2051, 81)

In [8]:
# Count missing values per column.
# NOTE(review): the saved output for this cell shows 0 nulls for every column
# even though the head() preview above shows NaN in Lot Frontage — presumably
# this cell was last run after ames_clean had already mutated `ames`.
# Restart the kernel and run all cells to confirm the pre-cleaning counts.
ames.isnull().sum()

Id                 0
PID                0
MS SubClass        0
MS Zoning          0
Lot Frontage       0
Lot Area           0
Street             0
Alley              0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
Land Slope         0
Neighborhood       0
Condition 1        0
Condition 2        0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Mas Vnr Type       0
Mas Vnr Area       0
Exter Qual         0
Exter Cond         0
Foundation         0
Bsmt Qual          0
Bsmt Cond          0
Bsmt Exposure      0
BsmtFin Type 1     0
BsmtFin SF 1       0
BsmtFin Type 2     0
BsmtFin SF 2       0
Bsmt Unf SF        0
Total Bsmt SF      0
Heating            0
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area  

I decided to move all of the cleaning steps for the dataset into a single function, both for readability and to make the notebook more aesthetically pleasing.

In [5]:
def ames_clean(df):
    """Fill the known missing values of an Ames housing frame in place.

    Every column listed below must exist in ``df``; a missing column raises
    ``KeyError``. Only null entries are replaced, existing values are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the Ames housing columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` of the cleaned frame, as a quick visual check. The full
        cleaned data lives in the ``df`` object that was passed in.
    """
    # Null in these columns is treated as "the house does not have this
    # feature", so each gets an explicit placeholder (categorical columns) or
    # 0 (counts and areas). The categorical placeholders are mapped later.
    constant_fills = {
        # Alley: many homes have no alley next to them.
        'Alley': 'None',
        # Masonry veneer: no type recorded -> 'None', and therefore area 0.
        'Mas Vnr Type': 'None',
        'Mas Vnr Area': 0,
        # Basement: null quality/condition is treated as "no basement".
        'Bsmt Qual': 'None',
        'Bsmt Cond': 'None',
        'Bsmt Exposure': 'No',
        # No finish recorded is treated as unfinished.
        'BsmtFin Type 1': 'Unf',
        'BsmtFin Type 2': 'Unf',
        'BsmtFin SF 1': 0,
        'BsmtFin SF 2': 0,
        'Bsmt Unf SF': 0,
        'Total Bsmt SF': 0,
        'Bsmt Full Bath': 0,
        'Bsmt Half Bath': 0,
        # Null means no fireplace / pool / fence / misc feature.
        'Fireplace Qu': 'No',
        'Pool QC': 'No',
        'Fence': 'No',
        'Misc Feature': 'No',
        # Garage: null is treated as "no garage".
        'Garage Finish': 'None',
        'Garage Qual': 'None',
        'Garage Cond': 'None',
        'Garage Type': 'None',
        'Garage Cars': 0,
        'Garage Area': 0,
    }
    for column, fill_value in constant_fills.items():
        # Each column is filled from itself, on the frame passed in, so the
        # function never reads a notebook global or another column.
        df[column] = df[column].fillna(fill_value)

    # Numeric columns where dropping rows would lose too much data: impute the
    # rounded column mean instead.
    for column in ('Lot Frontage', 'Garage Yr Blt'):
        df[column] = df[column].fillna(round(df[column].mean()))

    return df.head()

In [6]:
# Clean `ames` in place; the function returns df.head(), so the output shown
# below is only a preview of the cleaned frame.
ames_clean(ames)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,69.0,13517,Pave,,IR1,Lvl,...,0,0,No,No,No,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,No,No,No,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,No,No,No,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,No,No,No,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,No,No,No,0,3,2010,WD,138500


In [7]:
# Re-count missing values per column to verify the cleaning step.
ames.isnull().sum()

Id                 0
PID                0
MS SubClass        0
MS Zoning          0
Lot Frontage       0
Lot Area           0
Street             0
Alley              0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
Land Slope         0
Neighborhood       0
Condition 1        0
Condition 2        0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Mas Vnr Type       0
Mas Vnr Area       0
Exter Qual         0
Exter Cond         0
Foundation         0
Bsmt Qual          0
Bsmt Cond          0
Bsmt Exposure      0
BsmtFin Type 1     0
BsmtFin SF 1       0
BsmtFin Type 2     0
BsmtFin SF 2       0
Bsmt Unf SF        0
Total Bsmt SF      0
Heating            0
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area  

In [29]:
# Persist the cleaned frame for later use.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, which read_csv will load back as 'Unnamed: 0'. Pass index=False if
# downstream readers do not expect that column — confirm against consumers.
ames.to_csv('../data/clean_ames.csv')