In [25]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In this notebook, I clean the test data in exactly the same way I cleaned my training data. At the end, both my training and testing data sets will be consistent in their data types and in how null values are handled.

In [3]:
#load in the test CSV (path is relative to the notebook's working directory)
test_csv = pd.read_csv('./datasets/test.csv')

In [28]:
#Select the same columns I chose for my training data.
#.copy() makes test_clean an independent DataFrame, so the .loc assignments in
#the cleaning cells below modify it directly instead of operating on a
#selection of test_csv (which triggers SettingWithCopyWarning).
test_clean = test_csv[['Id',
                       'Lot Area',
                       'Overall Qual',
                       'Year Built',
                       'Year Remod/Add',
                       'Mas Vnr Area',
                       'Exter Qual',
                       'Bsmt Qual',
                       'Gr Liv Area',
                       'Full Bath',
                       'Half Bath',
                       'Total Bsmt SF',
                       'Garage Cars',
                       'Garage Area',
                       'Fireplaces',
                       'TotRms AbvGrd',
                       'Kitchen Qual',
                       ]].copy()

In [29]:
#inspect column dtypes to find the non-numeric (object) columns that need encoding
test_clean.dtypes

Id                  int64
Lot Area            int64
Overall Qual        int64
Year Built          int64
Year Remod/Add      int64
Mas Vnr Area      float64
Exter Qual         object
Bsmt Qual          object
Gr Liv Area         int64
Full Bath           int64
Half Bath           int64
Total Bsmt SF       int64
Garage Cars         int64
Garage Area         int64
Fireplaces          int64
TotRms AbvGrd       int64
Kitchen Qual       object
dtype: object

In [30]:
#count missing values per column before cleaning
test_clean.isna().sum()

Id                 0
Lot Area           0
Overall Qual       0
Year Built         0
Year Remod/Add     0
Mas Vnr Area       1
Exter Qual         0
Bsmt Qual         25
Gr Liv Area        0
Full Bath          0
Half Bath          0
Total Bsmt SF      0
Garage Cars        0
Garage Area        0
Fireplaces         0
TotRms AbvGrd      0
Kitchen Qual       0
dtype: int64

In [31]:
#encode the ordinal quality labels as integer ratings so the object columns become numeric

In [43]:
#change exterior quality to an integer rating scale: Po=1 ... Ex=5, null=0
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
for label, rating in quality_scale.items():
    test_clean.loc[test_clean['Exter Qual'] == label, 'Exter Qual'] = rating
test_clean.loc[test_clean['Exter Qual'].isnull(), 'Exter Qual'] = 0

#make the integer dtype explicit instead of relying on pandas to infer it after
#the assignments above; this raises if a label outside the scale is still
#present rather than silently leaving a mixed-type object column
test_clean['Exter Qual'] = test_clean['Exter Qual'].astype('int64')

test_clean['Exter Qual'].isnull().sum()

0

In [44]:
#sanity check: how many rows fall under each exterior-quality rating
test_clean['Exter Qual'].value_counts()

3    552
4    292
5     26
2      9
Name: Exter Qual, dtype: int64

In [45]:
#change basement quality to an integer rating scale: Po=1 ... Ex=5, null=0
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
for label, rating in quality_scale.items():
    test_clean.loc[test_clean['Bsmt Qual'] == label, 'Bsmt Qual'] = rating
test_clean.loc[test_clean['Bsmt Qual'].isnull(), 'Bsmt Qual'] = 0

#make the integer dtype explicit instead of relying on pandas to infer it after
#the assignments above; this raises if a label outside the scale is still
#present rather than silently leaving a mixed-type object column
test_clean['Bsmt Qual'] = test_clean['Bsmt Qual'].astype('int64')

test_clean['Bsmt Qual'].isnull().sum()

0

In [46]:
#sanity check: how many rows fall under each basement-quality rating (0 = was null)
test_clean['Bsmt Qual'].value_counts()

3    396
4    355
5     74
2     28
0     25
1      1
Name: Bsmt Qual, dtype: int64

In [47]:
#change kitchen quality to an integer rating scale: Po=1 ... Ex=5, null=0
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
for label, rating in quality_scale.items():
    test_clean.loc[test_clean['Kitchen Qual'] == label, 'Kitchen Qual'] = rating
test_clean.loc[test_clean['Kitchen Qual'].isnull(), 'Kitchen Qual'] = 0

#make the integer dtype explicit instead of relying on pandas to infer it after
#the assignments above; this raises if a label outside the scale is still
#present rather than silently leaving a mixed-type object column
test_clean['Kitchen Qual'] = test_clean['Kitchen Qual'].astype('int64')

test_clean['Kitchen Qual'].isnull().sum()

0

In [42]:
#sanity check: how many rows fall under each kitchen-quality rating
test_clean['Kitchen Qual'].value_counts()

3    447
4    354
5     54
2     23
1      1
Name: Kitchen Qual, dtype: int64

In [48]:
#replace missing masonry veneer area with zero, then confirm no nulls remain
test_clean['Mas Vnr Area'] = test_clean['Mas Vnr Area'].fillna(0)
test_clean['Mas Vnr Area'].isna().sum()

0

In [49]:
#replace missing total basement square footage with zero, then confirm no nulls remain
test_clean['Total Bsmt SF'] = test_clean['Total Bsmt SF'].fillna(0)
test_clean['Total Bsmt SF'].isna().sum()

0

In [50]:
#replace missing garage car capacity with zero, then confirm no nulls remain
test_clean['Garage Cars'] = test_clean['Garage Cars'].fillna(0)
test_clean['Garage Cars'].isna().sum()

0

In [51]:
#replace missing garage area with zero, then confirm no nulls remain
test_clean['Garage Area'] = test_clean['Garage Area'].fillna(0)
test_clean['Garage Area'].isna().sum()

0

In [52]:
#confirm that no column has missing values after cleaning
test_clean.isna().sum()

Id                0
Lot Area          0
Overall Qual      0
Year Built        0
Year Remod/Add    0
Mas Vnr Area      0
Exter Qual        0
Bsmt Qual         0
Gr Liv Area       0
Full Bath         0
Half Bath         0
Total Bsmt SF     0
Garage Cars       0
Garage Area       0
Fireplaces        0
TotRms AbvGrd     0
Kitchen Qual      0
dtype: int64

In [54]:
#check to see if test data types are consistent with training data
#(the three quality columns were object before encoding and should now be numeric)
test_clean.dtypes 

Id                  int64
Lot Area            int64
Overall Qual        int64
Year Built          int64
Year Remod/Add      int64
Mas Vnr Area      float64
Exter Qual          int64
Bsmt Qual           int64
Gr Liv Area         int64
Full Bath           int64
Half Bath           int64
Total Bsmt SF       int64
Garage Cars         int64
Garage Area         int64
Fireplaces          int64
TotRms AbvGrd       int64
Kitchen Qual        int64
dtype: object

In [56]:
#export the cleaned test set for use with my prediction model;
#index=True is the pandas default, spelled out here because it also writes the
#row index as an unnamed first column that the reader of this file must handle
test_clean.to_csv('./datasets/test_final.csv', index=True)