In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

### Load the Data

In [64]:
# Build the CSV paths relative to the kernel's working directory:
# the Data folder lives in the parent of the current directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_path, test_path = (
    os.path.join(parent_dir, 'Data', file_name)
    for file_name in ('train.csv', 'test.csv')
)

# Read the training and test tables; the first CSV column becomes the row index
data, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (data_path, test_path)
)

# Split the training table into the target (y) and the feature matrix (X)
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

### Examine All Features

In [65]:
# Report the dimensions of the training features and of the test set
for label, frame in (("The shape of X is", X), ("The shape of test set is", test)):
    print(label, frame.shape)

# List every feature name present in X
print("The columns of X are:\n", X.columns)

The shape of X is (1460, 79)
The shape of test set is (1459, 79)
The columns of X are:
 Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageF

In [66]:
# Partition the feature names by dtype: numeric columns vs. object-dtype columns
num_name = X.select_dtypes(include=np.number).columns
cat_name = X.select_dtypes(include=np.object_).columns

# Report how many features fall into each group
print(len(num_name), "numerical variables")
print(len(cat_name), "categorical variables")

36 numerical variables
43 categorical variables


### Missing Values

In [67]:
# Training set: count the null entries in each feature column and
# keep only the features that have at least one
col_missing = X.isna().sum().loc[lambda counts: counts > 0]
print(col_missing)
print()

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64



In [68]:
# Test set: per-feature null counts, restricted to features with gaps
test_null_counts = test.isnull().sum(axis=0)
col_missing_test = test_null_counts[test_null_counts.gt(0)]
print(col_missing_test)

# Number of test-set features that contain missing entries
col_missing_test.shape[0]

MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType       894
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64


33

In [69]:
# Collect every training row that has at least one null entry
# (in this data set that turns out to be every row).
# The null check is done once on the whole frame; the selected rows are
# then materialised as a list of Series, one per row.
has_missing = X.isnull().any(axis=1)
row_with_missing = [row for _, row in X[has_missing].iterrows()]
len(row_with_missing)

1460

### Four Feature Sets

#### Set 1: Numerical Variables without missing entries

In [70]:
# Numeric features with no missing entries in either the training or the test set
incomplete_cols = col_missing.index.union(col_missing_test.index)
num_no_missing = num_name.difference(incomplete_cols)
len(num_no_missing)

25

#### Set 2: All Numerical Variables

In [78]:
# Number of numeric features in X (no filtering on missing entries)
num_name.shape[0]

36

#### Set 3: Numerical and Categorical Variables without missing entries

In [71]:
# Features of any dtype that have no missing entries in either data set:
# start from all columns and strip out those with gaps, one data set at a time
var_no_missing = X.columns
for gaps in (col_missing, col_missing_test):
    var_no_missing = var_no_missing.difference(gaps.index)
len(var_no_missing)

45

#### Set 4: Final Data

In [72]:
# TODO: handle the remaining missing values, apply ordinal encoding, etc.

### Save Clean Data

In [79]:
# Output locations, in the same Data directory the raw CSVs were read from
clean_data_path, clean_test_path = (
    os.path.join(parent_dir, 'Data', file_name)
    for file_name in ('train_clean.csv', 'test_clean.csv')
)

# Training output: every numeric feature plus the target column
clean_feat = [*num_name, y.name]
data.loc[:, clean_feat].to_csv(clean_data_path)

# Test output: the same numeric features, without the target
test.loc[:, list(num_name)].to_csv(clean_test_path)