
# Capstone project 1: House prices advanced regression

In [1]:
#-----------------------------------------------------------------------------#
import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns
from IPython import display
from ipywidgets import interact, widgets
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [2]:
# Read the training set from disk; the 'Id' column becomes the row index.
train_csv = './data/train.csv'
df = pd.read_csv(train_csv, index_col=['Id'])

In [3]:
# (rows, columns) of the freshly loaded training frame
df.shape

(1460, 80)

## Data Cleaning, Wrangling and EDA (Exploratory Data Analysis)

In [4]:
# Per-column overview: non-null count and dtype (shows which columns have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-

In [4]:
# Columns loaded with dtype 'object' are treated as categorical features.
# They are only collected here; no dtype is changed in this cell.
is_object = (df.dtypes == 'object')
CatCols = df.columns[is_object].tolist()
len(CatCols)

43

In [5]:
# 'MSSubClass' is also a categorical column, even though its dtype is int64.
# The membership check keeps this cell idempotent: re-running it must not
# append a second 'MSSubClass' entry (which would later duplicate the column).
if 'MSSubClass' not in CatCols:
    CatCols.append('MSSubClass')
df['MSSubClass'] = df['MSSubClass'].astype('str')

In [6]:
# Every column that was not flagged as categorical is treated as numerical.
categorical_lookup = set(CatCols)
NumCols = list(filter(lambda name: name not in categorical_lookup, df.columns))
len(NumCols)

36

In [7]:
# Replace missing values: 0 for numerical columns, the string '0' for categorical ones.
fill_values = {name: 0 for name in NumCols}
fill_values.update({name: '0' for name in CatCols})
df = df.fillna(value=fill_values)

In [8]:
# Rebuild df with all categorical columns first (cast to 'category'),
# followed by all numerical columns. Both pieces share the same 'Id' index,
# so a plain index join lines the rows up.
df_num = df[NumCols]
df_cat = df[CatCols].astype('category')
df = df_cat.join(df_num)
df.head(1)

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,0,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,0,61,0,0,0,0,0,2,2008,208500


In [9]:
# Turn calendar years into ages measured from the year of sale.
year_columns = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
df = df.assign(**{name: df['YrSold'] - df[name] for name in year_columns})

In [10]:
# Houses without a garage had GarageYrBlt filled with 0 *before* the year -> age
# conversion, so their "age" is now YrSold - 0 == YrSold (about 2006-2010), not 0.
# An age of exactly 0 is a real garage built in the year of sale and must be kept.
no_garage = df['GarageYrBlt'] == df['YrSold']
# Sentinel for "no garage": 10 years older than the oldest real garage.
# The max is taken over real garages only, otherwise the bogus ~2008 values win.
maxage = df.loc[~no_garage, 'GarageYrBlt'].max() + 10
df.loc[no_garage, 'GarageYrBlt'] = maxage

In [11]:
# Build a datetime column for the sale date from the year and month of sale.
year_text = df['YrSold'].astype(str)
month_text = df['MoSold'].astype(str)
SoldTime = year_text + '-' + month_text
df['SoldTime'] = pd.to_datetime(SoldTime)
df['SoldTime'].head()

Id
1   2008-02-01
2   2007-05-01
3   2008-09-01
4   2006-02-01
5   2008-12-01
Name: SoldTime, dtype: datetime64[ns]

In [12]:
# Earliest and latest sale year present in the data
sale_years = df['YrSold']
sale_years.min(), sale_years.max()

(2006, 2010)

In [13]:
# Write the cleaned frame to disk
df.to_csv('./data/df_clean.csv')

In [14]:
# Target: the sale price. This is a Series (already named 'SalePrice'); a Series
# has no `columns`, so the former `Y.columns = [...]` only set a stray attribute
# and has been removed.
Y = df['SalePrice']
# Features: everything except the target and the helper datetime column.
X = df.drop(['SalePrice', 'SoldTime'], axis=1)
X.to_csv('./data/X.csv')

In [15]:
# Map positional names 'x0', 'x1', ... to the categorical column names
# (in CatCols order) and write the mapping to disk as JSON.
cols = {'x' + str(position): name for position, name in enumerate(CatCols)}
xcols = list(cols)
with open('column_map.json', 'w') as handle:
    json.dump(cols, handle)

In [16]:
# Boolean mask over X's columns: True where the column has 'category' dtype
cat_mask = X.dtypes.eq('category')

In [17]:
# Use LabelEncoder to learn an integer code for every level of each categorical
# column. Only the label -> code mapping is recorded here; X_le starts out empty
# and is populated from these mappings in a later cell.
# (Previously the fitted encoder object itself was assigned into X_le[col],
# which stored no encoded data and was overwritten afterwards.)
le = LabelEncoder()
X_le = pd.DataFrame({})
le_map = {}
for col in CatCols:
    le.fit(X[col])
    # str keys / plain int codes keep the mapping JSON-serialisable
    le_map[col] = {str(label): code for code, label in enumerate(le.classes_)}


In [18]:
# Write the per-column label -> code mappings to disk as JSON
with open('le_map.json', 'w') as handle:
    json.dump(le_map, handle)

In [19]:
# Fill X_le: categorical columns become their integer codes, and the numerical
# columns are copied over unchanged. NumCols' last entry is left out.
for col in CatCols:
    codes = le_map[col]
    X_le[col] = X[col].map(codes)
numeric_features = NumCols[:-1]
X_le[numeric_features] = X[numeric_features]
X_le.shape

(1460, 79)

In [20]:
# Write the label-encoded feature frame to disk
X_le.to_csv('./data/X_le.csv')

In [21]:
# One-hot encode the columns of X_le selected by cat_mask; sparse=False asks for
# a dense result. cat_mask was computed from X, so this relies on X_le having
# the same column order as X (categorical columns first, then numerical ones).
# NOTE(review): `categorical_features` / `sparse` look like legacy OneHotEncoder
# arguments — the captured output already shows a library message about using
# OneHotEncoder directly. Confirm against the installed scikit-learn version
# before upgrading; later cells depend on the resulting column order.
ohe = OneHotEncoder(categorical_features = cat_mask,sparse=False)
X_ohe = ohe.fit_transform(X_le)
X_ohe.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(1460, 318)

In [22]:
# Names of the one-hot output columns as reported by the fitted encoder.
# NOTE(review): presumably these are of the form 'x<i>_<level>', matching the
# 'x<i>' keys written to column_map.json — confirm against the encoder output.
ohe_column_names = ohe.get_feature_names()

In [23]:
# Full list of output names: one-hot names first, then every entry of NumCols.
column_names = np.append(ohe_column_names, NumCols)
len(column_names)

319

In [24]:
# Wrap the encoded feature matrix and the target in DataFrames and save them.
# The number of feature names is taken from the matrix width instead of a
# hard-coded 318, so the cell keeps working if the encoded width changes.
# (column_names has one extra trailing entry, the last element of NumCols,
# which is cut off by this slice.)
# NOTE(review): X_input is built without an explicit index while y_input keeps
# the index of Y (the 'Id' index of df), so the two CSVs are presumably not
# index-aligned — confirm whether downstream readers rely on the index column.
n_features = X_ohe.shape[1]
X_input = pd.DataFrame(X_ohe, columns=column_names[:n_features])
y_input = pd.DataFrame(Y, columns=['SalePrice'])
X_input.to_csv('./data/X_input.csv')
y_input.to_csv('./data/y_input.csv')

In [25]:
# Scale features and target to [0, 1] with *separate* scalers. Re-fitting one
# scaler on y would discard the parameters learned for X, making it impossible
# to apply the same feature scaling to new rows or to invert it later.
x_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
X_scale = x_scaler.fit_transform(X_input)
y_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
y_scale = y_scaler.fit_transform(y_input)
# Keep the original name bound to the target scaler: a later cell reads
# `scaler.scale_` to obtain the SalePrice scale factor.
scaler = y_scaler

In [26]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scale,
    y_scale,
    random_state=7,
    test_size=0.3,
)

In [27]:
# Save all processed data into folder.
# Feature names are sliced to the actual matrix width rather than a hard-coded
# 318, so the cell keeps working if the encoded width changes.
feature_names = column_names[:X_train.shape[1]]
pd.DataFrame(X_train, columns=feature_names).to_csv('./data/X_train.csv')
pd.DataFrame(X_test, columns=feature_names).to_csv('./data/X_test.csv')
pd.DataFrame(y_train, columns=['SalePrice']).to_csv('./data/y_train.csv')
pd.DataFrame(y_test, columns=['SalePrice']).to_csv('./data/y_test.csv')

In [28]:
# `scaler` was most recently fitted on y_input, so `scale_` here is the scaling
# parameter for SalePrice (not for the features).
y_scale_factor = scaler.scale_

In [30]:
# Display the SalePrice scale factor (a one-element array)
y_scale_factor

array([1.38869601e-06])