In [1]:
from pathlib import Path

# Name of the dataset file handled by this notebook.
file_name = 'train.csv'

# Data folders, given relative to the directory the notebook is run from.
processed_data_dir = Path('../data/processed/')
interim_data_dir = Path('../data/interim/')
raw_data_dir = Path('../data/raw/')

# Input for this notebook: the interim copy of the dataset.
file_path = interim_data_dir.joinpath(file_name)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Print the column documentation that ships alongside the raw data.
print((raw_data_dir / 'data_description.txt').read_text())

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

In [4]:
# Load the interim dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Quick look at the first five rows.
df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,,0,12,2008,WD,Normal,250000


In [6]:
# Inspect the MSSubClass column as loaded (values and dtype).
df.loc[:, 'MSSubClass']

0       60
1       20
2       60
3       70
4       60
        ..
1437    60
1438    20
1439    70
1440    20
1441    20
Name: MSSubClass, Length: 1442, dtype: int64

In [7]:
# MSSubClass holds dwelling-type codes (20, 30, ..., 190) rather than
# quantities, so it is cast to string; that way it is picked up together with
# the other object-dtype columns when they are one-hot encoded.
# astype(str) performs the cast in a single vectorized call instead of
# invoking str() on every element through apply.
df['MSSubClass'] = df['MSSubClass'].astype(str)
df['MSSubClass']

0       60
1       20
2       60
3       70
4       60
        ..
1437    60
1438    20
1439    70
1440    20
1441    20
Name: MSSubClass, Length: 1442, dtype: object

In [8]:
# Preview the one-hot encoding of a single column; drop_first=True leaves out
# the first category so it acts as the implicit baseline.
pd.get_dummies(df.loc[:, 'MSSubClass'], drop_first=True)

Unnamed: 0,160,180,190,20,30,40,45,50,60,70,75,80,85,90
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1438,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1439,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1440,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [9]:
# Split the frame by dtype: everything that is not object-typed is kept as-is,
# while the object-typed (string) columns are the ones to be one-hot encoded.
numeric_df = df.select_dtypes(exclude=['object'])
object_df = df.select_dtypes(include=['object'])

# One-hot encode every object column at once. drop_first=True omits the first
# category of each column, so that category is represented by all zeros.
# NOTE(review): with the default dummy_na=False a missing value is also encoded
# as all zeros, i.e. indistinguishable from the dropped category - confirm the
# interim data has no NaNs left in these columns (Fence looks empty in
# df.head()).
df_objects_dummies = pd.get_dummies(data=object_df, drop_first=True)
df_objects_dummies

Unnamed: 0,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1438,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1439,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
1440,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [10]:
# Place the untouched non-object columns and the one-hot columns side by side.
# Both frames were derived from `df`, so their rows line up on the shared index.
df_encoded = pd.concat([numeric_df, df_objects_dummies], axis='columns')

# Summary of the combined frame (column count, dtypes, memory), then the frame.
df_encoded.info()
df_encoded

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1442 entries, 0 to 1441
Columns: 255 entries, LotFrontage to SaleCondition_Partial
dtypes: float64(3), int64(33), uint8(219)
memory usage: 714.1 KB


Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,0,0,0,0,1,0,0,0,1,0
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,0,0,0,0,1,0,0,0,1,0
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,0,0,0,0,1,0,0,0,1,0
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,0,0,0,0,1,0,0,0,0,0
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,62.0,7917,6,5,1999,2000,0.0,0,0,953,...,0,0,0,0,1,0,0,0,1,0
1438,85.0,13175,6,6,1978,1988,119.0,790,163,589,...,0,0,0,0,1,0,0,0,1,0
1439,66.0,9042,7,9,1941,2006,0.0,275,0,877,...,0,0,0,0,1,0,0,0,1,0
1440,68.0,9717,5,6,1950,1996,0.0,49,1029,0,...,0,0,0,0,1,0,0,0,1,0


In [11]:
# Create the output folder first: to_csv does not create missing directories
# and would otherwise fail when the processed-data folder is absent.
processed_data_dir.mkdir(parents=True, exist_ok=True)

# index=False: the row index is just the default 0..n-1 range, so it is not
# written to the file.
df_encoded.to_csv(processed_data_dir / file_name, index=False)