We will be performing all the below steps in Feature Engineering

1. Missing values
2. Temporal variables
3. Categorical variables: remove rare labels
4. Standarise the values of the variables to the same range

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [2]:
train = pd.read_csv("../house_price.csv")

In [3]:
train.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500


In [4]:
# There is a chance of data leakage, so it is mandatory to split data before performing feature engineering

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, train['SalePrice'], test_size=0.1, random_state=0)

In [5]:
X_train.shape, X_test.shape

((1314, 81), (146, 81))

In [6]:
train = X_test

## Missing Values
### Categorical Features with missing values

In [7]:
features_nan = [feature for feature in train.columns if train[feature].isnull().sum()>0 and train[feature].dtype=='O']
print("Number of missing Categorical features are {}".format(len(features_nan)))

Number of missing Categorical features are 15


In [8]:
for feature in features_nan:
    print("Feature {} contains {} % nan values".format(feature, round(train[feature].isnull().mean(), 4)))

Feature Alley contains 0.9315 % nan values
Feature MasVnrType contains 0.0137 % nan values
Feature BsmtQual contains 0.0342 % nan values
Feature BsmtCond contains 0.0342 % nan values
Feature BsmtExposure contains 0.0342 % nan values
Feature BsmtFinType1 contains 0.0342 % nan values
Feature BsmtFinType2 contains 0.0342 % nan values
Feature FireplaceQu contains 0.4726 % nan values
Feature GarageType contains 0.0479 % nan values
Feature GarageFinish contains 0.0479 % nan values
Feature GarageQual contains 0.0479 % nan values
Feature GarageCond contains 0.0479 % nan values
Feature PoolQC contains 0.9932 % nan values
Feature Fence contains 0.7466 % nan values
Feature MiscFeature contains 0.9795 % nan values


In [9]:
#REplace all the nan values in a categorical feature with a label

def replace_cat_features(train, features_nan):
    data = train.copy()
    data[features_nan] = data[features_nan].fillna("Missing")
    return data

train = replace_cat_features(train, features_nan)
train[features_nan].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [10]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
529,530,20,RL,,32668,Pave,Missing,IR1,Lvl,AllPub,CulDSac,Gtl,Crawfor,Norm,Norm,1Fam,1Story,6,3,1957,1975,Hip,CompShg,Wd Sdng,Stone,Missing,,Gd,TA,PConc,TA,TA,No,Rec,1219,Unf,0,816,2035,GasA,TA,Y,SBrkr,2515,0,0,2515,1,0,3,0,4,2,TA,9,Maj1,2,TA,Attchd,1975.0,RFn,2,484,TA,TA,Y,0,0,200,0,0,0,Missing,Missing,Missing,0,3,2007,WD,Alloca,200624
491,492,50,RL,79.0,9490,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Artery,Norm,1Fam,1.5Fin,6,7,1941,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,403,Rec,165,238,806,GasA,TA,Y,FuseA,958,620,0,1578,1,0,1,0,3,1,Fa,5,Typ,2,TA,Attchd,1941.0,Unf,1,240,TA,TA,Y,0,0,32,0,0,0,Missing,MnPrv,Missing,0,8,2006,WD,Normal,133000
459,460,50,RL,,7015,Pave,Missing,IR1,Bnk,AllPub,Corner,Gtl,BrkSide,Norm,Norm,1Fam,1.5Fin,5,4,1950,1950,Gable,CompShg,MetalSd,MetalSd,BrkCmn,161.0,TA,TA,CBlock,TA,TA,No,LwQ,185,Unf,0,524,709,GasA,TA,Y,SBrkr,979,224,0,1203,1,0,1,0,3,1,Gd,5,Typ,1,TA,Detchd,1950.0,Unf,1,352,TA,TA,Y,0,0,248,0,0,0,Missing,Missing,Missing,0,7,2009,WD,Normal,110000
279,280,60,RL,83.0,10005,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,ClearCr,Norm,Norm,1Fam,2Story,7,5,1977,1977,Hip,CompShg,Plywood,Plywood,BrkFace,299.0,TA,TA,CBlock,Gd,TA,No,BLQ,392,Unf,0,768,1160,GasA,Ex,Y,SBrkr,1156,866,0,2022,0,0,2,1,4,1,TA,8,Typ,1,TA,Attchd,1977.0,Fin,2,505,TA,TA,Y,288,117,0,0,0,0,Missing,Missing,Missing,0,3,2008,WD,Normal,192000
655,656,160,RM,21.0,1680,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,Twnhs,2Story,6,5,1971,1971,Gable,CompShg,HdBoard,ImStucc,BrkFace,381.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,525,525,GasA,TA,Y,SBrkr,525,567,0,1092,0,0,1,1,3,1,TA,6,Typ,0,Missing,Detchd,1971.0,Unf,1,264,TA,TA,Y,0,0,0,0,0,0,Missing,Missing,Missing,0,3,2010,WD,Family,88000


### Numerical Features with missing values

In [11]:
numerical_nan = [feature for feature in train.columns if train[feature].isnull().sum()>0 and train[feature].dtype!='O']
print("count of numerical nan feature is {}".format(len(numerical_nan)))

count of numerical nan feature is 3


In [12]:
# find the % of missing value for this numerical feature
for feature in numerical_nan:
    print("{} contains {}% nan values".format(feature, round(train[feature].isnull().mean()*100,4)) )

LotFrontage contains 17.8082% nan values
MasVnrArea contains 1.3699% nan values
GarageYrBlt contains 4.7945% nan values


In [13]:
#REplace Numerical missing values

for feature in numerical_nan:
    median = train[feature].median()
    
    train[feature+ "_nan"] = np.where(train[feature].isnull(), 1,0)
    train[feature].fillna(median, inplace=True)

    
train[numerical_nan].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [14]:
np.where(train['GarageYrBlt'].isnull(), 1,0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [15]:
train.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
529,530,20,RL,70.0,32668,Pave,Missing,IR1,Lvl,AllPub,CulDSac,Gtl,Crawfor,Norm,Norm,1Fam,1Story,6,3,1957,1975,Hip,CompShg,Wd Sdng,Stone,Missing,0.0,Gd,TA,PConc,TA,TA,No,Rec,1219,Unf,0,816,2035,GasA,TA,Y,SBrkr,2515,0,0,2515,1,0,3,0,4,2,TA,9,Maj1,2,TA,Attchd,1975.0,RFn,2,484,TA,TA,Y,0,0,200,0,0,0,Missing,Missing,Missing,0,3,2007,WD,Alloca,200624,1,1,0
491,492,50,RL,79.0,9490,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Artery,Norm,1Fam,1.5Fin,6,7,1941,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,403,Rec,165,238,806,GasA,TA,Y,FuseA,958,620,0,1578,1,0,1,0,3,1,Fa,5,Typ,2,TA,Attchd,1941.0,Unf,1,240,TA,TA,Y,0,0,32,0,0,0,Missing,MnPrv,Missing,0,8,2006,WD,Normal,133000,0,0,0
459,460,50,RL,70.0,7015,Pave,Missing,IR1,Bnk,AllPub,Corner,Gtl,BrkSide,Norm,Norm,1Fam,1.5Fin,5,4,1950,1950,Gable,CompShg,MetalSd,MetalSd,BrkCmn,161.0,TA,TA,CBlock,TA,TA,No,LwQ,185,Unf,0,524,709,GasA,TA,Y,SBrkr,979,224,0,1203,1,0,1,0,3,1,Gd,5,Typ,1,TA,Detchd,1950.0,Unf,1,352,TA,TA,Y,0,0,248,0,0,0,Missing,Missing,Missing,0,7,2009,WD,Normal,110000,1,0,0
279,280,60,RL,83.0,10005,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,ClearCr,Norm,Norm,1Fam,2Story,7,5,1977,1977,Hip,CompShg,Plywood,Plywood,BrkFace,299.0,TA,TA,CBlock,Gd,TA,No,BLQ,392,Unf,0,768,1160,GasA,Ex,Y,SBrkr,1156,866,0,2022,0,0,2,1,4,1,TA,8,Typ,1,TA,Attchd,1977.0,Fin,2,505,TA,TA,Y,288,117,0,0,0,0,Missing,Missing,Missing,0,3,2008,WD,Normal,192000,0,0,0
655,656,160,RM,21.0,1680,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,Twnhs,2Story,6,5,1971,1971,Gable,CompShg,HdBoard,ImStucc,BrkFace,381.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,525,525,GasA,TA,Y,SBrkr,525,567,0,1092,0,0,1,1,3,1,TA,6,Typ,0,Missing,Detchd,1971.0,Unf,1,264,TA,TA,Y,0,0,0,0,0,0,Missing,Missing,Missing,0,3,2010,WD,Family,88000,0,0,0
1013,1014,30,RM,60.0,7200,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,4,1910,2006,Hip,CompShg,MetalSd,Stucco,,0.0,TA,TA,BrkTil,TA,TA,No,ALQ,247,Rec,465,310,1022,GasW,TA,N,SBrkr,1022,0,0,1022,1,0,1,0,2,1,TA,4,Maj2,0,Missing,Detchd,1956.0,Unf,1,280,TA,TA,Y,0,30,226,0,0,0,Missing,Missing,Missing,0,6,2009,WD,Normal,85000,0,0,0
1403,1404,20,RL,49.0,15256,Pave,Missing,IR1,Lvl,AllPub,CulDSac,Gtl,Somerst,RRAn,Norm,1Fam,1Story,8,5,2007,2007,Gable,CompShg,VinylSd,VinylSd,Stone,84.0,Gd,TA,PConc,Gd,TA,Gd,GLQ,929,Unf,0,556,1485,GasA,Ex,Y,SBrkr,1464,0,0,1464,1,0,2,0,3,1,Gd,6,Typ,0,Missing,Attchd,2007.0,Unf,3,754,TA,TA,Y,168,160,0,0,0,0,Missing,Missing,Missing,0,8,2007,WD,Normal,282922,0,0,0
601,602,50,RM,50.0,9000,Pave,Missing,Reg,Bnk,AllPub,Inside,Gtl,IDOTRR,Norm,Norm,1Fam,1.5Fin,6,6,1937,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,Gd,PConc,TA,TA,No,Unf,0,Unf,0,780,780,GasA,TA,Y,SBrkr,780,595,0,1375,0,0,1,1,3,1,Gd,6,Typ,1,Gd,Detchd,1979.0,Unf,1,544,TA,TA,P,0,162,0,0,126,0,Missing,Missing,Missing,0,12,2007,WD,Normal,141000,0,0,0
1182,1183,60,RL,160.0,15623,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NoRidge,Norm,Norm,1Fam,2Story,10,5,1996,1996,Hip,CompShg,Wd Sdng,ImStucc,,0.0,Gd,TA,PConc,Ex,TA,Av,GLQ,2096,Unf,0,300,2396,GasA,Ex,Y,SBrkr,2411,2065,0,4476,1,0,3,1,4,1,Ex,10,Typ,2,TA,Attchd,1996.0,Fin,3,813,TA,TA,Y,171,78,0,0,0,555,Ex,MnPrv,Missing,0,7,2007,WD,Abnorml,745000,0,0,0
687,688,160,FV,70.0,5105,Pave,Missing,IR2,Lvl,AllPub,FR2,Gtl,Somerst,Norm,Norm,TwnhsE,2Story,7,5,2004,2004,Gable,CompShg,MetalSd,MetalSd,,0.0,Gd,TA,PConc,Gd,TA,No,GLQ,239,Unf,0,312,551,GasA,Ex,Y,SBrkr,551,551,0,1102,0,0,2,1,2,1,Gd,4,Typ,0,Missing,Detchd,2004.0,Unf,2,480,TA,TA,Y,0,60,0,0,0,0,Missing,Missing,Missing,0,3,2007,WD,Normal,148800,1,0,0


### Temporal VAriables

In [16]:
for feature in ['YearBuilt','YearRemodAdd','GarageYrBlt']:
    train[feature] = train['YrSold'] - train[feature]

In [17]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
529,530,20,RL,70.0,32668,Pave,Missing,IR1,Lvl,AllPub,CulDSac,Gtl,Crawfor,Norm,Norm,1Fam,1Story,6,3,50,32,Hip,CompShg,Wd Sdng,Stone,Missing,0.0,Gd,TA,PConc,TA,TA,No,Rec,1219,Unf,0,816,2035,GasA,TA,Y,SBrkr,2515,0,0,2515,1,0,3,0,4,2,TA,9,Maj1,2,TA,Attchd,32.0,RFn,2,484,TA,TA,Y,0,0,200,0,0,0,Missing,Missing,Missing,0,3,2007,WD,Alloca,200624,1,1,0
491,492,50,RL,79.0,9490,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Artery,Norm,1Fam,1.5Fin,6,7,65,56,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,403,Rec,165,238,806,GasA,TA,Y,FuseA,958,620,0,1578,1,0,1,0,3,1,Fa,5,Typ,2,TA,Attchd,65.0,Unf,1,240,TA,TA,Y,0,0,32,0,0,0,Missing,MnPrv,Missing,0,8,2006,WD,Normal,133000,0,0,0
459,460,50,RL,70.0,7015,Pave,Missing,IR1,Bnk,AllPub,Corner,Gtl,BrkSide,Norm,Norm,1Fam,1.5Fin,5,4,59,59,Gable,CompShg,MetalSd,MetalSd,BrkCmn,161.0,TA,TA,CBlock,TA,TA,No,LwQ,185,Unf,0,524,709,GasA,TA,Y,SBrkr,979,224,0,1203,1,0,1,0,3,1,Gd,5,Typ,1,TA,Detchd,59.0,Unf,1,352,TA,TA,Y,0,0,248,0,0,0,Missing,Missing,Missing,0,7,2009,WD,Normal,110000,1,0,0
279,280,60,RL,83.0,10005,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,ClearCr,Norm,Norm,1Fam,2Story,7,5,31,31,Hip,CompShg,Plywood,Plywood,BrkFace,299.0,TA,TA,CBlock,Gd,TA,No,BLQ,392,Unf,0,768,1160,GasA,Ex,Y,SBrkr,1156,866,0,2022,0,0,2,1,4,1,TA,8,Typ,1,TA,Attchd,31.0,Fin,2,505,TA,TA,Y,288,117,0,0,0,0,Missing,Missing,Missing,0,3,2008,WD,Normal,192000,0,0,0
655,656,160,RM,21.0,1680,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,Twnhs,2Story,6,5,39,39,Gable,CompShg,HdBoard,ImStucc,BrkFace,381.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,525,525,GasA,TA,Y,SBrkr,525,567,0,1092,0,0,1,1,3,1,TA,6,Typ,0,Missing,Detchd,39.0,Unf,1,264,TA,TA,Y,0,0,0,0,0,0,Missing,Missing,Missing,0,3,2010,WD,Family,88000,0,0,0


### Numeical features - we normalize using log normal as the values are skewed



In [18]:
numerical_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

for feature in numerical_features:
    train[feature] = np.log(train[feature])

In [19]:
train[numerical_features].head()

Unnamed: 0,LotFrontage,LotArea,1stFlrSF,GrLivArea,SalePrice
529,4.248495,10.394151,7.830028,7.830028,12.209188
491,4.369448,9.157994,6.864848,7.363914,11.798104
459,4.248495,8.855806,6.886532,7.092574,11.608236
279,4.418841,9.21084,7.052721,7.611842,12.165251
655,3.044522,7.426549,6.263398,6.995766,11.385092


### HAndling Rare Categorical Feature

In [20]:
# Determine the rare ones
# If a catergory in a feature is less than 1%, then we replace such category with a label "RARE_NAN" 

categorical_features = [feature for feature in train.columns if train[feature].dtype=='O']



In [21]:
for feature in categorical_features:
    tempvar = train.groupby(feature)['SalePrice'].count()/len(train[feature])
    tempdf= tempvar[tempvar>0.01].index
    train[feature] = np.where(train[feature].isin(tempdf), train[feature], "RARENAN")
    

In [22]:
train.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
529,530,20,RL,4.248495,10.394151,Pave,Missing,IR1,Lvl,AllPub,CulDSac,Gtl,Crawfor,Norm,Norm,1Fam,1Story,6,3,50,32,Hip,CompShg,Wd Sdng,RARENAN,Missing,0.0,Gd,TA,PConc,TA,TA,No,Rec,1219,Unf,0,816,2035,GasA,TA,Y,SBrkr,7.830028,0,0,7.830028,1,0,3,0,4,2,TA,9,Maj1,2,TA,Attchd,32.0,RFn,2,484,TA,TA,Y,0,0,200,0,0,0,Missing,Missing,Missing,0,3,2007,WD,Alloca,12.209188,1,1,0
491,492,50,RL,4.369448,9.157994,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Artery,Norm,1Fam,1.5Fin,6,7,65,56,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,403,Rec,165,238,806,GasA,TA,Y,FuseA,6.864848,620,0,7.363914,1,0,1,0,3,1,Fa,5,Typ,2,TA,Attchd,65.0,Unf,1,240,TA,TA,Y,0,0,32,0,0,0,Missing,MnPrv,Missing,0,8,2006,WD,Normal,11.798104,0,0,0
459,460,50,RL,4.248495,8.855806,Pave,Missing,IR1,Bnk,AllPub,Corner,Gtl,BrkSide,Norm,Norm,1Fam,1.5Fin,5,4,59,59,Gable,CompShg,MetalSd,MetalSd,BrkCmn,161.0,TA,TA,CBlock,TA,TA,No,LwQ,185,Unf,0,524,709,GasA,TA,Y,SBrkr,6.886532,224,0,7.092574,1,0,1,0,3,1,Gd,5,Typ,1,TA,Detchd,59.0,Unf,1,352,TA,TA,Y,0,0,248,0,0,0,Missing,Missing,Missing,0,7,2009,WD,Normal,11.608236,1,0,0
279,280,60,RL,4.418841,9.21084,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,ClearCr,Norm,Norm,1Fam,2Story,7,5,31,31,Hip,CompShg,Plywood,Plywood,BrkFace,299.0,TA,TA,CBlock,Gd,TA,No,BLQ,392,Unf,0,768,1160,GasA,Ex,Y,SBrkr,7.052721,866,0,7.611842,0,0,2,1,4,1,TA,8,Typ,1,TA,Attchd,31.0,Fin,2,505,TA,TA,Y,288,117,0,0,0,0,Missing,Missing,Missing,0,3,2008,WD,Normal,12.165251,0,0,0
655,656,160,RM,3.044522,7.426549,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,RARENAN,Norm,Norm,Twnhs,2Story,6,5,39,39,Gable,CompShg,HdBoard,ImStucc,BrkFace,381.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,525,525,GasA,TA,Y,SBrkr,6.263398,567,0,6.995766,0,0,1,1,3,1,TA,6,Typ,0,Missing,Detchd,39.0,Unf,1,264,TA,TA,Y,0,0,0,0,0,0,Missing,Missing,Missing,0,3,2010,WD,Family,11.385092,0,0,0
1013,1014,30,RM,4.094345,8.881836,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,4,99,3,Hip,CompShg,MetalSd,Stucco,,0.0,TA,TA,BrkTil,TA,TA,No,ALQ,247,Rec,465,310,1022,GasW,TA,N,SBrkr,6.929517,0,0,6.929517,1,0,1,0,2,1,TA,4,RARENAN,0,Missing,Detchd,53.0,Unf,1,280,TA,TA,Y,0,30,226,0,0,0,Missing,Missing,Missing,0,6,2009,WD,Normal,11.350407,0,0,0
1403,1404,20,RL,3.89182,9.632728,Pave,Missing,IR1,Lvl,AllPub,CulDSac,Gtl,Somerst,RRAn,Norm,1Fam,1Story,8,5,0,0,Gable,CompShg,VinylSd,VinylSd,Stone,84.0,Gd,TA,PConc,Gd,TA,Gd,GLQ,929,Unf,0,556,1485,GasA,Ex,Y,SBrkr,7.288928,0,0,7.288928,1,0,2,0,3,1,Gd,6,Typ,0,Missing,Attchd,0.0,Unf,3,754,TA,TA,Y,168,160,0,0,0,0,Missing,Missing,Missing,0,8,2007,WD,Normal,12.552927,0,0,0
601,602,50,RM,3.912023,9.10498,Pave,Missing,Reg,Bnk,AllPub,Inside,Gtl,IDOTRR,Norm,Norm,1Fam,1.5Fin,6,6,70,57,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,Gd,PConc,TA,TA,No,Unf,0,Unf,0,780,780,GasA,TA,Y,SBrkr,6.659294,595,0,7.226209,0,0,1,1,3,1,Gd,6,Typ,1,Gd,Detchd,28.0,Unf,1,544,TA,TA,P,0,162,0,0,126,0,Missing,Missing,Missing,0,12,2007,WD,Normal,11.856515,0,0,0
1182,1183,60,RL,5.075174,9.656499,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NoRidge,Norm,Norm,1Fam,2Story,10,5,11,11,Hip,CompShg,Wd Sdng,ImStucc,,0.0,Gd,TA,PConc,Ex,TA,Av,GLQ,2096,Unf,0,300,2396,GasA,Ex,Y,SBrkr,7.787797,2065,0,8.406485,1,0,3,1,4,1,Ex,10,Typ,2,TA,Attchd,11.0,Fin,3,813,TA,TA,Y,171,78,0,0,0,555,RARENAN,MnPrv,Missing,0,7,2007,WD,Abnorml,13.521139,0,0,0
687,688,160,FV,4.248495,8.537976,Pave,Missing,IR2,Lvl,AllPub,FR2,Gtl,Somerst,Norm,Norm,TwnhsE,2Story,7,5,3,3,Gable,CompShg,MetalSd,MetalSd,,0.0,Gd,TA,PConc,Gd,TA,No,GLQ,239,Unf,0,312,551,GasA,Ex,Y,SBrkr,6.311735,551,0,7.004882,0,0,2,1,2,1,Gd,4,Typ,0,Missing,Detchd,3.0,Unf,2,480,TA,TA,Y,0,60,0,0,0,0,Missing,Missing,Missing,0,3,2007,WD,Normal,11.910358,1,0,0


In [23]:
labels = ['a', 'b', 'c']
list(enumerate(labels, 0))

[(0, 'a'), (1, 'b'), (2, 'c')]

### HAndling Categorical feature

1. creating the ordinal labels and map to feature



In [24]:
train['GarageType'].unique()

array(['Attchd', 'Detchd', 'Missing', 'CarPort', 'BuiltIn', 'Basment'],
      dtype=object)

In [25]:
ordered_label = train.groupby('GarageType')['SalePrice'].mean().sort_values().index

In [26]:
ordered_label

Index(['CarPort', 'Missing', 'Detchd', 'Basment', 'Attchd', 'BuiltIn'], dtype='object', name='GarageType')

In [27]:
ordered_dict = {k:i for i,k in enumerate(ordered_label,0) }

In [28]:
ordered_dict

{'CarPort': 0,
 'Missing': 1,
 'Detchd': 2,
 'Basment': 3,
 'Attchd': 4,
 'BuiltIn': 5}

In [29]:
for feature in categorical_features:
    labels_ordered = train.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered = {k:i for i,k in enumerate(labels_ordered, 0)}
    train[feature] = train[feature].map(labels_ordered)
    

In [30]:
train[categorical_features].head(10)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
529,1,1,1,2,2,0,4,1,7,4,1,3,3,2,1,5,5,3,2,2,4,2,2,1,3,5,1,1,1,2,1,3,3,4,2,3,3,2,0,4,1,2,2
491,1,1,1,0,2,0,0,1,5,1,1,3,2,1,1,5,2,1,1,2,2,2,2,1,2,2,1,1,1,1,0,2,3,4,1,3,3,2,0,2,1,2,3
459,1,1,1,2,0,0,1,1,0,4,1,3,2,1,1,2,3,0,1,2,2,2,2,1,1,5,1,1,1,2,2,2,3,2,1,3,3,2,0,4,1,2,3
279,1,1,1,0,2,0,0,1,12,4,1,3,5,2,1,3,6,2,1,2,2,3,2,1,2,5,1,3,1,2,1,2,3,4,3,3,3,2,0,4,1,2,3
655,0,1,1,0,2,0,0,1,3,4,1,0,5,1,1,6,10,2,1,2,2,2,2,1,5,5,1,1,1,2,1,2,1,2,1,3,3,2,0,4,1,2,0
1013,0,1,1,0,2,0,0,1,2,4,1,3,3,2,1,2,0,1,1,2,0,2,2,1,4,2,0,1,0,2,1,0,1,2,1,3,3,2,0,4,1,2,3
1403,1,1,1,2,2,0,4,1,13,3,1,3,3,1,1,7,8,4,2,2,4,3,2,2,6,5,1,3,1,2,2,2,1,4,1,3,3,2,0,4,1,2,3
601,0,1,1,0,0,0,0,1,1,4,1,3,2,1,1,5,2,1,1,1,4,2,2,1,5,5,1,1,1,2,2,2,4,2,1,3,3,0,0,4,1,2,3
1182,1,1,1,2,2,0,1,1,17,4,1,3,5,2,1,5,10,1,2,2,4,4,2,4,6,5,1,3,1,2,3,2,3,4,3,3,3,2,1,2,1,2,1
687,2,1,1,1,2,0,3,1,13,4,1,4,5,1,1,2,3,1,2,2,4,3,2,1,6,5,1,3,1,2,2,2,1,2,1,3,3,2,0,4,1,2,3


In [31]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
529,530,20,1,4.248495,10.394151,1,1,2,2,0,4,1,7,4,1,3,3,6,3,50,32,2,1,5,5,3,0.0,2,2,4,2,2,1,3,1219,5,0,816,2035,1,1,1,2,7.830028,0,0,7.830028,1,0,3,0,4,2,1,9,3,2,3,4,32.0,2,2,484,3,3,2,0,0,200,0,0,0,0,4,1,0,3,2007,2,2,12.209188,1,1,0
491,492,50,1,4.369448,9.157994,1,1,0,2,0,0,1,5,1,1,3,2,6,7,65,56,1,1,5,2,1,0.0,1,2,2,2,2,1,2,403,2,165,238,806,1,1,1,1,6.864848,620,0,7.363914,1,0,1,0,3,1,0,5,2,2,3,4,65.0,1,1,240,3,3,2,0,0,32,0,0,0,0,2,1,0,8,2006,2,3,11.798104,0,0,0
459,460,50,1,4.248495,8.855806,1,1,2,0,0,1,1,0,4,1,3,2,5,4,59,59,1,1,2,3,0,161.0,1,2,2,2,2,1,1,185,5,0,524,709,1,1,1,2,6.886532,224,0,7.092574,1,0,1,0,3,1,2,5,2,1,3,2,59.0,1,1,352,3,3,2,0,0,248,0,0,0,0,4,1,0,7,2009,2,3,11.608236,1,0,0
279,280,60,1,4.418841,9.21084,1,1,0,2,0,0,1,12,4,1,3,5,7,5,31,31,2,1,3,6,2,299.0,1,2,2,3,2,1,2,392,5,0,768,1160,1,3,1,2,7.052721,866,0,7.611842,0,0,2,1,4,1,1,8,2,1,3,4,31.0,3,2,505,3,3,2,288,117,0,0,0,0,0,4,1,0,3,2008,2,3,12.165251,0,0,0
655,656,160,0,3.044522,7.426549,1,1,0,2,0,0,1,3,4,1,0,5,6,5,39,39,1,1,6,10,2,381.0,1,2,2,2,2,1,5,0,5,0,525,525,1,1,1,2,6.263398,567,0,6.995766,0,0,1,1,3,1,1,6,2,0,1,2,39.0,1,1,264,3,3,2,0,0,0,0,0,0,0,4,1,0,3,2010,2,0,11.385092,0,0,0


In [32]:
scaling_feature = [feature for feature in train.columns if feature not in ['Id','SalePerice']]
len(scaling_feature)

83

### Feature Scaling

In [33]:
from sklearn.preprocessing import MinMaxScaler

In [34]:
scaler =  MinMaxScaler()
scaler.fit(train[scaling_feature])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [35]:
scaler.transform(train[scaling_feature])

array([[0.        , 0.5       , 0.56938004, ..., 1.        , 1.        ,
        0.        ],
       [0.17647059, 0.5       , 0.62658067, ..., 0.        , 0.        ,
        0.        ],
       [0.17647059, 0.5       , 0.56938004, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.5       , 0.50429646, ..., 0.        , 0.        ,
        0.        ],
       [0.41176471, 0.5       , 0.49647946, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.5       , 0.56938004, ..., 1.        , 0.        ,
        0.        ]])

In [36]:
#Transform train and test set and add on ID and SalePrice column

data = pd.concat([train[['Id', 'SalePrice']].reset_index(drop=True),
                pd.DataFrame(scaler.transform(train[scaling_feature]), columns= scaling_feature)],
                axis=1 )

In [37]:
data.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice.1,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan
0,530,12.209188,0.0,0.5,0.56938,1.0,1.0,0.5,1.0,0.666667,0.0,1.0,1.0,0.388889,1.0,1.0,0.75,0.6,0.428571,0.0,0.393701,0.542373,1.0,1.0,0.625,0.5,0.75,0.0,0.666667,1.0,1.0,0.5,0.666667,0.25,0.5,0.581584,1.0,0.0,0.349315,0.849332,1.0,0.333333,1.0,1.0,1.0,0.0,0.0,0.706004,0.5,0.0,1.0,0.0,0.666667,1.0,0.333333,0.666667,1.0,1.0,0.6,0.8,0.329897,0.666667,0.5,0.472195,1.0,1.0,1.0,0.0,0.0,0.806452,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.181818,0.25,0.5,0.5,0.413686,1.0,1.0,0.0
1,492,11.798104,0.176471,0.5,0.626581,0.583449,1.0,0.5,0.0,0.666667,0.0,0.0,1.0,0.277778,0.25,1.0,0.75,0.4,0.428571,0.666667,0.511811,0.949153,0.5,1.0,0.625,0.2,0.25,0.0,0.333333,1.0,0.5,0.5,0.666667,0.25,0.333333,0.192271,0.4,0.146406,0.101884,0.336394,1.0,0.333333,1.0,0.5,0.406215,0.300242,0.0,0.468283,0.5,0.0,0.333333,0.0,0.5,0.0,0.0,0.222222,0.666667,1.0,0.6,0.8,0.670103,0.333333,0.25,0.234146,1.0,1.0,1.0,0.0,0.0,0.129032,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.636364,0.0,0.5,0.75,0.229972,0.0,0.0,0.0
2,460,11.608236,0.176471,0.5,0.56938,0.48162,1.0,0.5,1.0,0.0,0.0,0.25,1.0,0.0,1.0,1.0,0.75,0.4,0.285714,0.166667,0.464567,1.0,0.5,1.0,0.25,0.3,0.0,0.17462,0.333333,1.0,0.5,0.5,0.666667,0.25,0.166667,0.088263,1.0,0.0,0.224315,0.29591,1.0,0.333333,1.0,1.0,0.419555,0.108475,0.0,0.329899,0.5,0.0,0.333333,0.0,0.5,0.0,0.666667,0.222222,0.666667,0.5,0.6,0.4,0.608247,0.333333,0.25,0.343415,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.545455,0.75,0.5,0.75,0.14512,1.0,0.0,0.0
3,280,12.165251,0.235294,0.5,0.649939,0.601257,1.0,0.5,0.0,0.666667,0.0,0.0,1.0,0.666667,1.0,1.0,0.75,1.0,0.571429,0.333333,0.244094,0.525424,1.0,1.0,0.375,0.6,0.5,0.324295,0.333333,1.0,0.5,0.75,0.666667,0.25,0.333333,0.187023,1.0,0.0,0.328767,0.48414,1.0,1.0,1.0,1.0,0.521796,0.41937,0.0,0.594728,0.0,0.0,0.666667,0.5,0.666667,0.0,0.333333,0.555556,0.666667,0.5,0.6,0.8,0.319588,1.0,0.5,0.492683,1.0,1.0,1.0,0.72,0.485477,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.181818,0.5,0.5,0.75,0.394051,0.0,0.0,0.0
4,656,11.385092,0.823529,0.0,0.0,0.0,1.0,0.5,0.0,0.666667,0.0,0.0,1.0,0.166667,1.0,1.0,0.0,1.0,0.428571,0.333333,0.307087,0.661017,0.5,1.0,0.75,1.0,0.5,0.413232,0.333333,1.0,0.5,0.5,0.666667,0.25,0.833333,0.0,1.0,0.0,0.224743,0.219115,1.0,0.333333,1.0,1.0,0.036199,0.274576,0.0,0.280526,0.0,0.0,0.333333,0.5,0.5,0.0,0.333333,0.333333,0.666667,0.0,0.2,0.4,0.402062,0.333333,0.25,0.257561,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.181818,1.0,0.5,0.0,0.045396,0.0,0.0,0.0


In [38]:
data.shape

(146, 85)

In [39]:
data.to_csv('X_Train.csv', index=False)