In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
traindataset = pd.read_csv('HousePrice_data/train.csv')

In [3]:
traindataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
testdataset = pd.read_csv('HousePrice_data/test.csv')

In [5]:
testdataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [6]:
traindataset.shape

(1460, 81)

In [7]:
testdataset.shape

(1459, 80)

In [8]:
# categorical feature
feature_with_na_train = [feature for feature in traindataset.columns if traindataset[feature].isnull().sum()>1 and traindataset[feature].dtype == 'O']

In [9]:
feature_with_na_test = [feature for feature in testdataset.columns if testdataset[feature].isnull().sum()>1 and testdataset[feature].dtype == 'O']

In [10]:
print('Train Dataset')
for feature in feature_with_na_train:  
    print(feature, np.round(traindataset[feature].isnull().mean(),4), '% missing value')

Train Dataset
Alley 0.9377 % missing value
MasVnrType 0.0055 % missing value
BsmtQual 0.0253 % missing value
BsmtCond 0.0253 % missing value
BsmtExposure 0.026 % missing value
BsmtFinType1 0.0253 % missing value
BsmtFinType2 0.026 % missing value
FireplaceQu 0.4726 % missing value
GarageType 0.0555 % missing value
GarageFinish 0.0555 % missing value
GarageQual 0.0555 % missing value
GarageCond 0.0555 % missing value
PoolQC 0.9952 % missing value
Fence 0.8075 % missing value
MiscFeature 0.963 % missing value


In [11]:
print('Test Dataset')
for feature in feature_with_na_test:  
    print(feature, np.round(testdataset[feature].isnull().mean(),4), '% missing value')

Test Dataset
MSZoning 0.0027 % missing value
Alley 0.9267 % missing value
Utilities 0.0014 % missing value
MasVnrType 0.011 % missing value
BsmtQual 0.0302 % missing value
BsmtCond 0.0308 % missing value
BsmtExposure 0.0302 % missing value
BsmtFinType1 0.0288 % missing value
BsmtFinType2 0.0288 % missing value
Functional 0.0014 % missing value
FireplaceQu 0.5003 % missing value
GarageType 0.0521 % missing value
GarageFinish 0.0535 % missing value
GarageQual 0.0535 % missing value
GarageCond 0.0535 % missing value
PoolQC 0.9979 % missing value
Fence 0.8012 % missing value
MiscFeature 0.965 % missing value


In [12]:
def replace_cat_feature(traindataset, feature_with_na_train):
    data = traindataset.copy()
    data[feature_with_na_train] = data[feature_with_na_train].fillna('Missing')  # Nan is replaced by missing
    return data

traindataset = replace_cat_feature(traindataset, feature_with_na_train)
traindataset[feature_with_na_train].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [13]:
traindataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,0,Missing,Missing,Missing,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,0,Missing,Missing,Missing,0,12,2008,WD,Normal,250000


In [14]:
def replace_cat_feature(testdataset, feature_with_na_test):
    data = testdataset.copy()
    data[feature_with_na_test] = data[feature_with_na_test].fillna('Missing')  # Nan is replaced by missing
    return data

testdataset = replace_cat_feature(testdataset, feature_with_na_test)
testdataset[feature_with_na_test].isnull().sum()

MSZoning        0
Alley           0
Utilities       0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Functional      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [15]:
testdataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,...,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,...,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,...,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,Missing,IR1,Lvl,AllPub,...,0,0,Missing,Missing,Missing,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,Missing,IR1,HLS,AllPub,...,144,0,Missing,Missing,Missing,0,1,2010,WD,Normal


In [16]:
# numerical variables that has missing value
numerical_with_nan = [feature for feature in traindataset.columns if traindataset[feature].isnull().sum()>1 and traindataset[feature].dtype != 'O']

In [17]:
numerical_with_nan_test = [feature for feature in testdataset.columns if testdataset[feature].isnull().sum()>1 and testdataset[feature].dtype != 'O']

In [18]:
for feature in numerical_with_nan:
    print('{} : {} % missing value'.format(feature, np.round(traindataset[feature].isnull().mean(), 4)))

LotFrontage : 0.1774 % missing value
MasVnrArea : 0.0055 % missing value
GarageYrBlt : 0.0555 % missing value


In [19]:
for feature in numerical_with_nan_test:
    print('{} : {} % missing value'.format(feature, np.round(testdataset[feature].isnull().mean(), 4)))

LotFrontage : 0.1556 % missing value
MasVnrArea : 0.0103 % missing value
BsmtFullBath : 0.0014 % missing value
BsmtHalfBath : 0.0014 % missing value
GarageYrBlt : 0.0535 % missing value


In [20]:
traindataset['LotFrontage'].median()

69.0

In [21]:
## replacing NAN in numerical variables.
## there are so many outliers.
## we replave nan with mean or median.

for feature in numerical_with_nan:
    median_value = traindataset[feature].median()
    traindataset[feature + 'nan'] = np.where(traindataset[feature].isnull(), 1, 0)
    traindataset[feature].fillna(median_value, inplace=True)

traindataset[numerical_with_nan].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [22]:

for feature in numerical_with_nan_test:
    median_value = testdataset[feature].median()
    testdataset[feature + 'nan'] = np.where(testdataset[feature].isnull(), 1, 0)
    testdataset[feature].fillna(median_value, inplace=True)

testdataset[numerical_with_nan_test].isnull().sum()

LotFrontage     0
MasVnrArea      0
BsmtFullBath    0
BsmtHalfBath    0
GarageYrBlt     0
dtype: int64

In [23]:
traindataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,208500,0,0,0
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,181500,0,0,0
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,223500,0,0,0
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,140000,0,0,0
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,250000,0,0,0


In [24]:
testdataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,...,0,6,2010,WD,Normal,0,0,0,0,0
1,1462,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,...,12500,6,2010,WD,Normal,0,0,0,0,0
2,1463,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,...,0,3,2010,WD,Normal,0,0,0,0,0
3,1464,60,RL,78.0,9978,Pave,Missing,IR1,Lvl,AllPub,...,0,6,2010,WD,Normal,0,0,0,0,0
4,1465,120,RL,43.0,5005,Pave,Missing,IR1,HLS,AllPub,...,0,1,2010,WD,Normal,0,0,0,0,0


In [25]:
# it is showing the year
traindataset[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']]

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,2003,2003,2003.0
1,1976,1976,1976.0
2,2001,2002,2001.0
3,1915,1970,1998.0
4,2000,2000,2000.0
...,...,...,...
1455,1999,2000,1999.0
1456,1978,1988,1978.0
1457,1941,2006,1941.0
1458,1950,1996,1950.0


In [26]:
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    traindataset[feature] = traindataset['YrSold'] - traindataset[feature]  
    #for each featue identify the no of years
    # if Year Built = 2019 and Yrsold = 2022  then 2022 - 2019 = 3

In [27]:
traindataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,RL,65.0,8450,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,208500,0,0,0
1,2,20,RL,80.0,9600,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,181500,0,0,0
2,3,60,RL,68.0,11250,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,223500,0,0,0
3,4,70,RL,60.0,9550,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,140000,0,0,0
4,5,60,RL,84.0,14260,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,250000,0,0,0


In [28]:
# after changing it is showing no of years
traindataset['YearRemodAdd']

0        5
1       31
2        6
3       36
4        8
        ..
1455     7
1456    22
1457     4
1458    14
1459    43
Name: YearRemodAdd, Length: 1460, dtype: int64

In [29]:
traindataset[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']]

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0
...,...,...,...
1455,8,7,8.0
1456,32,22,32.0
1457,69,4,69.0
1458,60,14,60.0


In [30]:
testdataset[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']]

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,1961,1961,1961.0
1,1958,1958,1958.0
2,1997,1998,1997.0
3,1998,1998,1998.0
4,1992,1992,1992.0
...,...,...,...
1454,1970,1970,1979.0
1455,1970,1970,1970.0
1456,1960,1996,1960.0
1457,1992,1992,1979.0


In [31]:
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    testdataset[feature] = testdataset['YrSold'] - testdataset[feature]  

In [32]:
testdataset[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']]

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,49,49,49.0
1,52,52,52.0
2,13,12,13.0
3,12,12,12.0
4,18,18,18.0
...,...,...,...
1454,36,36,27.0
1455,36,36,36.0
1456,46,10,46.0
1457,14,14,27.0


In [33]:
## Numerical values were skewed. we need to handle these skewed data.

num_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

In [34]:
for feature in num_features:
    traindataset[feature] = np.log(traindataset[feature])


In [35]:
traindataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,RL,4.174387,9.041922,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,12.247694,0,0,0
1,2,20,RL,4.382027,9.169518,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,12.109011,0,0,0
2,3,60,RL,4.219508,9.328123,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,12.317167,0,0,0
3,4,70,RL,4.094345,9.164296,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,11.849398,0,0,0
4,5,60,RL,4.430817,9.565214,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,12.429216,0,0,0


In [36]:
num_features_test = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']
for feature in num_features_test:
    testdataset[feature] = np.log(testdataset[feature])

In [37]:
testdataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,20,RH,4.382027,9.360655,Pave,Missing,Reg,Lvl,AllPub,...,0,6,2010,WD,Normal,0,0,0,0,0
1,1462,20,RL,4.394449,9.565704,Pave,Missing,IR1,Lvl,AllPub,...,12500,6,2010,WD,Normal,0,0,0,0,0
2,1463,60,RL,4.304065,9.534595,Pave,Missing,IR1,Lvl,AllPub,...,0,3,2010,WD,Normal,0,0,0,0,0
3,1464,60,RL,4.356709,9.208138,Pave,Missing,IR1,Lvl,AllPub,...,0,6,2010,WD,Normal,0,0,0,0,0
4,1465,120,RL,3.7612,8.518193,Pave,Missing,IR1,HLS,AllPub,...,0,1,2010,WD,Normal,0,0,0,0,0


#### Rare Categorical Feature

##### we will remove categorical variables are present less than 1% of the observation

In [38]:
categorical_feature = [feature for feature in traindataset.columns if traindataset[feature].dtype == 'O']

In [39]:
categorical_feature

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [40]:
categorical_feature_test = [feature for feature in testdataset.columns if testdataset[feature].dtype == 'O']

In [41]:
categorical_feature_test

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [42]:
for feature in categorical_feature:
    print('The feature is : {} , no of categories are : {}'.format(feature, len(traindataset[feature].unique())))

The feature is : MSZoning , no of categories are : 5
The feature is : Street , no of categories are : 2
The feature is : Alley , no of categories are : 3
The feature is : LotShape , no of categories are : 4
The feature is : LandContour , no of categories are : 4
The feature is : Utilities , no of categories are : 2
The feature is : LotConfig , no of categories are : 5
The feature is : LandSlope , no of categories are : 3
The feature is : Neighborhood , no of categories are : 25
The feature is : Condition1 , no of categories are : 9
The feature is : Condition2 , no of categories are : 8
The feature is : BldgType , no of categories are : 5
The feature is : HouseStyle , no of categories are : 8
The feature is : RoofStyle , no of categories are : 6
The feature is : RoofMatl , no of categories are : 8
The feature is : Exterior1st , no of categories are : 15
The feature is : Exterior2nd , no of categories are : 16
The feature is : MasVnrType , no of categories are : 5
The feature is : ExterQ

In [43]:
for feature in categorical_feature_test:
    print('The feature is : {} , no of categories are : {}'.format(feature, len(testdataset[feature].unique())))

The feature is : MSZoning , no of categories are : 6
The feature is : Street , no of categories are : 2
The feature is : Alley , no of categories are : 3
The feature is : LotShape , no of categories are : 4
The feature is : LandContour , no of categories are : 4
The feature is : Utilities , no of categories are : 2
The feature is : LotConfig , no of categories are : 5
The feature is : LandSlope , no of categories are : 3
The feature is : Neighborhood , no of categories are : 25
The feature is : Condition1 , no of categories are : 9
The feature is : Condition2 , no of categories are : 5
The feature is : BldgType , no of categories are : 5
The feature is : HouseStyle , no of categories are : 7
The feature is : RoofStyle , no of categories are : 6
The feature is : RoofMatl , no of categories are : 4
The feature is : Exterior1st , no of categories are : 14
The feature is : Exterior2nd , no of categories are : 16
The feature is : MasVnrType , no of categories are : 5
The feature is : ExterQ

In [44]:
len(categorical_feature)

43

In [45]:
traindataset['MSZoning'].unique()

array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)

In [46]:
for feature in categorical_feature:
    temp = traindataset.groupby(feature)['SalePrice'].count()/len(traindataset)
    temp_df = temp[temp > 0.01].index
    traindataset[feature] = np.where(traindataset[feature].isin(temp_df), traindataset[feature], 'Rare_var')

In [47]:
for feature in categorical_feature_test:
    temp_test = testdataset.groupby(feature).count()/len(testdataset)
    temp_df_test = temp[temp > 0.01].index
    testdataset[feature] = np.where(testdataset[feature].isin(temp_df), testdataset[feature], 'Rare_var')

In [48]:
testdataset.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,20,Rare_var,4.382027,9.360655,Rare_var,Rare_var,Rare_var,Rare_var,Rare_var,...,0,6,2010,Rare_var,Normal,0,0,0,0,0
1,1462,20,Rare_var,4.394449,9.565704,Rare_var,Rare_var,Rare_var,Rare_var,Rare_var,...,12500,6,2010,Rare_var,Normal,0,0,0,0,0
2,1463,60,Rare_var,4.304065,9.534595,Rare_var,Rare_var,Rare_var,Rare_var,Rare_var,...,0,3,2010,Rare_var,Normal,0,0,0,0,0
3,1464,60,Rare_var,4.356709,9.208138,Rare_var,Rare_var,Rare_var,Rare_var,Rare_var,...,0,6,2010,Rare_var,Normal,0,0,0,0,0
4,1465,120,Rare_var,3.7612,8.518193,Rare_var,Rare_var,Rare_var,Rare_var,Rare_var,...,0,1,2010,Rare_var,Normal,0,0,0,0,0


In [49]:
traindataset.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,RL,4.174387,9.041922,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,12.247694,0,0,0
1,2,20,RL,4.382027,9.169518,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,12.109011,0,0,0
2,3,60,RL,4.219508,9.328123,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,12.317167,0,0,0
3,4,70,RL,4.094345,9.164296,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,11.849398,0,0,0
4,5,60,RL,4.430817,9.565214,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,12.429216,0,0,0


In [50]:
for feature in categorical_feature:
    print('The feature is : {} , no of categories are : {}'.format(feature, len(traindataset[feature].unique())))

The feature is : MSZoning , no of categories are : 5
The feature is : Street , no of categories are : 2
The feature is : Alley , no of categories are : 3
The feature is : LotShape , no of categories are : 4
The feature is : LandContour , no of categories are : 4
The feature is : Utilities , no of categories are : 2
The feature is : LotConfig , no of categories are : 5
The feature is : LandSlope , no of categories are : 3
The feature is : Neighborhood , no of categories are : 23
The feature is : Condition1 , no of categories are : 6
The feature is : Condition2 , no of categories are : 2
The feature is : BldgType , no of categories are : 5
The feature is : HouseStyle , no of categories are : 6
The feature is : RoofStyle , no of categories are : 3
The feature is : RoofMatl , no of categories are : 2
The feature is : Exterior1st , no of categories are : 11
The feature is : Exterior2nd , no of categories are : 11
The feature is : MasVnrType , no of categories are : 5
The feature is : ExterQ

In [51]:
traindataset['MSZoning'].unique()

array(['RL', 'RM', 'Rare_var', 'FV', 'RH'], dtype=object)

In [52]:
for feature in categorical_feature:
    labels_ordered=traindataset.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    traindataset[feature]=traindataset[feature].map(labels_ordered)

In [53]:
traindataset.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,3,4.174387,9.041922,1,2,0,1,1,...,2,0,2,2008,2,3,12.247694,0,0,0
1,2,20,3,4.382027,9.169518,1,2,0,1,1,...,2,0,5,2007,2,3,12.109011,0,0,0
2,3,60,3,4.219508,9.328123,1,2,1,1,1,...,2,0,9,2008,2,3,12.317167,0,0,0
3,4,70,3,4.094345,9.164296,1,2,1,1,1,...,2,0,2,2006,2,0,11.849398,0,0,0
4,5,60,3,4.430817,9.565214,1,2,1,1,1,...,2,0,12,2008,2,3,12.429216,0,0,0
5,6,50,3,4.442651,9.554993,1,2,1,1,1,...,1,700,10,2009,2,3,11.8706,0,0,0
6,7,20,3,4.317488,9.218705,1,2,0,1,1,...,2,0,8,2007,2,3,12.634603,0,0,0
7,8,60,3,4.234107,9.247829,1,2,1,1,1,...,1,350,11,2009,2,3,12.206073,1,0,0
8,9,50,1,3.931826,8.719317,1,2,0,1,1,...,2,0,4,2008,2,0,11.77452,0,0,0
9,10,190,3,3.912023,8.911934,1,2,0,1,1,...,2,0,1,2008,2,3,11.67844,0,0,0


In [54]:
for feature in categorical_feature_test:
    labels_ordered=testdataset.groupby([feature]).mean().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    testdataset[feature]=testdataset[feature].map(labels_ordered)

In [55]:
testdataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,20,0,4.382027,9.360655,0,0,0,0,0,...,0,6,2010,0,2,0,0,0,0,0
1,1462,20,0,4.394449,9.565704,0,0,0,0,0,...,12500,6,2010,0,2,0,0,0,0,0
2,1463,60,0,4.304065,9.534595,0,0,0,0,0,...,0,3,2010,0,2,0,0,0,0,0
3,1464,60,0,4.356709,9.208138,0,0,0,0,0,...,0,6,2010,0,2,0,0,0,0,0
4,1465,120,0,3.7612,8.518193,0,0,0,0,0,...,0,1,2010,0,2,0,0,0,0,0


In [56]:
# feature scaling
# each feature are measured in different units, we need to scale them to same measure to apply ML algo's

feature_scale = [ feature for feature in traindataset.columns if feature not in ['Id', 'SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(traindataset[feature_scale])

MinMaxScaler()

In [57]:
feature_scale_test = [ feature for feature in testdataset.columns if feature not in ['Id']]

from sklearn.preprocessing import MinMaxScaler
scaler_test = MinMaxScaler()
scaler_test.fit(testdataset[feature_scale_test])

MinMaxScaler()

In [58]:
## Min max scaler will scale down the data to 0, 1

x_train = pd.concat([traindataset[['Id', 'SalePrice']].reset_index(drop = True), pd.DataFrame(scaler.transform(traindataset[feature_scale]),
           columns=feature_scale)], axis=1)

In [59]:
x_train.head(10)

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.090909,0.5,0.666667,0.75,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.727273,0.5,0.666667,0.75,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,0.090909,0.0,0.666667,0.0,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,...,1.0,1.0,0.0,1.0,0.5,0.666667,0.75,0.0,0.0,0.0
5,6,11.8706,0.176471,0.75,0.517503,0.466761,1.0,1.0,0.333333,0.333333,...,0.5,0.5,0.045161,0.818182,0.75,0.666667,0.75,0.0,0.0,0.0
6,7,12.634603,0.0,0.75,0.471175,0.400943,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.636364,0.25,0.666667,0.75,0.0,0.0,0.0
7,8,12.206073,0.235294,0.75,0.440313,0.406643,1.0,1.0,0.333333,0.333333,...,1.0,0.5,0.022581,0.909091,0.75,0.666667,0.75,1.0,0.0,0.0
8,9,11.77452,0.176471,0.25,0.328426,0.303205,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.272727,0.5,0.666667,0.0,0.0,0.0,0.0
9,10,11.67844,1.0,0.75,0.321097,0.340903,1.0,1.0,0.0,0.333333,...,1.0,1.0,0.0,0.0,0.5,0.666667,0.75,0.0,0.0,0.0


In [60]:
x_test = pd.concat([testdataset[['Id']].reset_index(drop = True), pd.DataFrame(scaler_test.transform(testdataset[feature_scale_test]),
           columns=feature_scale_test)], axis=1)

In [61]:
x_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,1461,0.0,0.0,0.593445,0.56636,0.0,0.0,0.0,0.0,0.0,...,0.0,0.454545,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
1,1462,0.0,0.0,0.598957,0.622527,0.0,0.0,0.0,0.0,0.0,...,0.735294,0.454545,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
2,1463,0.235294,0.0,0.558854,0.614005,0.0,0.0,0.0,0.0,0.0,...,0.0,0.181818,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
3,1464,0.235294,0.0,0.582212,0.524583,0.0,0.0,0.0,0.0,0.0,...,0.0,0.454545,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
4,1465,0.588235,0.0,0.317987,0.335596,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [62]:
x_train.to_csv('X_train.csv', index=False)

In [63]:
x_test.to_csv('X_test.csv', index=False)