In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
my_colors = ['Red' ,'Orange' , 'Yellow' , 'Green' , 'Blue' , 'Indigo' , 'Violet']

In [2]:
dataset=pd.read_csv('train.csv')

In [3]:
pd.pandas.set_option('display.max_columns',None)

In [4]:
features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtype=='O']
for feature in features_nan:
    print('{} : {} % missing values'.format(feature,np.round(dataset[feature].isnull().mean(),4)))

Alley : 0.9377 % missing values
MasVnrType : 0.0055 % missing values
BsmtQual : 0.0253 % missing values
BsmtCond : 0.0253 % missing values
BsmtExposure : 0.026 % missing values
BsmtFinType1 : 0.0253 % missing values
BsmtFinType2 : 0.026 % missing values
FireplaceQu : 0.4726 % missing values
GarageType : 0.0555 % missing values
GarageFinish : 0.0555 % missing values
GarageQual : 0.0555 % missing values
GarageCond : 0.0555 % missing values
PoolQC : 0.9952 % missing values
Fence : 0.8075 % missing values
MiscFeature : 0.963 % missing values


In [5]:
def replace_cat_feature(dataset,features_nan):
    data=dataset.copy()
    data[features_nan]=data[features_nan].fillna('Missing')
    return data

In [6]:
dataset=replace_cat_feature(dataset,features_nan)
dataset[features_nan].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [7]:
numrical_features_nan=[feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtype!='O']
numrical_features_nan
for feature in numrical_features_nan:
    print('{} % missing value of {}_feature'.format(np.round(dataset[feature].isnull().mean(),4),feature))

0.1774 % missing value of LotFrontage_feature
0.0055 % missing value of MasVnrArea_feature
0.0555 % missing value of GarageYrBlt_feature


In [8]:
for feature in numrical_features_nan:
    median_value=dataset[feature].median()
    dataset[feature+'nan']=np.where(dataset[feature].isnull(),1,0)
    dataset[feature].fillna(median_value,inplace=True)

dataset[numrical_features_nan].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [9]:
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    dataset[feature]=dataset['YrSold']-dataset[feature]

In [10]:
dataset[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0


In [11]:
num_features=['LotFrontage','LotArea','1stFlrSF','GrLivArea']

for feature in num_features:
    dataset[feature]=np.log(dataset[feature])

In [12]:
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtypes=='O']
categorical_features

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [13]:
for feature in categorical_features:
    temp=dataset.groupby(feature).count()/len(dataset)
    temp_df=temp[temp>0.01].index
#     print(temp_df)
    dataset[feature]=np.where(dataset[feature].isin(temp_df),dataset[feature],"RARE_VAR")


In [14]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,RL,4.174387,9.041922,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,5,5,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,6.75227,854,0,7.444249,1,0,2,1,3,1,Gd,8,Typ,0,Missing,Attchd,5.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,Missing,Missing,Missing,0,2,2008,WD,Normal,208500,0,0,0
1,2,20,RL,4.382027,9.169518,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,31,31,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,7.140453,0,0,7.140453,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,31.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,Missing,Missing,Missing,0,5,2007,WD,Normal,181500,0,0,0
2,3,60,RL,4.219508,9.328123,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,7,6,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,6.824374,866,0,7.487734,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,7.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,Missing,Missing,Missing,0,9,2008,WD,Normal,223500,0,0,0
3,4,70,RL,4.094345,9.164296,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,91,36,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,6.867974,756,0,7.448334,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,8.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,Missing,Missing,Missing,0,2,2006,WD,Abnorml,140000,0,0,0
4,5,60,RL,4.430817,9.565214,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,8,8,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,7.04316,1053,0,7.695303,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,8.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,Missing,Missing,Missing,0,12,2008,WD,Normal,250000,0,0,0


In [17]:
for feature in categorical_features:
    labels_ordered=dataset.groupby(feature)["Id"].mean().sort_values().index
#     print(labels_ordered)
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
#     print(labels_ordered)
    dataset[feature]=dataset[feature].map(labels_ordered)
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,3,4.174387,9.041922,1,1,2,1,0,2,1,9,4,5,2,2,7,5,5,5,3,4,5,6,1,196.0,3,2,0,3,4,0,3,706,4,0,150,856,1,2,1,4,6.75227,854,0,7.444249,1,0,2,1,3,1,3,8,4,0,4,4,5.0,3,2,548,3,3,1,0,61,0,0,0,0,1,1,1,0,2,2008,6,3,208500,0,0,0
1,2,20,3,4.382027,9.169518,1,1,2,1,0,3,1,3,7,5,2,4,6,8,31,31,3,4,9,8,2,0.0,2,2,1,3,4,1,1,978,4,0,284,1262,1,2,1,4,7.140453,0,0,7.140453,0,1,2,0,3,1,2,6,4,1,3,4,31.0,3,2,460,3,3,1,298,0,0,0,0,0,1,1,1,0,5,2007,6,3,181500,0,0,0
2,3,60,3,4.219508,9.328123,1,1,1,1,0,2,1,9,4,5,2,2,7,5,7,6,3,4,5,6,1,162.0,3,2,0,3,4,4,3,486,4,0,434,920,1,2,1,4,6.824374,866,0,7.487734,1,0,2,1,3,1,3,6,4,1,3,4,7.0,3,2,608,3,3,1,0,42,0,0,0,0,1,1,1,0,9,2008,6,3,223500,0,0,0
3,4,70,3,4.094345,9.164296,1,1,1,1,0,0,1,21,4,5,2,2,7,5,91,36,3,4,2,7,2,0.0,2,2,2,4,3,0,1,216,4,0,540,756,1,1,1,4,6.867974,756,0,7.448334,1,0,1,0,3,1,3,7,4,1,5,5,8.0,1,3,642,3,3,1,0,35,272,0,0,0,1,1,1,0,2,2006,6,4,140000,0,0,0
4,5,60,3,4.430817,9.565214,1,1,1,1,0,3,1,22,4,5,2,2,8,5,8,8,3,4,5,6,1,350.0,3,2,0,3,4,3,3,655,4,0,490,1145,1,2,1,4,7.04316,1053,0,7.695303,1,0,2,1,4,1,3,9,4,1,3,4,8.0,3,3,836,3,3,1,192,84,0,0,0,0,1,1,1,0,12,2008,6,3,250000,0,0,0


In [18]:
feature_Scale=[feature for feature in dataset.columns if feature not in ['Id']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[feature_Scale])

In [19]:
scaler.transform(dataset[feature_Scale])

array([[0.23529412, 0.75      , 0.41820812, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.49506375, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.75      , 0.434909  , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29411765, 0.75      , 0.42385922, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.434909  , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.47117546, ..., 0.        , 0.        ,
        0.        ]])

In [20]:
data=pd.concat([dataset[["Id"]].reset_index(drop=True),pd.DataFrame(scaler.transform(dataset[feature_Scale]),columns=feature_Scale)],axis=1)
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,0.235294,0.75,0.418208,0.366344,1.0,0.5,0.666667,0.333333,0.0,0.5,0.5,0.375,0.5,0.714286,0.5,0.285714,0.666667,0.5,0.036765,0.098361,0.6,0.571429,0.357143,0.4,0.25,0.1225,1.0,0.5,0.0,0.75,1.0,0.0,0.5,0.125089,0.666667,0.0,0.064212,0.140098,0.2,0.5,1.0,0.8,0.356155,0.413559,0.0,0.577712,0.333333,0.0,0.666667,0.5,0.375,0.333333,1.0,0.5,0.666667,0.0,0.8,0.666667,0.046729,1.0,0.5,0.38646,0.6,0.6,0.5,0.0,0.111517,0.0,0.0,0.0,0.0,0.333333,0.25,0.25,0.0,0.090909,0.5,0.75,0.6,0.241078,0.0,0.0,0.0
1,2,0.0,0.75,0.495064,0.391317,1.0,0.5,0.666667,0.333333,0.0,0.75,0.5,0.125,0.875,0.714286,0.5,0.571429,0.555556,0.875,0.227941,0.52459,0.6,0.571429,0.642857,0.533333,0.5,0.0,0.666667,0.5,0.2,0.75,1.0,0.25,0.166667,0.173281,0.666667,0.0,0.121575,0.206547,0.2,0.5,1.0,0.8,0.503056,0.0,0.0,0.470245,0.0,0.5,0.666667,0.0,0.375,0.333333,0.666667,0.333333,0.666667,0.333333,0.6,0.666667,0.28972,1.0,0.5,0.324401,0.6,0.6,0.5,0.347725,0.0,0.0,0.0,0.0,0.0,0.333333,0.25,0.25,0.0,0.363636,0.25,0.75,0.6,0.203583,0.0,0.0,0.0
2,3,0.235294,0.75,0.434909,0.422359,1.0,0.5,0.333333,0.333333,0.0,0.5,0.5,0.375,0.5,0.714286,0.5,0.285714,0.666667,0.5,0.051471,0.114754,0.6,0.571429,0.357143,0.4,0.25,0.10125,1.0,0.5,0.0,0.75,1.0,1.0,0.5,0.086109,0.666667,0.0,0.185788,0.150573,0.2,0.5,1.0,0.8,0.383441,0.41937,0.0,0.593095,0.333333,0.0,0.666667,0.5,0.375,0.333333,1.0,0.333333,0.666667,0.333333,0.6,0.666667,0.065421,1.0,0.5,0.428773,0.6,0.6,0.5,0.0,0.076782,0.0,0.0,0.0,0.0,0.333333,0.25,0.25,0.0,0.727273,0.5,0.75,0.6,0.261908,0.0,0.0,0.0
3,4,0.294118,0.75,0.388581,0.390295,1.0,0.5,0.333333,0.333333,0.0,0.0,0.5,0.875,0.5,0.714286,0.5,0.285714,0.666667,0.5,0.669118,0.606557,0.6,0.571429,0.142857,0.466667,0.5,0.0,0.666667,0.5,0.4,1.0,0.75,0.0,0.166667,0.038271,0.666667,0.0,0.231164,0.123732,0.2,0.25,1.0,0.8,0.399941,0.366102,0.0,0.579157,0.333333,0.0,0.333333,0.0,0.375,0.333333,1.0,0.416667,0.666667,0.333333,1.0,0.833333,0.074766,0.333333,0.75,0.45275,0.6,0.6,0.5,0.0,0.063985,0.492754,0.0,0.0,0.0,0.333333,0.25,0.25,0.0,0.090909,0.0,0.75,0.8,0.145952,0.0,0.0,0.0
4,5,0.235294,0.75,0.513123,0.468761,1.0,0.5,0.333333,0.333333,0.0,0.75,0.5,0.916667,0.5,0.714286,0.5,0.285714,0.777778,0.5,0.058824,0.147541,0.6,0.571429,0.357143,0.4,0.25,0.21875,1.0,0.5,0.0,0.75,1.0,0.75,0.5,0.116052,0.666667,0.0,0.20976,0.187398,0.2,0.5,1.0,0.8,0.466237,0.509927,0.0,0.666523,0.333333,0.0,0.666667,0.5,0.5,0.333333,1.0,0.583333,0.666667,0.333333,0.6,0.666667,0.074766,1.0,0.75,0.589563,0.6,0.6,0.5,0.224037,0.153565,0.0,0.0,0.0,0.0,0.333333,0.25,0.25,0.0,1.0,0.5,0.75,0.6,0.298709,0.0,0.0,0.0


In [22]:
data.to_csv('X_test.csv',index=False)