In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw training data; later cells work on a copy so this frame stays untouched.
df_train=pd.read_csv("train.csv")

In [70]:
# Work on a copy so that df_train keeps the data exactly as read from disk.
train_data=df_train.copy()

In [71]:
# (rows, columns) of the working copy before any cleaning.
train_data.shape

(1460, 81)

In [72]:
# Preview the first rows of the working copy.
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [73]:
# Names of all columns that contain at least one missing value,
# in the same order as they appear in train_data.
na_cols=[name for name,n_missing in train_data.isnull().sum().items() if n_missing>0]
na_cols

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [74]:
# Identify columns where more than 10% of the rows are missing.
# The share is taken from the frame itself (mean of the null mask) instead of
# dividing by a hard-coded 1460, so the cell stays correct if the row count changes.
drop_col=[col for col in na_cols if train_data[col].isnull().mean()*100>10]
drop_col

['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

In [75]:
# Remove the columns identified above as having more than 10% missing values.
train_data.drop(columns=drop_col,inplace=True)

In [76]:
# Remove the Id column (presumably just a row identifier with no predictive value).
train_data.drop(columns=['Id'],inplace=True)

In [77]:
# Check the column count after the drops above.
train_data.shape

(1460, 74)

In [78]:
# Columns whose name marks them as a year ('Yr' or 'Year' appears in the name).
year_col=[name for name in train_data.columns if any(tag in name for tag in ('Yr','Year'))]
year_col

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [79]:
# Replace each year column by the difference between the sale year and that
# year (e.g. YearBuilt becomes the house age at the time of sale).
# The values are read from the untouched df_train rather than from train_data,
# so re-running this cell recomputes the same ages instead of subtracting a
# second time, and it does not need YrSold to still be present in train_data.
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    train_data[feature]=df_train['YrSold']-df_train[feature]

In [80]:
# Inspect the year columns after converting them to differences from the sale year.
train_data[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0


In [81]:
# YrSold is dropped: its information has already been folded into the
# year-difference columns computed above.
train_data.drop(columns=['YrSold'],inplace=True)

In [82]:
# Count missing values in the year-difference columns before imputing.
train_data[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].isnull().sum()

YearBuilt        0
YearRemodAdd     0
GarageYrBlt     81
dtype: int64

In [83]:
# Fill the missing GarageYrBlt values with the column median.
# NOTE(review): the gaps presumably belong to houses without a garage - confirm
# that a median age is the intended stand-in for them.
garage_median=train_data['GarageYrBlt'].median()
train_data['GarageYrBlt']=train_data['GarageYrBlt'].fillna(garage_median)

In [84]:
# Verify that the year-difference columns have no missing values left.
train_data[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].isnull().sum()

YearBuilt       0
YearRemodAdd    0
GarageYrBlt     0
dtype: int64

In [85]:
# Shape check after dropping YrSold.
train_data.shape

(1460, 73)

In [86]:
# Categorical features: every column stored with the object ('O') dtype.
catg_feature=[name for name,dtype in train_data.dtypes.items() if dtype=='O']
catg_feature

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [87]:
# Categorical features that still contain at least one missing value.
na_catg=[name for name in catg_feature if train_data[name].isnull().any()]
na_catg

['MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

In [88]:
# Impute every categorical column that has gaps with its most frequent value
# (the first entry returned by mode()).
for column in na_catg:
    most_frequent=train_data[column].mode()[0]
    train_data[column]=train_data[column].fillna(most_frequent)

In [89]:
# Verify that no missing values remain in the imputed categorical columns.
train_data[na_catg].isnull().sum()

MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
dtype: int64

In [90]:
# Numeric features: every non-object column except the year columns in year_col.
num_feature=[name for name,dtype in train_data.dtypes.items() if dtype!='O' and name not in year_col]
num_feature

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'SalePrice']

In [91]:
# Numeric features that contain at least one missing value.
na_num=[name for name in num_feature if train_data[name].isnull().any()]
na_num

['MasVnrArea']

In [92]:
# Impute numeric columns with their median; the median was chosen because
# outliers were found while analysing the data.
for column in na_num:
    column_median=train_data[column].median()
    train_data[column]=train_data[column].fillna(column_median)

In [93]:
# Missing-value count per column, largest first, so any remaining gaps would show at the top.
train_data.isnull().sum().sort_values(ascending=False)

SalePrice       0
TotalBsmtSF     0
RoofMatl        0
Exterior1st     0
Exterior2nd     0
               ..
BedroomAbvGr    0
KitchenAbvGr    0
KitchenQual     0
TotRmsAbvGrd    0
MSSubClass      0
Length: 73, dtype: int64

In [94]:
# Preview the frame after dropping columns and imputing missing values.
train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,61,0,0,0,0,0,2,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,0,5,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,42,0,0,0,0,0,9,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,35,272,0,0,0,0,2,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,84,0,0,0,0,0,12,WD,Normal,250000


In [95]:
# Log-transform the continuous features.
# The logarithm is taken from the untouched df_train values (these three
# columns are not modified anywhere before this cell), so re-running the cell
# yields the same result instead of applying the log a second time.

continuous_feature=['LotArea', '1stFlrSF', 'GrLivArea']
for feature in continuous_feature:
    train_data[feature]=np.log(df_train[feature])

In [96]:
# Inspect the log-transformed columns.
train_data[continuous_feature].head()

Unnamed: 0,LotArea,1stFlrSF,GrLivArea
0,9.041922,6.75227,7.444249
1,9.169518,7.140453,7.140453
2,9.328123,6.824374,7.487734
3,9.164296,6.867974,7.448334
4,9.565214,7.04316,7.695303


In [97]:
# Re-check the missing-value counts after the log transform.
train_data.isnull().sum().sort_values(ascending=False)

SalePrice       0
TotalBsmtSF     0
RoofMatl        0
Exterior1st     0
Exterior2nd     0
               ..
BedroomAbvGr    0
KitchenAbvGr    0
KitchenQual     0
TotRmsAbvGrd    0
MSSubClass      0
Length: 73, dtype: int64

In [98]:
# Final shape of the prepared training frame (still includes SalePrice).
train_data.shape

(1460, 73)

In [99]:
# Load the test set from testv1.csv.
# NOTE(review): this file appears to be already preprocessed (its LotArea values
# are on a log scale in the preview below) - presumably with the same steps as
# the training data; confirm where it is produced.
df_test=pd.read_csv('testv1.csv')

In [100]:
# The test frame has one column fewer than train_data because it has no SalePrice.
df_test.shape

(1459, 72)

In [101]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,SaleType,SaleCondition
0,20,RH,9.360655,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,140,0,0,0,120,0,0,6,WD,Normal
1,20,RL,9.565704,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,...,393,36,0,0,0,0,12500,6,WD,Normal
2,60,RL,9.534595,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,212,34,0,0,0,0,0,3,WD,Normal
3,60,RL,9.208138,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,360,36,0,0,0,0,0,6,WD,Normal
4,120,RL,8.518193,Pave,IR1,HLS,AllPub,Inside,Gtl,StoneBr,...,0,82,0,0,144,0,0,1,WD,Normal


In [102]:
# Stack train and test rows so that the categorical encoding below sees the
# same set of levels for both. The two frames do not share the same columns
# (df_test has no SalePrice), so pandas has to align them: sort=True pins the
# alphabetical column order that the implicit default produced, which removes
# the FutureWarning and keeps the result stable when the default changes.
# Row labels are kept as-is, so the training and test rows have overlapping
# labels; later cells slice by position (iloc).
train_test=pd.concat([train_data,df_test],axis=0,sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [103]:
# Rows of train and test combined; the column count matches train_data.
train_test.shape

(2919, 73)

In [104]:
# Preview the combined frame (columns now appear in alphabetical order).
train_test.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SalePrice,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd
0,6.75227,854,0,3,1Fam,TA,No,706.0,0.0,GLQ,...,208500.0,WD,0,Pave,8,856.0,AllPub,0,5,5
1,7.140453,0,0,3,1Fam,TA,Gd,978.0,0.0,ALQ,...,181500.0,WD,0,Pave,6,1262.0,AllPub,298,31,31
2,6.824374,866,0,3,1Fam,TA,Mn,486.0,0.0,GLQ,...,223500.0,WD,0,Pave,6,920.0,AllPub,0,7,6
3,6.867974,756,0,3,1Fam,Gd,No,216.0,0.0,ALQ,...,140000.0,WD,0,Pave,7,756.0,AllPub,0,91,36
4,7.04316,1053,0,4,1Fam,TA,Av,655.0,0.0,GLQ,...,250000.0,WD,0,Pave,9,1145.0,AllPub,192,8,8


In [105]:
# Object-dtype columns of the combined frame; these are one-hot encoded below.
categorical_column=[name for name,dtype in train_test.dtypes.items() if dtype=='object']
categorical_column

['BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Foundation',
 'Functional',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSZoning',
 'MasVnrType',
 'Neighborhood',
 'PavedDrive',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities']

In [106]:
# One-hot encode the categorical columns on the combined train+test frame so
# both parts end up with the same dummy columns. drop_first=True omits the
# first level of each feature.
catg_dummy=pd.get_dummies(train_test[categorical_column],drop_first=True)

In [107]:
# Append the dummy columns side by side; the original categorical columns are
# still present at this point and are dropped in a later cell.
train_test=pd.concat([train_test,catg_dummy],axis=1)

In [108]:
# Shape with both the original categorical columns and their dummies.
train_test.shape

(2919, 269)

In [109]:
# The raw categorical columns are no longer needed once their dummies exist.
train_test.drop(columns=categorical_column,inplace=True)

In [110]:
# Shape after removing the raw categorical columns.
train_test.shape

(2919, 231)

In [111]:
# Preview the fully numeric combined frame.
train_test.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Pave,Utilities_NoSeWa
0,6.75227,854,0,3,706.0,0.0,1.0,0.0,150.0,0,...,0,0,0,0,0,0,0,1,1,0
1,7.140453,0,0,3,978.0,0.0,0.0,1.0,284.0,0,...,0,0,0,0,0,0,0,1,1,0
2,6.824374,866,0,3,486.0,0.0,1.0,0.0,434.0,0,...,0,0,0,0,0,0,0,1,1,0
3,6.867974,756,0,3,216.0,0.0,1.0,0.0,540.0,272,...,0,0,0,0,0,0,0,1,1,0
4,7.04316,1053,0,4,655.0,0.0,1.0,0.0,490.0,0,...,0,0,0,0,0,0,0,1,1,0


In [112]:
# Separate the predictors from the target column.
# y is a Series here; a following cell re-binds it to a one-column DataFrame.
y=train_test['SalePrice']
X=train_test.drop(columns=['SalePrice'])

In [135]:
# Keep the target as a one-column DataFrame (double brackets) instead of a Series.
# NOTE(review): the rows that came from df_test have no SalePrice, so they are
# presumably NaN here; only the leading training rows are used later.
y=train_test[['SalePrice']]

In [136]:
# Preview the target frame.
y.head()

Unnamed: 0,SalePrice
0,208500.0
1,181500.0
2,223500.0
3,140000.0
4,250000.0


In [113]:
# Predictor matrix: all encoded columns except SalePrice.
X.shape

(2919, 230)

In [114]:
# Preview the predictor matrix.
X.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Pave,Utilities_NoSeWa
0,6.75227,854,0,3,706.0,0.0,1.0,0.0,150.0,0,...,0,0,0,0,0,0,0,1,1,0
1,7.140453,0,0,3,978.0,0.0,0.0,1.0,284.0,0,...,0,0,0,0,0,0,0,1,1,0
2,6.824374,866,0,3,486.0,0.0,1.0,0.0,434.0,0,...,0,0,0,0,0,0,0,1,1,0
3,6.867974,756,0,3,216.0,0.0,1.0,0.0,540.0,272,...,0,0,0,0,0,0,0,1,1,0
4,7.04316,1053,0,4,655.0,0.0,1.0,0.0,490.0,0,...,0,0,0,0,0,0,0,1,1,0


In [116]:
# Split the combined predictors back into train and test by position.
# The training rows were stacked first, so the boundary is the number of rows
# in train_data; using len() avoids a hard-coded 1460.
n_train=len(train_data)
X_train=X.iloc[:n_train]
X_test=X.iloc[n_train:]

In [137]:
# Target values for the training rows only (they come first in the combined
# frame); the boundary is taken from train_data instead of a hard-coded 1460.
y_train=y.iloc[:len(train_data)]

In [121]:
# Remember the column names: scaling below returns a bare array without them.
Xtrain_column=list(X_train.columns)
Xtrain_column

['1stFlrSF',
 '2ndFlrSF',
 '3SsnPorch',
 'BedroomAbvGr',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtUnfSF',
 'EnclosedPorch',
 'Fireplaces',
 'FullBath',
 'GarageArea',
 'GarageCars',
 'GarageYrBlt',
 'GrLivArea',
 'HalfBath',
 'KitchenAbvGr',
 'LotArea',
 'LowQualFinSF',
 'MSSubClass',
 'MasVnrArea',
 'MiscVal',
 'MoSold',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'PoolArea',
 'ScreenPorch',
 'TotRmsAbvGrd',
 'TotalBsmtSF',
 'WoodDeckSF',
 'YearBuilt',
 'YearRemodAdd',
 'BldgType_2fmCon',
 'BldgType_Duplex',
 'BldgType_Twnhs',
 'BldgType_TwnhsE',
 'BsmtCond_Gd',
 'BsmtCond_Po',
 'BsmtCond_TA',
 'BsmtExposure_Gd',
 'BsmtExposure_Mn',
 'BsmtExposure_No',
 'BsmtFinType1_BLQ',
 'BsmtFinType1_GLQ',
 'BsmtFinType1_LwQ',
 'BsmtFinType1_Rec',
 'BsmtFinType1_Unf',
 'BsmtFinType2_BLQ',
 'BsmtFinType2_GLQ',
 'BsmtFinType2_LwQ',
 'BsmtFinType2_Rec',
 'BsmtFinType2_Unf',
 'BsmtQual_Fa',
 'BsmtQual_Gd',
 'BsmtQual_TA',
 'CentralAir_Y',
 'Condition1_Feedr',
 'Condit

In [122]:
# Column names of the test matrix, kept so they can be re-attached after scaling.
Xtest_column=list(X_test.columns)
Xtest_column

['1stFlrSF',
 '2ndFlrSF',
 '3SsnPorch',
 'BedroomAbvGr',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtUnfSF',
 'EnclosedPorch',
 'Fireplaces',
 'FullBath',
 'GarageArea',
 'GarageCars',
 'GarageYrBlt',
 'GrLivArea',
 'HalfBath',
 'KitchenAbvGr',
 'LotArea',
 'LowQualFinSF',
 'MSSubClass',
 'MasVnrArea',
 'MiscVal',
 'MoSold',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'PoolArea',
 'ScreenPorch',
 'TotRmsAbvGrd',
 'TotalBsmtSF',
 'WoodDeckSF',
 'YearBuilt',
 'YearRemodAdd',
 'BldgType_2fmCon',
 'BldgType_Duplex',
 'BldgType_Twnhs',
 'BldgType_TwnhsE',
 'BsmtCond_Gd',
 'BsmtCond_Po',
 'BsmtCond_TA',
 'BsmtExposure_Gd',
 'BsmtExposure_Mn',
 'BsmtExposure_No',
 'BsmtFinType1_BLQ',
 'BsmtFinType1_GLQ',
 'BsmtFinType1_LwQ',
 'BsmtFinType1_Rec',
 'BsmtFinType1_Unf',
 'BsmtFinType2_BLQ',
 'BsmtFinType2_GLQ',
 'BsmtFinType2_LwQ',
 'BsmtFinType2_Rec',
 'BsmtFinType2_Unf',
 'BsmtQual_Fa',
 'BsmtQual_Gd',
 'BsmtQual_TA',
 'CentralAir_Y',
 'Condition1_Feedr',
 'Condit

In [123]:
from sklearn.preprocessing import MinMaxScaler
# Learn the min/max scaling from the training rows only, then apply that same
# scaling to both train and test so no test information enters the fit.
# Both names are re-bound to the arrays returned by the scaler.
scaler=MinMaxScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [127]:
# Wrap the scaled training array in a DataFrame again, restoring the saved column names.
X_train=pd.DataFrame(X_train,columns=Xtrain_column)
X_train

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Pave,Utilities_NoSeWa
0,0.356155,0.413559,0.0,0.375,0.125089,0.000000,0.333333,0.0,0.064212,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.503056,0.000000,0.0,0.375,0.173281,0.000000,0.000000,0.5,0.121575,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.383441,0.419370,0.0,0.375,0.086109,0.000000,0.333333,0.0,0.185788,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.399941,0.366102,0.0,0.375,0.038271,0.000000,0.333333,0.0,0.231164,0.492754,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.466237,0.509927,0.0,0.500,0.116052,0.000000,0.333333,0.0,0.209760,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.396777,0.336077,0.0,0.375,0.000000,0.000000,0.000000,0.0,0.407962,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1456,0.690872,0.000000,0.0,0.375,0.139972,0.110583,0.333333,0.0,0.252140,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1457,0.480189,0.557869,0.0,0.500,0.048724,0.000000,0.000000,0.0,0.375428,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1458,0.443419,0.000000,0.0,0.250,0.008682,0.698100,0.333333,0.0,0.000000,0.202899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [128]:
# Wrap the scaled test array in a DataFrame again, restoring the saved column names.
X_test=pd.DataFrame(X_test,columns=Xtest_column)
X_test

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Pave,Utilities_NoSeWa
0,0.373438,0.000000,0.0,0.250,0.082920,0.097693,0.000000,0.0,0.115582,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.522632,0.000000,0.0,0.375,0.163536,0.000000,0.000000,0.0,0.173801,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.386718,0.339467,0.0,0.375,0.140149,0.000000,0.000000,0.0,0.058647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.385901,0.328329,0.0,0.375,0.106662,0.000000,0.000000,0.0,0.138699,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.508416,0.000000,0.0,0.250,0.046598,0.000000,0.000000,0.0,0.435360,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.185992,0.264407,0.0,0.375,0.000000,0.000000,0.000000,0.0,0.233733,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1455,0.185992,0.264407,0.0,0.375,0.044649,0.000000,0.000000,0.0,0.125856,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1456,0.491486,0.000000,0.0,0.500,0.216867,0.000000,0.333333,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1457,0.403469,0.000000,0.0,0.375,0.059709,0.000000,0.000000,0.5,0.246147,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [130]:
# Persist the scaled feature matrices (all columns, before feature selection),
# without writing the index.
X_train.to_csv('X_train.csv',index=False)
X_test.to_csv('X_test.csv',index=False)

In [139]:
# Persist the training target. At this point in the notebook it is still on the
# original SalePrice scale; the log transform happens in a later cell.
y_train.to_csv('y_train.csv',index=False)

In [143]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [160]:
# SalePrice was not log-transformed earlier, so it is done here before fitting.
# y_train is derived from the untransformed y (its first len(X_train) rows,
# i.e. the training rows) instead of from y_train itself, so re-running this
# cell cannot apply the logarithm twice, and the log is computed only once.
y_train=np.log(y.iloc[:len(X_train)])

# Lasso-based feature selection against the log target.
best_feat=SelectFromModel(Lasso(alpha=0.005,random_state=0))
best_feat.fit(X_train,y_train)

In [161]:
# Inspect the log-transformed training target.
y_train

Unnamed: 0,SalePrice
0,12.247694
1,12.109011
2,12.317167
3,11.849398
4,12.429216
...,...
1455,12.072541
1456,12.254863
1457,12.493130
1458,11.864462


In [154]:
# Boolean mask over the fitted columns: True where SelectFromModel kept the feature.
best_feat.get_support()

array([ True, False, False, False, False, False, False, False, False,
       False,  True, False, False,  True, False,  True, False, False,
       False, False,  True, False, False, False, False, False,  True,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False,  True, False,
        True, False, False,  True, False, False, False, False, False,
       False, False,  True,  True, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [155]:
# Names of the columns kept by the selector, plus a short summary of how many
# features were available, how many were kept, and how many Lasso coefficients
# are exactly zero.
support_mask=best_feat.get_support()
select_feat=X_train.columns[support_mask]
n_zero_coef=int((best_feat.estimator_.coef_==0).sum())

print('Total feat :',X_train.shape[1])
print('selected feat :',len(select_feat))
print('feature with coeff 0 :',n_zero_coef)

Total feat : 230
selected feat : 28
feature with coeff 0 : 202


In [156]:
# List the selected feature names.
select_feat

Index(['1stFlrSF', 'Fireplaces', 'GarageCars', 'GrLivArea', 'MSSubClass',
       'OverallQual', 'YearRemodAdd', 'BsmtExposure_Gd', 'BsmtExposure_No',
       'BsmtFinType1_GLQ', 'BsmtFinType1_Unf', 'BsmtQual_TA', 'CentralAir_Y',
       'Condition1_Norm', 'ExterQual_TA', 'Foundation_PConc',
       'GarageFinish_Unf', 'HeatingQC_TA', 'KitchenQual_TA', 'LotShape_Reg',
       'MSZoning_RL', 'MSZoning_RM', 'MasVnrType_None', 'Neighborhood_Edwards',
       'Neighborhood_NridgHt', 'PavedDrive_Y', 'RoofStyle_Gable',
       'SaleType_New'],
      dtype='object')

In [157]:
# Restrict the training matrix to the selected columns.
# NOTE(review): this re-binds X_train, so earlier cells that expect the full
# set of columns (e.g. the feature-selection fit) should not be re-run after it
# without restarting from the top.
X_train=X_train[select_feat]

In [158]:
# Shape of the training matrix after feature selection.
X_train.shape

(1460, 28)

In [159]:
# Preview the selected training features.
X_train.head()

Unnamed: 0,1stFlrSF,Fireplaces,GarageCars,GrLivArea,MSSubClass,OverallQual,YearRemodAdd,BsmtExposure_Gd,BsmtExposure_No,BsmtFinType1_GLQ,...,KitchenQual_TA,LotShape_Reg,MSZoning_RL,MSZoning_RM,MasVnrType_None,Neighborhood_Edwards,Neighborhood_NridgHt,PavedDrive_Y,RoofStyle_Gable,SaleType_New
0,0.356155,0.0,0.5,0.577712,0.235294,0.666667,0.098361,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.503056,0.333333,0.5,0.470245,0.0,0.555556,0.52459,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,0.383441,0.333333,0.5,0.593095,0.235294,0.666667,0.114754,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.399941,0.333333,0.75,0.579157,0.294118,0.666667,0.606557,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0.466237,0.333333,0.75,0.666523,0.235294,0.777778,0.147541,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [162]:
# Fit a linear regression on the selected features. y_train is on the log scale
# here, so the model predicts log(SalePrice).
reg=LinearRegression()
reg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [164]:
# Keep the same selected columns in the test matrix, then predict.
# The predictions are on the log scale because the model was fitted on the log target.
X_test=X_test[select_feat]
y_pred=reg.predict(X_test)
y_pred

array([[11.52676355],
       [11.91930427],
       [12.10313505],
       ...,
       [11.95320932],
       [11.69057712],
       [12.39913649]])

In [165]:
# Undo the log transform that was applied to SalePrice before training.
# The predictions are recomputed from the model inside this cell (X_test is
# already restricted to the selected features), so re-running the cell never
# exponentiates an array that has already been back-transformed.
y_pred=np.exp(reg.predict(X_test))
y_pred

array([[101393.42770913],
       [150137.11732124],
       [180436.65501851],
       ...,
       [155314.8018302 ],
       [119440.91784603],
       [242592.0469473 ]])

In [170]:
# Build the submission frame: the Id column from the sample file next to the
# predicted prices.
sample_file=pd.read_csv('sample_submission.csv')
pred=pd.DataFrame(y_pred)

# The two parts are joined side by side on their row labels, so a length
# mismatch would silently create rows with NaN; fail loudly instead.
if len(pred)!=len(sample_file):
    raise ValueError('prediction count (%d) does not match sample_submission.csv rows (%d)'
                     %(len(pred),len(sample_file)))

final_df=pd.concat([sample_file['Id'],pred],axis=1)
final_df.columns=['Id','SalePrice']

In [171]:
# Inspect the submission frame (Id, predicted SalePrice).
final_df

Unnamed: 0,Id,SalePrice
0,1461,101393.427709
1,1462,150137.117321
2,1463,180436.655019
3,1464,191681.774387
4,1465,200137.827320
...,...,...
1454,2915,82336.331238
1455,2916,93404.718435
1456,2917,155314.801830
1457,2918,119440.917846


In [173]:
# Write the submission without the index.
# NOTE(review): this overwrites sample_submission.csv, the same file that is
# read above as the Id template - confirm this is intended, or write to a
# separate file so the template is preserved.
final_df.to_csv('sample_submission.csv',index=False)