# Feature Selection: Advanced House Price Prediction
The main aim of this project is to predict house prices from a set of features, which are discussed as we go along.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# For Feature Selection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the prepared training data from disk and preview it.
TRAIN_CSV_PATH = 'x_train_final.csv'
df = pd.read_csv(TRAIN_CSV_PATH)
df

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontagenan,MasVnrAreanan,BsmtFullBathnan,BsmtHalfBathnan,GarageYrBltnan
0,812,11.881035,0.588235,0.333333,0.434909,0.240309,1.0,1.0,0.000000,0.333333,...,0.0,0.454545,0.50,0.333333,0.75,1.0,0.0,0.0,0.0,0.0
1,2430,12.001505,0.000000,0.666667,0.423859,0.362838,1.0,1.0,0.333333,0.333333,...,0.0,0.636364,0.25,0.666667,0.75,0.0,0.0,0.0,0.0,0.0
2,535,12.089539,0.235294,0.666667,0.466207,0.379899,1.0,1.0,0.333333,0.333333,...,0.0,0.818182,0.00,0.666667,0.75,0.0,0.0,0.0,0.0,0.0
3,2415,12.001505,0.411765,0.666667,0.445638,0.357532,1.0,1.0,0.000000,0.333333,...,0.0,1.000000,0.25,0.666667,0.75,0.0,0.0,0.0,0.0,0.0
4,480,11.401670,0.058824,0.333333,0.321097,0.296867,1.0,1.0,0.000000,0.000000,...,0.0,0.181818,0.25,0.666667,0.25,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,764,12.727838,0.235294,0.666667,0.504203,0.387820,1.0,1.0,0.000000,0.333333,...,0.0,0.545455,0.75,0.666667,0.75,0.0,0.0,0.0,0.0,0.0
1455,836,11.759786,0.000000,0.666667,0.388581,0.391317,1.0,1.0,0.000000,0.333333,...,0.0,0.090909,1.00,0.666667,0.75,0.0,0.0,0.0,0.0,0.0
1456,1654,12.001505,0.823529,0.666667,0.049425,0.112345,1.0,1.0,0.000000,0.333333,...,0.0,0.727273,0.75,0.666667,0.75,0.0,0.0,0.0,0.0,0.0
1457,2608,12.001505,0.000000,0.666667,0.394699,0.638722,1.0,1.0,0.333333,0.333333,...,0.0,0.363636,0.25,0.666667,0.75,0.0,0.0,0.0,0.0,0.0


In [3]:
# Remove every row that contains at least one missing (NaN) value.
df = df.dropna(axis=0, how='any')

In [4]:
# Shape of the frame after the NaN rows were dropped: (rows, columns)
df.shape

(1457, 86)

In [5]:
# Target (dependent) variable: the 'SalePrice' column, kept as a
# single-column DataFrame.
y_train = df.loc[:, ['SalePrice']]

In [6]:
## Predictors: every column except 'Id' and the target 'SalePrice'
x_train = df.drop(columns=['Id', 'SalePrice'])

In [7]:
## Feature selection with Lasso + SelectFromModel
# Step 1: build a Lasso regression model with a chosen alpha (the strength of
#         the penalty). The larger the alpha, the fewer features are selected.
# Step 2: wrap it in scikit-learn's SelectFromModel, which keeps the features
#         whose coefficients are non-zero.
lasso_estimator = Lasso(alpha=0.005, random_state=0)
feature_sel_model = SelectFromModel(lasso_estimator)

# Fitting the selector fits the Lasso on the training data; the weights the
# Lasso assigns to each feature are what SelectFromModel uses as importances.
feature_sel_model.fit(x_train, y_train)

In [8]:
feature_sel_model.get_support() # Boolean mask with one entry per independent feature (not a count)
# True means the feature was selected (important)
# False means the feature was not selected (not important)

array([False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True, False, False, False, False,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True, False,
       False, False,  True,  True, False,  True, False,  True, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

In [9]:
# Collect the names of the selected features:
# index the column labels with the selector's boolean mask.
support_mask = feature_sel_model.get_support()
selected_feature = x_train.columns[support_mask]
selected_feature

Index(['Neighborhood', 'YearRemodAdd', 'MasVnrType', 'BsmtExposure',
       'GrLivArea', 'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish',
       'GarageArea', 'GarageCond'],
      dtype='object')

In [10]:
# Summary statistics for the selection step
total_feature_count = x_train.shape[1]
selected_feature_count = len(selected_feature)
# Lasso shrinks the coefficients of unnecessary features to zero;
# count how many coefficients of the fitted estimator are exactly zero.
zero_coef_count = np.sum(feature_sel_model.estimator_.coef_ == 0)

print('Total Features: {}'.format(total_feature_count))
print('Selected Features: {}'.format(selected_feature_count))
print('Features with coefficient shrank to zero: {}'.format(zero_coef_count))

Total Features: 84
Selected Features: 11
Features with coefficient shrank to zero: 73


In [11]:
# Restrict the training predictors to the selected columns and preview them.
x_train = x_train.loc[:, selected_feature]
x_train

Unnamed: 0,Neighborhood,YearRemodAdd,MasVnrType,BsmtExposure,GrLivArea,KitchenQual,FireplaceQu,GarageType,GarageFinish,GarageArea,GarageCond
0,0.666667,0.096774,0.666667,1.00,0.329603,0.75,0.8,0.8,1.000000,0.282258,1.0
1,0.380952,0.661290,0.666667,0.50,0.370527,0.25,0.2,0.8,0.333333,0.193548,1.0
2,0.619048,0.064516,0.000000,0.75,0.510475,0.75,0.8,0.8,1.000000,0.270833,1.0
3,0.380952,0.822581,1.000000,0.25,0.545235,0.25,0.2,0.2,0.333333,0.325269,1.0
4,0.238095,0.145161,0.333333,0.25,0.431475,0.25,0.2,0.2,0.333333,0.451613,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1454,1.000000,0.193548,0.666667,0.50,0.692428,0.75,0.8,0.8,0.666667,0.575269,1.0
1455,0.285714,0.274194,0.000000,0.25,0.410869,0.75,0.2,0.8,0.333333,0.293011,1.0
1456,0.523810,0.564516,0.000000,0.25,0.542504,0.25,0.6,0.2,0.666667,0.295699,1.0
1457,0.428571,0.241935,0.000000,0.50,0.570608,0.75,0.6,0.8,0.666667,0.451613,1.0


In [None]:
# Next step: train candidate models on the selected-feature dataset (x_train) and choose the best one
