In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div class="text_cell_render border-box-sizing rendered_html">
<div style="color:white;
           display:fill;
           border-radius:18px;
           background-color:#298A08;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
<h1 style="text-align: center;
           padding: 10px;
              color:white">
House Prices - Regression with stepwise feature selection
</h1>
</div>
</div>

This dataset has a large number of features, so I will try a technique that adds and removes features according to a performance indicator (stepwise selection). It is not necessarily the optimal approach, since scikit-learn offers many powerful and effective models, which is probably why I haven't seen it in other notebooks yet.

<img src="https://livedoor.sp.blogimg.jp/anigei-mangabox/imgs/d/c/dc9966eb.png">

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
print(train_df.shape)
print(train_df.info())

---

# Check the distribution of target
Before proceeding with the regression, some assumptions need to be checked.

In [None]:
print(train_df['SalePrice'].describe())
print('mode:', train_df['SalePrice'].mode())

In [None]:
sns.distplot(train_df['SalePrice'])
plt.show()

In [None]:
log_SalePrice = np.log1p(train_df['SalePrice'])
sns.distplot(log_SalePrice)

---

# Correlation

In [None]:
# Pearson correlation is only defined for numeric columns. Selecting them
# explicitly keeps this working on pandas >= 2.0, where DataFrame.corr() no
# longer drops non-numeric columns silently.
corr = train_df.select_dtypes(include=[np.number]).corr()
# plt.subplots returns a (figure, axes) pair: unpack it and draw on that axes
# (the previous `f = ax = ...` bound the whole tuple to both names).
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, vmax=0.8, ax=ax)

plt.show()

In [None]:
plt.figure(figsize=(10, 10))
columns = corr.nlargest(10, 'SalePrice')['SalePrice'].index
corr_matrix = np.corrcoef(train_df[columns].values.T)
sns.set(font_scale = 1.25)
heat_map = sns.heatmap(corr_matrix, cbar=True, annot=True, square= True, fmt='.2f',
                      annot_kws={'size':10}, yticklabels=columns.values,
                      xticklabels=columns.values)
plt.show()

- direct correlation between several features 

- GarageCars: Size of garage in car capacity
- GarageArea: Size of garage in square feet

In [None]:
# scatterplot with GarageCars, GarageArea
fig, ax = plt.subplots()
sns.regplot(x=train_df['GarageCars'], y=train_df['GarageArea'], ax=ax)
# Compute the Pearson coefficient instead of hard-coding it, so the title
# cannot drift away from the data being plotted.
garage_corr = train_df['GarageCars'].corr(train_df['GarageArea'])
ax.set_title('corr:{:.2f}'.format(garage_corr))

- GrLivArea: Above grade (ground) living area square feet
- TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)

In [None]:
# scatterplot with GrLivArea, TotRmsAbvGrd
fig, ax = plt.subplots()
sns.regplot(x=train_df['GrLivArea'], y=train_df['TotRmsAbvGrd'], ax=ax)
# Compute the Pearson coefficient instead of hard-coding it, so the title
# cannot drift away from the data being plotted.
living_corr = train_df['GrLivArea'].corr(train_df['TotRmsAbvGrd'])
ax.set_title('corr:{:.2f}'.format(living_corr))

- 1stFlrSF: First Floor square feet
- TotalBsmtSF: Total square feet of basement area

In [None]:
# scatterplot with 1stFlrSF, TotalBsmtSF
fig, ax = plt.subplots()
sns.regplot(x=train_df['1stFlrSF'], y=train_df['TotalBsmtSF'], ax=ax)
# Compute the Pearson coefficient instead of hard-coding it, so the title
# cannot drift away from the data being plotted.
floor_corr = train_df['1stFlrSF'].corr(train_df['TotalBsmtSF'])
ax.set_title('corr:{:.2f}'.format(floor_corr))

---

# Fill missing data
Before imputing the NaN values, I noticed that quite a few columns contain them, so let's view only the columns that have NaN values.

In [None]:
def find_nan_cols(df):
    """Return the per-column NaN counts of ``df``, keeping only columns with at least one NaN.

    The result is a one-column DataFrame named ``nancount`` whose index holds
    the names of the affected columns of ``df``.
    """
    counts = df.isnull().sum().to_frame(name='nancount')
    has_missing = counts['nancount'] > 0
    return counts[has_missing]

In [None]:
train_null = find_nan_cols(train_df)
print('🚩 missing value in train_df -----------------------------')
print(train_null)
test_null = find_nan_cols(test_df)
print('🚩 missing value in test_df ------------------------------')
print(test_null)

In [None]:
# Forward-fill: every NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is the supported spelling of the deprecated fillna(method='pad').
train_df = train_df.ffill()
test_df = test_df.ffill()
# A NaN with no valid value above it (e.g. in the first row) cannot be
# forward-filled, so these totals may still be non-zero.
print(train_df.isnull().sum().sum())
print(test_df.isnull().sum().sum())

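⬆️ Some NaN values have not been removed: forward-filling cannot fill a value that has no valid entry above it (for example, when the first row of a column is missing).
- second try: list the columns that still contain NaN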

In [None]:
train_null = find_nan_cols(train_df)
test_null = find_nan_cols(test_df)
print('🚩 missing value in train_df -----------------------------')
print(train_null)
print('🚩 missing value in test_df ------------------------------')
print(test_null)

- remove the columns above

In [None]:
# Columns removed from both frames: the row identifier and the columns listed
# for removal after the second missing-value check.
cols_to_drop = ['Id','Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run after the columns are already gone.
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
print('NAN values in train_df:',train_df.isnull().sum().sum())
print('NAN values in test_df:',test_df.isnull().sum().sum())

---

# Categorical features

In [None]:
print('origint_train_df.shape()', train_df.shape)
# Request numeric indicator columns explicitly. pandas >= 2.0 defaults to bool
# dummies, which turn the mixed design matrix into object dtype and make
# statsmodels' OLS reject it; uint8 matches the pre-2.0 default.
train_df_ohe = pd.get_dummies(train_df, dtype=np.uint8)
print('train_df_ohe.shape()', train_df_ohe.shape)
train_df_ohe.head()

---

# ⛏️ Statsmodels ; Multiple Linear Regression

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

- To use OLS regression in Statsmodels, adding constants is necessary 
- $f(x)=w_0+w_1x_1+w_2x_2+\dots+w_px_p = [1\ x_1\ x_2\ \dots\ x_p]\,[w_0\ w_1\ w_2\ \dots\ w_p]^{\top}$

In [None]:
# add bias
train_sm = sm.add_constant(train_df_ohe, has_constant='add')
train_sm.head()

In [None]:
# Every column except the target is a feature (this includes the 'const' column).
feature_columns = list(train_sm.columns.difference(['SalePrice']))
X = train_sm[feature_columns]
y = train_sm['SalePrice']
# Fix the seed so the 70/30 split, and every AIC / error figure derived from it,
# is reproducible across "Restart & Run All".
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

- model with all 271 features 

In [None]:
full_model = sm.OLS(train_y, train_x)
fitted_full_model = full_model.fit()

In [None]:
# fitted_full_model.summary()
# R^2 = 0.938, AIC = 2.351e+04

In [None]:
# residual of train_sm
res = fitted_full_model.resid

# qqplot ; normality analysis of residual
fig = sm.qqplot(res, fit=True, line='45')

In [None]:
pred_y = fitted_full_model.predict(train_x)

# residual pattern ; check homoscedasticity
fig = plt.scatter(pred_y, res, s=4)
plt.xlabel('Fitted values')
plt.ylabel('Residual')

In [None]:
pred_y2 = fitted_full_model.predict(test_x)
# residual plot
plt.plot(np.array(test_y-pred_y2), label='pred_full_features')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
# MSE 
mse = mean_squared_error(test_y, pred_y2)
# RMSE
rmse = np.sqrt(mse)
print('mse:', mse)
print('rmse:', rmse)

---

# ⛏️ Feature Selection (OLS Linear Regression, Stepwise)

Here AIC is used as the performance indicator: it penalizes the model as the number of unnecessary parameters (X variables) increases, so a lower AIC indicates a better trade-off between goodness of fit and model complexity.

In [None]:
# return AIC
def processSubset(X, y, feature_set):
    """Fit an OLS regression of ``y`` on the columns of ``X`` named in ``feature_set``.

    No intercept is added here; callers include 'const' in ``feature_set``
    when they want one.

    Returns a dict with the fitted results under 'model' and its AIC under 'AIC'.
    """
    design = X[list(feature_set)]
    fitted = sm.OLS(y, design).fit()
    return {'model': fitted, 'AIC': fitted.aic}

print(processSubset(X=train_x, y=train_y, feature_set=feature_columns[0:5]))

In [None]:
import itertools

# return the lowest AIC
def getBest(X, y, k):
    """Exhaustively fit every ``k``-feature model (plus 'const') and return the lowest-AIC one.

    Candidate features are all columns of ``X`` other than 'const'. The return
    value is the row (a Series with 'model' and 'AIC') of the best fit.
    """
    candidates = X.columns.difference(['const'])
    fits = [
        processSubset(X, y, feature_set=list(subset) + ['const'])
        for subset in itertools.combinations(candidates, k)
    ]
    table = pd.DataFrame(fits)
    winner = table.loc[table['AIC'].argmin()]
    print('Processed', table.shape[0], 'models on', k)

    return winner

print(getBest(train_x, train_y, k=2))

In [None]:
# considering k combinations of variables
# for combo in itertools.combinations(X.columns.difference(['const']), k):
#        combo = (list(combo)+['const'])

In [None]:
# print(getBest(train_x, train_y, k=10))

In [None]:
# step for forward selection 
def forward(X, y, predictors):
    """Run one forward-selection step.

    Each column of ``X`` that is neither 'const' nor already in ``predictors``
    is added in turn to ``predictors`` (together with 'const'), the model is
    fitted, and the fit with the lowest AIC is kept.

    Parameters
    ----------
    X, y : design matrix (must contain a 'const' column) and target.
    predictors : list of column names already selected.

    Returns
    -------
    pandas.Series with entries 'model' (fitted results) and 'AIC'.

    Raises
    ------
    KeyError if no candidate column remains (the results table is then empty).
    """
    remaining_predictors = [p for p in X.columns.difference(['const']) if p not in predictors]
    results = [
        processSubset(X, y, feature_set=predictors + [p] + ['const'])
        for p in remaining_predictors
    ]
    models = pd.DataFrame(results)

    # argmin() returns a position, so index positionally.
    best_model = models.iloc[models['AIC'].argmin()]
    # Look the AIC up by label: positional best_model[0] is the 'model' entry,
    # so the old message printed the results object in place of the AIC.
    print('Selected predictors:', best_model['model'].model.exog_names, 'AIC:', best_model['AIC'])
    return best_model

In [None]:
# forward selection
def forward_model(X,y):
    """Forward selection: keep adding the best single feature while AIC does not increase.

    Starts from no predictors, calls ``forward`` once per step, and stops at
    the first step whose best candidate has a higher AIC than the previously
    accepted model. Returns the fitted results of the last accepted step.
    """
    history = pd.DataFrame(columns=["AIC", "model"])
    chosen = []
    n_candidates = len(X.columns.difference(['const']))

    step = 1
    while step <= n_candidates:
        candidate = forward(X=X, y=y, predictors=chosen)
        # Stop as soon as adding a variable makes the AIC go up.
        if step > 1 and candidate['AIC'] > previous_aic:
            break
        history.loc[step] = candidate
        accepted = history.loc[step]
        previous_aic = accepted["AIC"]
        # exog_names includes the intercept; keep only real predictors.
        chosen = [name for name in accepted["model"].model.exog_names if name != 'const']
        step += 1

    # Rows are labelled 1..n, so the label equal to the row count is the last accepted step.
    return(history['model'][len(history['model'])])

In [None]:
Forward_best_model = forward_model(X=train_x, y= train_y)

In [None]:
Forward_best_model.aic

In [None]:
fitted_full_model.aic

In [None]:
print(fitted_full_model.params.shape)
print(Forward_best_model.params.shape)

In [None]:
pred_y_full = fitted_full_model.predict(test_x) # with all 271 features
pred_y_forward = Forward_best_model.predict(test_x[Forward_best_model.model.exog_names]) # with selected features by forward selection 

In [None]:
# selected features
Forward_best_model.model.exog_names

---

In [None]:
perf_mat = pd.DataFrame(columns=['All', 'Forward'], index=['MSE', 'RMSE', 'MAPE'])

from sklearn import metrics
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``, in percent.

    Both inputs are converted to NumPy arrays; the error of each element is
    taken relative to the corresponding true value.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

In [None]:
# Fill the metrics table for the full model and the forward-selected model.
# Use single-step .loc[row, column] assignment: chained indexing
# (perf_mat.loc[row][column] = ...) assigns into a temporary row object and,
# under pandas Copy-on-Write, never reaches perf_mat.
predictions = {'All': pred_y_full, 'Forward': pred_y_forward}
for label, pred in predictions.items():
    mse = metrics.mean_squared_error(test_y, pred)
    perf_mat.loc['MSE', label] = mse
    perf_mat.loc['RMSE', label] = np.sqrt(mse)
    perf_mat.loc['MAPE', label] = mean_absolute_percentage_error(test_y, pred)

print(perf_mat)

---

To use stepwise feature selection, go through the steps below: a backward-elimination step is defined first and then combined with the forward step.

In [None]:
 def backward(X,y,predictors):
     """Run one backward-elimination step.

     Every subset of ``predictors`` obtained by leaving out exactly one name is
     fitted together with 'const', and the fit with the lowest AIC is kept.

     Parameters
     ----------
     X, y : design matrix (must contain a 'const' column) and target.
     predictors : currently selected column names (without 'const').

     Returns
     -------
     pandas.Series with entries 'model' (fitted results) and 'AIC'.
     """
     results = [
         processSubset(X=X, y=y, feature_set=list(combo) + ['const'])
         for combo in itertools.combinations(predictors, len(predictors) - 1)
     ]
     models = pd.DataFrame(results)

     # argmin() returns a position, so index positionally.
     best_model = models.iloc[models['AIC'].argmin()]
     # Look the AIC up by label: positional best_model[0] is the 'model' entry,
     # so the old message printed the results object in place of the AIC.
     print('Selected predictors:',best_model['model'].model.exog_names,' AIC:',best_model['AIC'] )
     return best_model

In [None]:
def Stepwise_model(X,y):
    """Stepwise (forward + backward) feature selection driven by AIC.

    Starting from the intercept-only model, each iteration:
      1. adds the single best feature (``forward``);
      2. checks whether dropping one feature from that enlarged set gives an
         even lower AIC (``backward``) and, if so, prefers that model;
      3. accepts the candidate only if it strictly improves on the best AIC
         seen so far, otherwise stops.

    The previous version stored the candidate before testing it, so the model
    that triggered the stop (a worse one) was the one returned. After a
    backward step it also compared the AIC with itself, so it could never stop
    there. Both are fixed by comparing against the last accepted AIC before
    accepting.

    Parameters
    ----------
    X, y : design matrix (must contain a 'const' column) and target.

    Returns
    -------
    The fitted results object of the best model found. This is the
    intercept-only model when no feature improves the AIC.
    """
    predictors = []
    baseline = processSubset(X, y, predictors + ['const'])
    best_fit = baseline['model']
    Smodel_before = baseline['AIC']  # AIC of the last accepted model

    for _ in range(len(X.columns.difference(['const']))):
        Forward_result = forward(X=X, y=y, predictors=predictors)  # constant added
        print('forward')
        candidate = Forward_result
        grown = [k for k in Forward_result['model'].model.exog_names if k != 'const']

        # Check if there is anything to remove from the enlarged set.
        Backward_result = backward(X=X, y=y, predictors=grown)
        if Backward_result['AIC'] < Forward_result['AIC']:
            candidate = Backward_result
            print('backward')

        # '>=' also stops when backward merely undoes the forward step (equal
        # AIC), which would otherwise repeat the same add/remove cycle.
        if candidate['AIC'] >= Smodel_before:
            break

        best_fit = candidate['model']
        Smodel_before = candidate['AIC']
        predictors = [k for k in best_fit.model.exog_names if k != 'const']

    return best_fit

In [None]:
# Stepwise_best_model = Stepwise_model(X=train_x, y=train_y)