In [None]:
import os
# NOTE(review): '???' is a placeholder, not a usable path -- replace it with the folder
# that contains big_mart_sales.csv (read by relative path further down) before running.
os.chdir('???')
os.getcwd()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Load the raw data set (path is relative to the current working directory) and preview the first rows
orig_df = pd.read_csv("big_mart_sales.csv")
orig_df.head()

In [None]:
# (number of rows, number of columns) of the raw frame
orig_df.shape

In [None]:
# Data type of each raw column; non-numeric columns are dummy-encoded further down
orig_df.dtypes

In [None]:
# Distinct values found in Outlet_Size
orig_df.Outlet_Size.unique()

In [None]:
# Distinct values found in Item_Fat_Content
# NOTE(review): the dummy columns listed in input_vars show five spellings
# (LF, Low Fat, low fat, Regular, reg) -- presumably only two real categories; confirm.
orig_df.Item_Fat_Content.unique()

In [None]:
# The raw data has 11 features (variables) and 1 output ('Item_Outlet_Sales').
# Categorical columns are turned into numerical indicator columns with pd.get_dummies,
# which increases the number of features.
# Example: a column `gender` with values {male, female} becomes two 0/1 columns,
#          gender_male and gender_female.
#
# 'Item_Identifier' and 'Outlet_Identifier' are IDs with many unique values; encoding them
# would create too many indicator columns, so both are dropped before encoding.

df = orig_df.copy()
id_columns = ['Item_Identifier', 'Outlet_Identifier']
temp_df = df.drop(columns=id_columns)

dummy_df = pd.get_dummies(temp_df)
dummy_df.columns
# NOTE(review): input_vars lists 35 feature columns; dummy_df also carries the
# Item_Outlet_Sales output column -- confirm the total column count against the data.

In [None]:
# Column names after dummy encoding (source of the input_vars list below)
dummy_df.columns

In [None]:
# Names of the model input features: every dummy-encoded column except the output
# 'Item_Outlet_Sales'. The list is hard-coded, so it must match dummy_df.columns exactly.
input_vars = ['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Item_Fat_Content_LF',
       'Item_Fat_Content_Low Fat', 'Item_Fat_Content_Regular',
       'Item_Fat_Content_low fat', 'Item_Fat_Content_reg',
       'Item_Type_Baking Goods', 'Item_Type_Breads', 'Item_Type_Breakfast',
       'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods', 'Outlet_Size_High',
       'Outlet_Size_Medium', 'Outlet_Size_Small',
       'Outlet_Location_Type_Tier 1', 'Outlet_Location_Type_Tier 2',
       'Outlet_Location_Type_Tier 3', 'Outlet_Type_Grocery Store',
       'Outlet_Type_Supermarket Type1', 'Outlet_Type_Supermarket Type2',
       'Outlet_Type_Supermarket Type3']
# Number of input features
len(input_vars)

In [None]:
# Baseline: linear regression on all 35 original independent variables,
# using only the rows without missing values.

df = dummy_df.dropna()
X, y = df[input_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

# Filter Methods

## Missing Value Ratio

In [None]:
# Use a heatmap of df.isnull() to visualize where the missing (null) values are.
# Work on a fresh copy so dummy_df itself stays untouched.
df=dummy_df.copy()
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# Percentage of missing values for each column (reused by the missing-ratio threshold filter further down)
null_percent = df.isnull().sum()/len(df)*100
null_percent

In [None]:
# Replace missing Item_Weight values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# selection df['Item_Weight'] acts on an intermediate object, which pandas warns about
# and which leaves `df` unchanged when copy-on-write is enabled.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

In [None]:
# After replacing missing values, re-check the percentage of missing data per column
df.isnull().sum()/len(df)*100

In [None]:
## Linear regression with the 35 original independent features,
##         where missing values in Item_Weight have been replaced with its mean

X, y = df[input_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

In [None]:
# Instead of replacing missing values, drop every feature whose %missing exceeds a threshold.
# The threshold is acceptable_missing_ratio = 15 (with a higher value Item_Weight would not be dropped).
#
# input_vars   = list of the 35 original features
# updated_vars = features whose %missing <= acceptable_missing_ratio

df = dummy_df.copy()
acceptable_missing_ratio = 15

# Recompute %missing from this frame and look each feature up BY NAME.
# A positional lookup (null_percent[i]) is unsafe: the frame also holds the
# Item_Outlet_Sales output column, so positions in null_percent need not line up
# with positions in input_vars.
null_percent = df.isnull().sum() / len(df) * 100
updated_vars = [var for var in input_vars if null_percent[var] <= acceptable_missing_ratio]
updated_vars

In [None]:
len(updated_vars) 
# Since one column (Item_Weight) has more than 15% missing data, the number of features decreases from 35 to 34.

In [None]:
## Linear regression with the 34 remaining independent variables
##         (Item_Weight, whose %missing is above the threshold, has been dropped)

X, y = df[updated_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

## Low Variance Filtering

In [None]:
# Compute the variance of every column, starting again from a fresh copy of dummy_df
df = dummy_df.copy()
df.var()

In [None]:
# After scanning the variances above, set the minimum variance threshold to 10.
# The threshold can be set to any value.
#
# input_vars   = list of the 35 original features
# updated_vars = features whose variance >= min_var_threshold

min_var_threshold = 10

# Compute the variances once (not once per loop iteration) and pair every value
# with its feature name instead of relying on positional Series indexing.
feature_variances = df[input_vars].var()
updated_vars = [
    name
    for name, variance in feature_variances.items()
    if variance >= min_var_threshold
]
updated_vars

In [None]:
# Build a new data frame holding the features that passed the variance threshold
# (3 features in the original run) together with the output column.
# All columns are selected in one step and explicitly copied, so no column is ever
# assigned into a slice of `df` (which triggers SettingWithCopyWarning).

df2 = df[updated_vars + ['Item_Outlet_Sales']].copy()
df2.head()

In [None]:
## Linear regression with the features whose variance >= minimum variance threshold

df2 = df2.dropna()
X, y = df2[updated_vars], df2['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

## High Correlation Filtering

In [None]:
# Plot the pairwise correlation matrix of all columns in dummy_df (features and output) as a heatmap
df = dummy_df.copy()
plt.figure(figsize=(15,8))
sns.heatmap(df.corr(),
            square=True,
            linewidths=0.25,    
            linecolor=(0,0,0),
            cmap=sns.color_palette("coolwarm"),
            annot=False)

In [None]:
## Reference run: linear regression with the 35 original independent variables

df = df.dropna()
X, y = df[input_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

In [None]:
# Shape of the frame after rows with missing values were dropped
df.shape

In [None]:
# Looking at the correlation matrix, Item_Fat_Content_Low Fat and
# Item_Fat_Content_Regular are highly correlated, so one of them is dropped.
#
# `columns=` is passed by keyword: pandas 2.x no longer accepts `axis` as a second
# positional argument. The result is reassigned rather than dropped in place.

df = df.drop(columns=['Item_Fat_Content_Low Fat'])
df.shape

In [None]:
# input_vars   = list of the 35 original features (left unchanged)
# updated_vars = the same features without Item_Fat_Content_Low Fat

updated_vars = [var for var in input_vars if var != 'Item_Fat_Content_Low Fat']
len(updated_vars)

In [None]:
## Linear regression with 34 of the original independent variables,
##         after dropping one of the 2 highly collinear features
##         'Item_Fat_Content_Low Fat' & 'Item_Fat_Content_Regular'

df = df.dropna()
X, y = df[updated_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

# After dropping one feature, model performance is improved.
# For comparison, using all 35 original features -> [1191.4341205493079, 0.4132916031409223]

In [None]:
# Let's try further:
# Item_MRP is highly correlated with the output Item_Outlet_Sales, so drop Item_MRP
# and see how the performance changes.
#
# `columns=` is passed by keyword: pandas 2.x no longer accepts `axis` as a second
# positional argument. drop() returns a new frame, so dummy_df is not modified.

df = dummy_df.drop(columns=['Item_MRP'])
df.shape

In [None]:
# updated_vars = the original features without Item_MRP (input_vars itself is left unchanged)
updated_vars = [var for var in input_vars if var != 'Item_MRP']
len(updated_vars)

In [None]:
## Linear regression with 34 of the original independent variables,
##         after dropping the feature with the highest correlation with the output (Item_MRP)

df = df.dropna()
X, y = df[updated_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

# As you can see, when the feature that is highly correlated with the output is dropped,
#                 the performance drops significantly

# Wrapper Methods

## Forward Feature Selection

In [None]:
# Start from the encoded data and keep only rows without missing values
# (dropna returns a new frame, so dummy_df is left untouched)
df = dummy_df.dropna()

In [None]:
# f_regression receives the input features and the output
#              and returns an F-statistic and a p-value for each feature
# The higher the F-statistic (the lower the p-value), the stronger the evidence of a
# linear relationship between that feature and the output
# NOTE(review): each feature is scored on its own, so this looks like a univariate
#               filter rather than a stepwise forward search -- confirm the section title

from sklearn.feature_selection import f_regression
fstat, pval = f_regression(df[input_vars], df.Item_Outlet_Sales)
fstat

In [None]:
# p-values returned by f_regression, in the same order as input_vars
pval

In [None]:
# Set a threshold to select a subset of features based on the F-test statistic.
#
# input_vars   = list of the 35 original features
# updated_vars = features whose F-statistic >= f_value_threshold

f_value_threshold = 10  # can adjust this value

# fstat holds one value per entry of input_vars, in the same order. Pairing them with
# zip checks EVERY feature; an index loop over range(len(input_vars) - 1) would
# silently skip the last one.
updated_vars = [
    name
    for name, f_value in zip(input_vars, fstat)
    if f_value >= f_value_threshold
]
updated_vars

In [None]:
## Linear regression with the features selected by the F-test statistic
## (12 features in the original run)

X, y = df[updated_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

## Recursive Feature Elimination

In [None]:
# Start from the encoded data and keep only rows without missing values
# (dropna returns a new frame, so dummy_df is left untouched)
df = dummy_df.dropna()

In [None]:
# RFE receives an estimator and the number of features to select (which can be adjusted).
# rfe.fit receives the input features and the output.

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

X = df[input_vars]
y = df['Item_Outlet_Sales']
n_features = 10  # define by yourself to select important features

# n_features_to_select must be passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(LinearRegression(), n_features_to_select=n_features)
model = rfe.fit(X, y)

In [None]:
# RFE marks the features it selected with rank 1 (one rank per entry of input_vars)
model.ranking_

In [None]:
# Coefficients of the regression model held by the fitted RFE object (model.estimator_)
model.estimator_.coef_

In [None]:
# input_vars   = list of the 35 original features
# updated_vars = features selected by recursive feature elimination (rank == 1)

# model.ranking_ holds one rank per entry of input_vars, in the same order. Pairing
# them with zip checks EVERY feature; an index loop over range(len(input_vars) - 1)
# would silently skip the last one.
updated_vars = [
    name
    for name, rank in zip(input_vars, model.ranking_)
    if rank == 1
]
updated_vars

In [None]:
## Linear regression with the features selected by recursive feature elimination

X, y = df[updated_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

# R2 using the features selected by RFE is quite low.
# With this data, about 27 features need to be selected to get R2 > 0.5

In [None]:
# If you don't need to know which features RFE selected,
#     you can use the fitted RFE object to predict directly.

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

X = df[input_vars]
y = df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)
n_features = 10  # define by yourself to select important features

# n_features_to_select must be passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(LinearRegression(), n_features_to_select=n_features)
model = rfe.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

# Embedded Methods

## Regression & Lasso (L1)

In [None]:
# Complete rows only, all 35 features; this split is shared by the regression and Lasso cells below
df = dummy_df.dropna()
X, y = df[input_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)

In [None]:
# Plain linear regression with the original 35 independent variables
# (its coefficients in lm.coef_ are compared against the penalised models further down)

lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

In [None]:
from sklearn.linear_model import Lasso

## Regression with an embedded Lasso (L1) penalty
## alpha (weight of the penalty term) = 1

# fit() returns the estimator itself, so `lasso` and `model` name the same fitted object
model = lasso = Lasso(alpha=1).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

In [None]:
## Look at the coefficients and the intercept of the fitted Lasso model (alpha = 1)

[model.coef_, model.intercept_]

In [None]:
##  Keep the features with non-zero coefficients, ordered by coefficient from min to max.
##  np.argsort returns the positions that would sort the coefficient array.
##  Result: coef        = non-zero coefficients, sorted from min to max
##          update_vars = names of the features corresponding to coef

index = np.argsort(model.coef_)
nonzero_positions = [pos for pos in index if model.coef_[pos] != 0]
coef = [model.coef_[pos] for pos in nonzero_positions]
update_vars = [input_vars[pos] for pos in nonzero_positions]
coef

In [None]:
# Feature names matching the sorted non-zero coefficients
update_vars

In [None]:
## Find out how many features are left after performing Lasso with alpha = 1

len(update_vars)

In [None]:
## Run another regression with Lasso, alpha = 10

lasso2 = Lasso(alpha=10)
model2 = lasso2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

# Both metrics must be computed from y_pred2 (this model's predictions);
# using y_pred would report the RMSE of the alpha = 1 model instead.
# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred2)), metrics.r2_score(y_test, y_pred2)]

In [None]:
# Coefficients and intercept of the Lasso model fitted with alpha = 10
[model2.coef_, model2.intercept_]

In [None]:
##  Keep the features with non-zero coefficients, ordered by coefficient from min to max
##  (same procedure as for alpha = 1, applied to model2)

index = np.argsort(model2.coef_)
nonzero_positions = [pos for pos in index if model2.coef_[pos] != 0]
coef2 = [model2.coef_[pos] for pos in nonzero_positions]
update_vars2 = [input_vars[pos] for pos in nonzero_positions]
coef2

In [None]:
# Feature names matching the sorted non-zero coefficients of model2
update_vars2

In [None]:
## Find out how many features are left after performing Lasso with alpha = 10

len(update_vars2)

## Since a larger alpha is used, the penalty term has more weight,
## so more coefficients of the regression model will be zero.

In [None]:
## Plot the coefficients of: linear regression (lm), Lasso with alpha=1 (model), Lasso with alpha=10 (model2)
## Note: the alpha argument of plt.plot is the marker transparency (it has nothing to do with alpha in Lasso)
## The x positions are the feature names in input_vars, so each model's coefficient for a feature lines up vertically

plt.plot(input_vars,lm.coef_,alpha=0.5,linestyle='none',marker='*',markersize=10,color='red',label=r'Linear Regression',zorder=7) # zorder for ordering the markers
plt.plot(input_vars,model.coef_,alpha=0.5,linestyle='none',marker='o',markersize=10,color='blue',label=r'Lasso; $\alpha = 1$') # alpha here is for transparency
plt.plot(input_vars,model2.coef_,alpha=0.5,linestyle='none',marker='d',markersize=15,color='green',label=r'Lasso; $\alpha = 10$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc='lower left')
plt.xticks(input_vars, input_vars, rotation='vertical')
plt.show()

## Regression & Ridge (L2)

In [None]:
from sklearn.linear_model import Ridge

## Regression with a Ridge (L2) penalty
## alpha (weight of the penalty term) = 1

X, y = df[input_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)

# fit() returns the estimator itself, so `ridge` and `model` name the same fitted object
model = ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]

In [None]:
# Coefficients and intercept of the Ridge model fitted with alpha = 1
[model.coef_, model.intercept_]

In [None]:
##  Keep the features with non-zero coefficients, ordered by coefficient from min to max.
##  np.argsort returns the positions that would sort the coefficient array.
##  Result: coef        = non-zero coefficients, sorted from min to max
##          update_vars = names of the features corresponding to coef

index = np.argsort(model.coef_)
nonzero_positions = [pos for pos in index if model.coef_[pos] != 0]
coef = [model.coef_[pos] for pos in nonzero_positions]
update_vars = [input_vars[pos] for pos in nonzero_positions]
coef

In [None]:
# Feature names matching the sorted non-zero Ridge coefficients
update_vars

In [None]:
## Find out how many features are left after performing Ridge with alpha = 1

len(update_vars)

In [None]:
## Run another regression with Ridge, alpha = 10
## A new Ridge object is created here, so `model` (alpha = 1) is a separate object and stays as fitted

ridge = Ridge(alpha=10)
model2 = ridge.fit(X_train, y_train)
y_pred = model2.predict(X_test)
# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test,y_pred)),metrics.r2_score(y_test,y_pred)]

In [None]:
# Coefficients and intercept of the Ridge model fitted with alpha = 10
[model2.coef_, model2.intercept_]

In [None]:
## Count the features that keep a non-zero coefficient under Ridge with alpha = 10.
## update_vars2 is rebuilt from the current model2 (the Ridge fit) before counting;
## without this, the name still refers to the list computed earlier for Lasso.

index = np.argsort(model2.coef_)
nonzero_positions = [pos for pos in index if model2.coef_[pos] != 0]
coef2 = [model2.coef_[pos] for pos in nonzero_positions]
update_vars2 = [input_vars[pos] for pos in nonzero_positions]
len(update_vars2)

## In the original run, Ridge with alpha = 10 kept 34 features, the same as alpha = 1.
## Hence, regression with a Ridge penalty does not help much with feature selection.

In [None]:
## Plot the coefficients of: linear regression (lm), Ridge with alpha=1 (model), Ridge with alpha=10 (model2)
## Note: the alpha argument of plt.plot is the marker transparency (it has nothing to do with alpha in Ridge)
## The x positions are the feature names in input_vars, so each model's coefficient for a feature lines up vertically

plt.plot(input_vars,lm.coef_,alpha=0.5,linestyle='none',marker='*',markersize=10,color='red',label=r'Linear Regression',zorder=7) # zorder for ordering the markers
plt.plot(input_vars,model.coef_,alpha=0.5,linestyle='none',marker='o',markersize=10,color='blue',label=r'Ridge; $\alpha = 1$') # alpha here is for transparency
plt.plot(input_vars,model2.coef_,alpha=0.5,linestyle='none',marker='d',markersize=15,color='green',label=r'Ridge; $\alpha = 10$') # alpha here is for transparency
plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc='lower left')
plt.xticks(input_vars, input_vars, rotation='vertical')
plt.show()

## Random Forest

In [None]:
# Start from the encoded data and keep only rows without missing values
# (dropna returns a new frame, so dummy_df is left untouched)
df = dummy_df.dropna()

In [None]:
# RandomForestRegressor receives random-forest parameters such as depth of tree and number of trees.
# n_features = number of top-ranked features to keep and plot.

from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the forest -- and therefore the importance ranking and the features
# selected from it -- is the same on every run. 99 matches the random_state used for
# the train/test splits in this notebook.
model = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=99)
model.fit(df[input_vars], df.Item_Outlet_Sales)

n_features = 15

# Plot feature importance
features = df[input_vars].columns
importances = model.feature_importances_
# Positions of the n_features largest importances, in ascending order of importance
indices = np.argsort(importances)[-n_features:]
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

# Choose features with large feature importance

In [None]:
# input_vars   = list of the 35 original features
# updated_vars = the features selected from the random-forest feature importance
#                (15 with the n_features value used above), most important first

# `indices` is ordered from least to most important, so reversing the names puts the
# most important feature first. Reversing with a slice also works when fewer than
# n_features features are available, where a fixed-length negative-index loop would
# raise IndexError.
important_features = [features[i] for i in indices]
updated_vars = important_features[::-1]
updated_vars

In [None]:
## Linear regression with the features selected from the random-forest feature importance

X, y = df[updated_vars], df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=99
)
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

# Hold-out [RMSE, R^2]
[np.sqrt(metrics.mean_squared_error(y_test, y_pred)), metrics.r2_score(y_test, y_pred)]