In [1]:
import plotly
plotly.tools.set_credentials_file(username='', api_key='')

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

#read files
train = pd.read_csv("data/train_modified.csv")
test = pd.read_csv("data/test_modified.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 38 columns):
Item_Identifier             8523 non-null object
Item_MRP                    8523 non-null float64
Item_Outlet_Sales           8523 non-null float64
Item_Visibility             8523 non-null float64
Item_Weight                 8523 non-null float64
Outlet_Identifier           8523 non-null object
Item_Type_Avg_Sale          8523 non-null float64
Item_Type_Sale_Frequency    8523 non-null float64
Years_Open                  8523 non-null int64
Outlet                      8523 non-null int64
Item_Fat_Content_0          8523 non-null int64
Item_Fat_Content_1          8523 non-null int64
Item_Type_0                 8523 non-null int64
Item_Type_1                 8523 non-null int64
Item_Type_2                 8523 non-null int64
Item_Type_3                 8523 non-null int64
Item_Type_4                 8523 non-null int64
Item_Type_5                 8523 non-null int64
Item_Type_6    

# FEATURE SELECTION

#### Review highly correlated features 

In [4]:
corr_matrix = train.corr().abs()
#corr_matrix

In [5]:
data = [go.Heatmap(z=corr_matrix.values.tolist(), colorscale='Viridis')]
py.iplot(data, filename='feature_corr')

#### Remove features with correlation greater than 0.6

In [6]:
corr_var_60=np.where(corr_matrix>0.6)
corr_var_60=[(corr_matrix.columns[x],corr_matrix.columns[y]) for x,y in zip(*corr_var_60) if x!=y and x<y]
corr_var_60

[('Item_Type_Avg_Sale', 'Item_Type_Sale_Frequency'),
 ('Item_Type_Avg_Sale', 'Item_Type_12'),
 ('Outlet', 'Outlet_Location_Type_0'),
 ('Outlet', 'Outlet_Location_Type_2'),
 ('Item_Fat_Content_0', 'Item_Fat_Content_1'),
 ('Outlet_Location_Type_1', 'Outlet_Size_2'),
 ('Outlet_Location_Type_2', 'Outlet_Size_2'),
 ('Outlet_Location_Type_2', 'Outlet_Type_1'),
 ('Outlet_Size_1', 'Outlet_Size_2')]

In [7]:
train.drop(['Item_Type_Avg_Sale','Item_Fat_Content_0','Outlet_Size_1','Outlet_Size_2','Outlet_Location_Type_0','Outlet_Location_Type_1','Outlet_Location_Type_2'], axis=1, inplace=True)

In [8]:
corr_matrix2 = train.corr().abs()
corr_var_60=np.where(corr_matrix2>0.6)
corr_var_60=[(corr_matrix2.columns[x],corr_matrix2.columns[y]) for x,y in zip(*corr_var_60) if x!=y and x<y]
corr_var_60

[]

#### Review the relationship between each independent variable and Item_Outlet_Sales. If there is no correlation, remove the feature because there is no relationship between the changes in the independent variable and shifts Item_Outlet_Sales

In [9]:
target = 'Item_Outlet_Sales'
IDcol = ['Item_Identifier','Outlet_Identifier']
predictors = [x for x in train.columns if x not in [target]+IDcol]

X = train[predictors]
y = train[target]

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
est2.summary()

0,1,2,3
Dep. Variable:,Item_Outlet_Sales,R-squared:,0.563
Model:,OLS,Adj. R-squared:,0.561
Method:,Least Squares,F-statistic:,437.3
Date:,"Wed, 06 Mar 2019",Prob (F-statistic):,0.0
Time:,07:56:36,Log-Likelihood:,-71998.0
No. Observations:,8523,AIC:,144000.0
Df Residuals:,8497,BIC:,144200.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.2138,67.437,-0.137,0.891,-141.406,122.979
Item_MRP,15.5650,0.198,78.660,0.000,15.177,15.953
Item_Visibility,-206.1571,258.081,-0.799,0.424,-712.058,299.744
Item_Weight,0.2213,2.659,0.083,0.934,-4.990,5.433
Item_Type_Sale_Frequency,-4.4367,4.007,-1.107,0.268,-12.292,3.419
Years_Open,-1.9928,7.181,-0.278,0.781,-16.069,12.083
Outlet,-5.2154,13.066,-0.399,0.690,-30.827,20.396
Item_Fat_Content_1,40.3537,28.262,1.428,0.153,-15.046,95.753
Item_Type_0,-0.4238,43.637,-0.010,0.992,-85.963,85.115

0,1,2,3
Omnibus:,961.02,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2294.587
Skew:,0.667,Prob(JB):,0.0
Kurtosis:,5.164,Cond. No.,1.23e+16


# LINEAR REGRESSION MODEL

In [10]:
target = 'Item_Outlet_Sales'
IDs = ['Item_Identifier','Outlet_Identifier']
predictors = [x for x in train.columns if x not in [target]+IDs]

In [11]:
lreg = LinearRegression(normalize=True)

In [12]:
lreg.fit(train[predictors], train[target])
train_predictions = lreg.predict(train[predictors])
train_predictions


internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.



array([3948.,  524., 2308., ..., 1444., 1404., 1220.])

#### Cross Validation using neg_mean_squared_error

In [13]:
cv_score = cross_val_score(lreg, train[predictors], train[target], cv=20, scoring='neg_mean_squared_error')
cv_score = np.sqrt(np.abs(cv_score))
cv_score

array([1215.3183843 , 1121.21415393, 1140.4419186 , 1142.04583289,
       1141.06171357, 1105.15103434, 1092.12653589, 1143.34005522,
       1123.7040707 , 1115.64990499, 1084.9321737 , 1136.64505209,
       1153.06743385, 1078.48502928, 1085.34404603, 1193.61512484,
       1193.08883191, 1078.81554534, 1212.19833312, 1081.49390162])

In [14]:
#RMSE
np.sqrt(metrics.mean_squared_error(train[target].values, train_predictions))

1128.513462641595

In [15]:
df = pd.DataFrame(lreg.coef_, predictors)
df.columns = ['coef']
df.sort_values('coef', inplace=True)
df

Unnamed: 0,coef
Item_Type_6,-1.658177e+16
Item_Type_13,-1.612992e+16
Item_Type_9,-7827165000000000.0
Item_Type_5,-5530258000000000.0
Item_Type_4,-70394780000000.0
Item_Visibility,-163.8607
Outlet_Size_0,-17.90986
Outlet,-4.136823
Years_Open,-2.147117
Item_Weight,0.3472157


In [16]:
trace1 = go.Bar(
    x=df.index,
    y=df.coef,
    name='coef'
)
data = [trace1]
layout = go.Layout(
    barmode='group' #stack
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

In [17]:
#Adjusted R Squared
lreg.score(train[predictors], train[target])

0.5626269112598592

#### 56.26% of variance in sales is explained by the model

In [18]:
#Predict test data
test[target] = lreg.predict(test[predictors])

In [19]:
#Submission File
IDcol.append(target)
submission = pd.DataFrame({ x: test[x] for x in IDcol})
submission.to_csv('big_mart_sales_final.csv', index=False)