In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt


In [3]:
train = pd.read_csv('bigmart_train.csv')

In [4]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
train.shape

(8523, 12)

In [6]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [7]:
train['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [8]:
train['Outlet_Establishment_Year'].unique()

array([1999, 2009, 1998, 1987, 1985, 2002, 2007, 1997, 2004], dtype=int64)

In [9]:
train.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [10]:
train['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [11]:
train['Outlet_Size'].mode()[0]

'Medium'

In [12]:
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])

In [13]:
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].mean())

In [14]:
train['Item_Visibility'].hist(bins=20)

<matplotlib.axes._subplots.AxesSubplot at 0x1f23c618dc8>

In [15]:
Q1 = train['Item_Visibility'].quantile(0.25)
Q3 = train['Item_Visibility'].quantile(0.75)
IQR = Q3-Q1
filt_train = train.query('(@Q1 - 1.5*@IQR) <= Item_Visibility <=(@Q3 + 1.5*@IQR)')

In [16]:
filt_train

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Medium,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,Medium,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [17]:
filt_train.shape, train.shape

((8379, 12), (8523, 12))

In [18]:
train = filt_train
train.shape

(8379, 12)

In [19]:
train['Item_Visibility_bins'] = pd.cut(train["Item_Visibility"], [0.000, 0.065, 0.13, 0.2], labels= ['Low viz', 'Viz', 'High viz'])

In [20]:
train['Item_Visibility_bins'].value_counts()

Low viz     4403
Viz         2557
High viz     893
Name: Item_Visibility_bins, dtype: int64

In [21]:
train['Item_Visibility_bins'] = train['Item_Visibility_bins'].replace(np.nan,'Low viz', regex= True)

In [22]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(['Low Fat','LF'], 'Low Fat')

In [23]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(['Reg','Regular'])

In [24]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Visibility_bins
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Low viz
1,DRC01,5.92,Low Fat,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Low viz
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Low viz
3,FDX07,19.2,Low Fat,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38,Low viz
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Low viz


In [25]:
le = LabelEncoder()

In [26]:
train['Item_Fat_Content'].unique()

array(['Low Fat', 'low fat', 'reg'], dtype=object)

In [28]:
train['Item_Fat_Content']=le.fit_transform(train['Item_Fat_Content'])

In [30]:
train['Item_Visibility_bins']=le.fit_transform(train['Item_Visibility_bins'])

In [32]:
train['Outlet_Size']=le.fit_transform(train['Outlet_Size'])

In [33]:
train['Outlet_Location_Type']=le.fit_transform(train['Outlet_Location_Type'])

In [34]:
dummy = pd.get_dummies(train['Outlet_Type'])
dummy.head()

Unnamed: 0,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,1,0,0,0
4,0,1,0,0


In [35]:
train = pd.concat([train,dummy], axis=1)

In [36]:
train.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content               int32
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                    int32
Outlet_Location_Type           int32
Outlet_Type                   object
Item_Outlet_Sales            float64
Item_Visibility_bins           int32
Grocery Store                  uint8
Supermarket Type1              uint8
Supermarket Type2              uint8
Supermarket Type3              uint8
dtype: object

In [46]:
train = train.drop(['Item_Identifier','Item_Type', 'Outlet_Identifier', 'Outlet_Type'], axis=1)

In [47]:
train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Item_Visibility_bins,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
0,9.3,0,0.016047,249.8092,1999,1,0,3735.138,1,0,1,0,0
1,5.92,0,0.019278,48.2692,2009,1,2,443.4228,1,0,0,1,0
2,17.5,0,0.01676,141.618,1999,1,0,2097.27,1,0,1,0,0
3,19.2,0,0.0,182.095,1998,1,2,732.38,1,1,0,0,0
4,8.93,0,0.0,53.8614,1987,0,2,994.7052,1,0,1,0,0


In [48]:
X  = train.drop('Item_Outlet_Sales', axis=1)
y = train.Item_Outlet_Sales

In [50]:
test = pd.read_csv('bigmart_test.csv')
test['Outlet_Size'] = test['Outlet_Size'].fillna('Medium')

In [52]:
test['Item_Visibility_bins'] = pd.cut(test['Item_Visibility'],[0.000, 0.065, 0.13, 0.2], labels=['Low Viz', 'Viz', 'High Viz']) 

In [53]:
test['Item_Weight'] = test['Item_Weight'].fillna(test['Item_Weight'].mean())

In [54]:
test["item_Visibility_bins"]=test["Item_Visibility_bins"].fillna('Low Viz')

In [56]:
test['Item_Visibility_bins'].head()

0    Low Viz
1    Low Viz
2        Viz
3    Low Viz
4        Viz
Name: Item_Visibility_bins, dtype: category
Categories (3, object): [Low Viz < Viz < High Viz]

In [58]:
dummy = pd.get_dummies(test['Outlet_Type'])
test = pd.concat([test, dummy], axis=1)

In [60]:
X_test = test.drop(['Item_Identifier', 'Item_Type', 'Outlet_Type', 'Outlet_Establishment_Year'], axis=1)

In [63]:
X.columns, X_test.columns

(Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
        'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
        'Item_Visibility_bins', 'Grocery Store', 'Supermarket Type1',
        'Supermarket Type2', 'Supermarket Type3'],
       dtype='object'),
 Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
        'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type',
        'Item_Visibility_bins', 'item_Visibility_bins', 'Grocery Store',
        'Supermarket Type1', 'Supermarket Type2', 'Supermarket Type3'],
       dtype='object'))

In [65]:
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.3, random_state=42)

In [66]:
lin = LinearRegression()

In [67]:
lin.fit(x_train,y_train)
print(lin.coef_)
lin.intercept_

[    3.27601041   -40.07387027  -178.84562144    15.96734791
     2.29416066    16.04697231     7.49037199    17.94537058
 -1754.09993117   218.96250497  -121.89414725  1657.03157344]


-4810.218335262213

In [71]:
predictions = lin.predict(x_test)
print(sqrt(mean_squared_error(y_test,predictions)))

1119.0705464158477


In [73]:
from sklearn.linear_model import Ridge
ridgereg = Ridge(alpha=0.001, normalize=True)
ridgereg.fit(x_train,y_train)
print(sqrt(mean_squared_error(y_train, ridgereg.predict(x_train))))


1139.5330283218061


In [74]:
print(sqrt(mean_squared_error(y_test, ridgereg.predict(x_test))))

1119.0140188497276


In [75]:
print('R2 value/ coefficient of determination: {}'. format(ridgereg.score(x_test,y_test)))

R2 value/ coefficient of determination: 0.5481312029453348


In [78]:
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(x_train, y_train)
print(sqrt(mean_squared_error(y_train, lassoreg.predict(x_train))))
print(sqrt(mean_squared_error(y_test, lassoreg.predict(x_test))))
print(sqrt(mean_squared_error(y_train, lassoreg.predict(x_train))))
print('R2 value/ coefficient of determination: {}'. format(lassoreg.score(x_test,y_test)))

1139.5323661360028
1119.0601002626388
1139.5323661360028
R2 value/ coefficient of determination: 0.5480939859292611


In [82]:
from sklearn.linear_model import ElasticNet
Elas = ElasticNet(alpha=0.001, normalize=True)
Elas.fit(x_train, y_train)
print(sqrt(mean_squared_error(y_train, Elas.predict(x_train))))
print(sqrt(mean_squared_error(y_test, Elas.predict(x_test))))
print('R2 value/ coefficient of determination: {}'. format(Elas.score(x_test,y_test)))

1477.9664961649275
1429.3817022032167
R2 value/ coefficient of determination: 0.2627110332140946
