In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
import seaborn as sns

In [2]:
train = pd.read_csv('https://raw.githubusercontent.com/Raviputran/Raviputran/main/train_Big%20Mart.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Raviputran/Raviputran/main/test_Big%20Mart.csv')

In [3]:
Data_types_train = train.dtypes
print(Data_types_train)

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object


In [4]:
Data_types_test = test.dtypes
print(Data_types_test)

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object


In [5]:
# To find the number of NaN values in train dataset columnwise
train.isna().sum()/len(train)*100

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64

In [6]:
# To find the number of NaN values in test dataset columnwise
test.isna().sum()/len(test)*100

Item_Identifier               0.000000
Item_Weight                  17.180074
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.269671
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
dtype: float64

In [7]:
# Finding 1:
# Same imputing method can be applied for both train and test datasets since the percentage of missing values in train and test
# are almost equal
# Finding 2:
# There could be 7 text columns Item_Identifier, Item_Fat_Content, Item_Visibility, Item_Type, Outlet_Identifier, Outlet_Size,
# Outlet_Location_Type, and Outlet_Type in the datasets. Need to check using value_counts to count unique values of a column.
# Syntax for value_counts is dataframe_name.column_name.value_counts()

In [8]:
# Applying value_counts for Item_Identifier
train.Item_Identifier.value_counts()

FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64

In [9]:
# Applying value_counts for Item_Fat_Content
train.Item_Fat_Content.value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [10]:
test.Item_Identifier.value_counts()

DRF48    8
FDZ50    8
FDQ60    8
FDN52    8
FDW10    8
        ..
FDW49    1
FDG38    1
FDD38    1
FDA15    1
FDF04    1
Name: Item_Identifier, Length: 1543, dtype: int64

In [11]:
# Low Fat, LF, low fat need to be denoted by a single entity
# Regular, reg need to be denoted by a single entity
# replace() replaces with the most used value. Syntax: train['Item_Fat_Content'].replace()
train = train.replace(['LF', 'low fat'],'Low Fat')
train = train.replace('reg','Regular')

In [12]:
print(train.Item_Fat_Content)

0       Low Fat
1       Regular
2       Low Fat
3       Regular
4       Low Fat
         ...   
8518    Low Fat
8519    Regular
8520    Low Fat
8521    Regular
8522    Low Fat
Name: Item_Fat_Content, Length: 8523, dtype: object


In [13]:
train.Item_Fat_Content.value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [14]:
test = test.replace(['LF', 'low fat'],'Low Fat')
test = test.replace('reg','Regular')

In [15]:
test.Item_Fat_Content.value_counts()

Low Fat    3668
Regular    2013
Name: Item_Fat_Content, dtype: int64

In [16]:
# Applying value_counts for Item_Visibility
train.Item_Visibility.value_counts()

0.000000    526
0.076975      3
0.162462      2
0.076841      2
0.073562      2
           ... 
0.013957      1
0.110460      1
0.124646      1
0.054142      1
0.044878      1
Name: Item_Visibility, Length: 7880, dtype: int64

In [17]:
test.Item_Visibility.value_counts()

0.000000    353
0.060879      2
0.097213      2
0.058461      2
0.058132      2
           ... 
0.082382      1
0.026175      1
0.155144      1
0.069557      1
0.104720      1
Name: Item_Visibility, Length: 5277, dtype: int64

In [18]:
# Applying value_counts for Item_Type
train.Item_Type.value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [19]:
test.Item_Type.value_counts()

Snack Foods              789
Fruits and Vegetables    781
Household                638
Frozen Foods             570
Dairy                    454
Baking Goods             438
Canned                   435
Health and Hygiene       338
Meat                     311
Soft Drinks              281
Breads                   165
Hard Drinks              148
Starchy Foods            121
Others                   111
Breakfast                 76
Seafood                   25
Name: Item_Type, dtype: int64

In [20]:
# Assumption: Frozen Foods are not double counted in Dairy and Seafood and in turn some Dairy is not double counted in 
# Canned

In [21]:
# Applying value_counts for Outlet_Size
train.Outlet_Size.value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [22]:
test.Outlet_Size.value_counts()

Medium    1862
Small     1592
High       621
Name: Outlet_Size, dtype: int64

In [23]:
# Applying value_counts for Outlet_Location_Type
train.Outlet_Location_Type.value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

In [24]:
test.Outlet_Location_Type.value_counts()

Tier 3    2233
Tier 2    1856
Tier 1    1592
Name: Outlet_Location_Type, dtype: int64

In [25]:
# Applying value_counts for Outlet_Type
train.Outlet_Type.value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [26]:
test.Outlet_Type.value_counts()

Supermarket Type1    3717
Grocery Store         722
Supermarket Type3     624
Supermarket Type2     618
Name: Outlet_Type, dtype: int64

In [27]:
train['source'] = 'train'
test['source'] = 'test'
test['Item_Outlet_Sales'] = 0
df = pd.concat([train,test])
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


In [30]:
df.shape

(14204, 13)

In [28]:
# Check for Imputation 
df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,11765.0,14204.0,14204.0,14204.0,14204.0
mean,12.792854,0.065953,141.004977,1997.830681,1308.865489
std,4.652502,0.051459,62.086938,8.371664,1699.791423
min,4.555,0.0,31.29,1985.0,0.0
25%,8.71,0.027036,94.012,1987.0,0.0
50%,12.6,0.054021,142.247,1999.0,559.272
75%,16.75,0.094037,185.8556,2004.0,2163.1842
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [29]:
df.isna().sum()/len(train)*100

Item_Identifier               0.000000
Item_Weight                  28.616684
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  47.119559
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
source                        0.000000
dtype: float64

In [31]:
# Fill Item_Weight using Item_Identifier and Item_Type
df['Item_Weight'] = df.groupby(['Item_Identifier', 'Item_Type']).Item_Weight.transform(lambda x: x.fillna(x.mode()[0]))

In [32]:
df.isna().sum()/len(train)*100

Item_Identifier               0.000000
Item_Weight                   0.000000
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  47.119559
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
source                        0.000000
dtype: float64

In [33]:
# Fill Outlet_Size
df['Outlet_Size'] = df.groupby(['Outlet_Type']).Outlet_Size.transform(lambda x: x.fillna(x.mode()[0]))

In [34]:
df.isna().sum()/len(train)*100

Item_Identifier              0.0
Item_Weight                  0.0
Item_Fat_Content             0.0
Item_Visibility              0.0
Item_Type                    0.0
Item_MRP                     0.0
Outlet_Identifier            0.0
Outlet_Establishment_Year    0.0
Outlet_Size                  0.0
Outlet_Location_Type         0.0
Outlet_Type                  0.0
Item_Outlet_Sales            0.0
source                       0.0
dtype: float64

In [35]:
# Check for df shape
df.shape

(14204, 13)

In [36]:
# Copy Item_Identifier, Source and Item_Outlet_Sales in a separate DaraFrame
Item_ID_n_source_Item_Outlet_Sales = df[['Item_Identifier', 'source', 'Item_Outlet_Sales']].copy()

In [37]:
Item_ID_n_source_Item_Outlet_Sales.head()

Unnamed: 0,Item_Identifier,source,Item_Outlet_Sales
0,FDA15,train,3735.138
1,DRC01,train,443.4228
2,FDN15,train,2097.27
3,FDX07,train,732.38
4,NCD19,train,994.7052


In [38]:
# Extract the numerical colums. Ignore Outlet_Establishment_Year since it is redundant
df_num_only = df.drop(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Establishment_Year', 'source', 'Item_Outlet_Sales'], axis=1)

In [39]:
df_num_only.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP
0,9.3,0.016047,249.8092
1,5.92,0.019278,48.2692
2,17.5,0.01676,141.618
3,19.2,0.0,182.095
4,8.93,0.0,53.8614


In [40]:
# Normalize numerical values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_data = scaler.fit_transform(df_num_only)
print(standardized_data)

[[-0.75101393 -0.96985228  1.75251118]
 [-1.47765312 -0.90706338 -1.4936965 ]
 [ 1.01183853 -0.95600038  0.00987397]
 ...
 [-0.60052652  0.14722609 -0.35855788]
 [ 0.53887812 -1.28171205  1.18574722]
 [-0.70801753  0.7533967  -0.98589388]]


In [41]:
standardized_df = pd.DataFrame(standardized_data, columns=df_num_only.columns)

In [42]:
print(standardized_df)

       Item_Weight  Item_Visibility  Item_MRP
0        -0.751014        -0.969852  1.752511
1        -1.477653        -0.907063 -1.493696
2         1.011839        -0.956000  0.009874
3         1.377308        -1.281712  0.661838
4        -0.830557        -1.281712 -1.403623
...            ...              ...       ...
14199    -0.493036        -1.019425  0.005000
14200    -1.116483         1.497142  0.453249
14201    -0.600527         0.147226 -0.358558
14202     0.538878        -1.281712  1.185747
14203    -0.708018         0.753397 -0.985894

[14204 rows x 3 columns]


In [43]:
standardized_df.shape

(14204, 3)

In [44]:
standardized_df.isna().sum()/len(train)*100

Item_Weight        0.0
Item_Visibility    0.0
Item_MRP           0.0
dtype: float64

In [45]:
# Get the text columns. Ignore source column since it is redundant
df_text_only = df[['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type' ]].copy()

In [46]:
df_text_only.head()

Unnamed: 0,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,Low Fat,Dairy,OUT049,Medium,Tier 1,Supermarket Type1
1,Regular,Soft Drinks,OUT018,Medium,Tier 3,Supermarket Type2
2,Low Fat,Meat,OUT049,Medium,Tier 1,Supermarket Type1
3,Regular,Fruits and Vegetables,OUT010,Small,Tier 3,Grocery Store
4,Low Fat,Household,OUT013,High,Tier 3,Supermarket Type1


In [47]:
# Apply One hot encoding
df_text_only_one_hot = pd.get_dummies(df_text_only, dtype=int)

In [48]:
df_text_only_one_hot.head()

Unnamed: 0,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,1,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,1,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


In [49]:
list(df_text_only_one_hot)

['Item_Fat_Content_Low Fat',
 'Item_Fat_Content_Regular',
 'Item_Type_Baking Goods',
 'Item_Type_Breads',
 'Item_Type_Breakfast',
 'Item_Type_Canned',
 'Item_Type_Dairy',
 'Item_Type_Frozen Foods',
 'Item_Type_Fruits and Vegetables',
 'Item_Type_Hard Drinks',
 'Item_Type_Health and Hygiene',
 'Item_Type_Household',
 'Item_Type_Meat',
 'Item_Type_Others',
 'Item_Type_Seafood',
 'Item_Type_Snack Foods',
 'Item_Type_Soft Drinks',
 'Item_Type_Starchy Foods',
 'Outlet_Identifier_OUT010',
 'Outlet_Identifier_OUT013',
 'Outlet_Identifier_OUT017',
 'Outlet_Identifier_OUT018',
 'Outlet_Identifier_OUT019',
 'Outlet_Identifier_OUT027',
 'Outlet_Identifier_OUT035',
 'Outlet_Identifier_OUT045',
 'Outlet_Identifier_OUT046',
 'Outlet_Identifier_OUT049',
 'Outlet_Size_High',
 'Outlet_Size_Medium',
 'Outlet_Size_Small',
 'Outlet_Location_Type_Tier 1',
 'Outlet_Location_Type_Tier 2',
 'Outlet_Location_Type_Tier 3',
 'Outlet_Type_Grocery Store',
 'Outlet_Type_Supermarket Type1',
 'Outlet_Type_Supermarket

In [59]:
df.reset_index(inplace=True, drop=True)

In [61]:
Item_ID_n_source_Item_Outlet_Sales.reset_index(inplace=True, drop=True)

In [64]:
df_text_only_one_hot.reset_index(inplace=True, drop=True)

In [65]:
data_preprocessed = pd.concat([standardized_df, df_text_only_one_hot, Item_ID_n_source_Item_Outlet_Sales], axis=1)

In [66]:
data_preprocessed.head()

Unnamed: 0,index,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Identifier,source,Item_Outlet_Sales
0,0,-0.751014,-0.969852,1.752511,1,0,0,0,0,0,...,1,0,0,0,1,0,0,FDA15,train,3735.138
1,1,-1.477653,-0.907063,-1.493696,0,1,0,0,0,0,...,0,0,1,0,0,1,0,DRC01,train,443.4228
2,2,1.011839,-0.956,0.009874,1,0,0,0,0,0,...,1,0,0,0,1,0,0,FDN15,train,2097.27
3,3,1.377308,-1.281712,0.661838,0,1,0,0,0,0,...,0,0,1,1,0,0,0,FDX07,train,732.38
4,4,-0.830557,-1.281712,-1.403623,1,0,0,0,0,0,...,0,0,1,0,1,0,0,NCD19,train,994.7052


In [67]:
data_preprocessed.tail()

Unnamed: 0,index,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Identifier,source,Item_Outlet_Sales
14199,14199,-0.493036,-1.019425,0.005,0,1,0,0,0,0,...,1,0,0,0,1,0,0,FDB58,test,0.0
14200,14200,-1.116483,1.497142,0.453249,0,1,0,0,0,0,...,0,0,1,0,0,1,0,FDD47,test,0.0
14201,14201,-0.600527,0.147226,-0.358558,1,0,0,0,0,0,...,0,1,0,0,1,0,0,NCO17,test,0.0
14202,14202,0.538878,-1.281712,1.185747,0,1,0,0,0,1,...,0,1,0,0,1,0,0,FDJ26,test,0.0
14203,14203,-0.708018,0.753397,-0.985894,0,1,0,0,0,1,...,0,1,0,0,1,0,0,FDU37,test,0.0


In [68]:
data_preprocessed.shape

(14204, 45)

In [80]:
df_outlet_ID = df[['Outlet_Identifier']].copy()

In [82]:
data_preprocessed_1 = pd.concat([data_preprocessed, df_outlet_ID], axis=1)

In [83]:
data_preprocessed_1.shape

(14204, 46)

In [84]:
data_preprocessed_1.head()

Unnamed: 0,index,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Identifier,source,Item_Outlet_Sales,Outlet_Identifier
0,0,-0.751014,-0.969852,1.752511,1,0,0,0,0,0,...,0,0,0,1,0,0,FDA15,train,3735.138,OUT049
1,1,-1.477653,-0.907063,-1.493696,0,1,0,0,0,0,...,0,1,0,0,1,0,DRC01,train,443.4228,OUT018
2,2,1.011839,-0.956,0.009874,1,0,0,0,0,0,...,0,0,0,1,0,0,FDN15,train,2097.27,OUT049
3,3,1.377308,-1.281712,0.661838,0,1,0,0,0,0,...,0,1,1,0,0,0,FDX07,train,732.38,OUT010
4,4,-0.830557,-1.281712,-1.403623,1,0,0,0,0,0,...,0,1,0,1,0,0,NCD19,train,994.7052,OUT013


In [85]:
data_preprocessed_1.tail()

Unnamed: 0,index,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Identifier,source,Item_Outlet_Sales,Outlet_Identifier
14199,14199,-0.493036,-1.019425,0.005,0,1,0,0,0,0,...,0,0,0,1,0,0,FDB58,test,0.0,OUT046
14200,14200,-1.116483,1.497142,0.453249,0,1,0,0,0,0,...,0,1,0,0,1,0,FDD47,test,0.0,OUT018
14201,14201,-0.600527,0.147226,-0.358558,1,0,0,0,0,0,...,1,0,0,1,0,0,NCO17,test,0.0,OUT045
14202,14202,0.538878,-1.281712,1.185747,0,1,0,0,0,1,...,1,0,0,1,0,0,FDJ26,test,0.0,OUT017
14203,14203,-0.708018,0.753397,-0.985894,0,1,0,0,0,1,...,1,0,0,1,0,0,FDU37,test,0.0,OUT045


In [100]:
train_preprocessed = data_preprocessed_1[data_preprocessed_1['source']=='train'].drop(columns='source')
test_preprocessed = data_preprocessed_1[data_preprocessed_1['source']=='test'].drop(columns=['source', 'Item_Outlet_Sales'])

In [101]:
train_preprocessed.to_csv("train_preprocessed.csv", index = False)
test_preprocessed.to_csv("test_preprocessed.csv", index = False)

In [107]:
train_preprocessed.head()

Unnamed: 0,index,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Identifier,Item_Outlet_Sales,Outlet_Identifier
0,0,-0.751014,-0.969852,1.752511,1,0,0,0,0,0,...,1,0,0,0,1,0,0,FDA15,3735.138,OUT049
1,1,-1.477653,-0.907063,-1.493696,0,1,0,0,0,0,...,0,0,1,0,0,1,0,DRC01,443.4228,OUT018
2,2,1.011839,-0.956,0.009874,1,0,0,0,0,0,...,1,0,0,0,1,0,0,FDN15,2097.27,OUT049
3,3,1.377308,-1.281712,0.661838,0,1,0,0,0,0,...,0,0,1,1,0,0,0,FDX07,732.38,OUT010
4,4,-0.830557,-1.281712,-1.403623,1,0,0,0,0,0,...,0,0,1,0,1,0,0,NCD19,994.7052,OUT013


In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split

In [109]:
train = pd.read_csv("train_preprocessed.csv")
test = pd.read_csv("test_preprocessed.csv")

In [110]:
train.head()

Unnamed: 0,index,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Identifier,Item_Outlet_Sales,Outlet_Identifier
0,0,-0.751014,-0.969852,1.752511,1,0,0,0,0,0,...,1,0,0,0,1,0,0,FDA15,3735.138,OUT049
1,1,-1.477653,-0.907063,-1.493696,0,1,0,0,0,0,...,0,0,1,0,0,1,0,DRC01,443.4228,OUT018
2,2,1.011839,-0.956,0.009874,1,0,0,0,0,0,...,1,0,0,0,1,0,0,FDN15,2097.27,OUT049
3,3,1.377308,-1.281712,0.661838,0,1,0,0,0,0,...,0,0,1,1,0,0,0,FDX07,732.38,OUT010
4,4,-0.830557,-1.281712,-1.403623,1,0,0,0,0,0,...,0,0,1,0,1,0,0,NCD19,994.7052,OUT013


In [111]:
test.head()

Unnamed: 0,index,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Identifier,Outlet_Identifier
0,8523,1.71053,-1.134699,-0.533831,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,FDW58,OUT049
1,8524,-0.965996,-0.534917,-0.864708,0,1,0,0,0,0,...,1,0,1,0,0,1,0,0,FDW14,OUT017
2,8525,0.388391,0.653405,1.622763,1,0,0,0,0,0,...,1,0,0,1,1,0,0,0,NCN55,OUT010
3,8526,-1.177753,-0.982657,0.225966,1,0,0,0,0,0,...,1,0,1,0,0,1,0,0,FDQ58,OUT017
4,8527,0.173409,1.023121,1.501577,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,FDY38,OUT027


In [112]:
X = train.drop(columns = ["Item_Outlet_Sales", "Item_Identifier", "Outlet_Identifier"])
y = train['Item_Outlet_Sales']

In [113]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size = 0.2, random_state=32)

In [114]:
model = LinearRegression()
model

LinearRegression()

In [115]:
model.fit(train_X, train_y)

LinearRegression()

In [116]:
model.coef_

array([ 3.80911914e-03,  6.09100397e+00, -9.45702185e+00,  9.77624234e+02,
       -1.97084972e+01,  1.97084972e+01, -1.71558055e+01, -9.26708218e+00,
       -2.67947051e+01,  1.71531501e+01, -6.61182453e+01, -5.03723857e+01,
        2.98952103e+01, -1.88420520e+01, -3.10934623e+01, -9.79974893e+00,
        8.96502098e+00, -4.92409467e+00,  1.59412650e+02,  6.94500038e+00,
       -3.45043793e+01,  4.65009292e+01, -4.98664069e+02, -2.56372875e+01,
        1.03657207e+02, -1.72673745e+02, -3.53003241e+02,  6.79357044e+02,
        1.56619229e+02, -8.02651730e+01,  3.70692104e+02, -1.80082068e+02,
       -2.56372875e+01,  3.26601231e+02, -3.00963944e+02, -1.62393205e+02,
        1.80011263e+02, -1.76180572e+01, -8.51667311e+02,  3.44984011e+02,
       -1.72673745e+02,  6.79357044e+02])

In [117]:
model.intercept_

2032.4414164499258

In [119]:
train_X_pred = model.predict(train_X)
test_X_pred = model.predict(test_X)

#actual test data from hackathon
test_pred = model.predict(test.drop(columns = ["Item_Identifier", "Outlet_Identifier"]))

In [120]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [122]:
train_mean_squared_error = (train_y, train_X_pred)
test_mean_squared_error = (test_y, test_X_pred)

In [123]:
train_mean_squared_error

(2731     525.9820
 7183    2116.5782
 3109    1004.0264
 2552    1869.5664
 7006    3904.9170
           ...    
 4606    4660.6000
 3143    1845.5976
 8444     482.0392
 7571    1052.6298
 4030    1318.2840
 Name: Item_Outlet_Sales, Length: 6818, dtype: float64,
 array([1592.54049232, 3175.74892271,  626.4780937 , ..., 3346.50851417,
        1104.80171548, 1872.27998515]))

In [124]:
test_mean_squared_error

(4083    1216.4166
 3536    5452.9020
 6222    1282.3308
 6514    2376.9060
 147     1597.9200
           ...    
 851      547.2876
 735     1751.0540
 4703    1901.5248
 1482    4138.6128
 6931    5634.6654
 Name: Item_Outlet_Sales, Length: 1705, dtype: float64,
 array([4316.59072497, 3319.26071065, 2308.05454723, ..., 2584.69524943,
        3193.34227206, 2929.88519038]))

In [125]:
train_mean_absolute_error = (train_y, train_X_pred)
test_mean_absolute_error = (test_y, test_X_pred)

In [126]:
train_mean_absolute_error

(2731     525.9820
 7183    2116.5782
 3109    1004.0264
 2552    1869.5664
 7006    3904.9170
           ...    
 4606    4660.6000
 3143    1845.5976
 8444     482.0392
 7571    1052.6298
 4030    1318.2840
 Name: Item_Outlet_Sales, Length: 6818, dtype: float64,
 array([1592.54049232, 3175.74892271,  626.4780937 , ..., 3346.50851417,
        1104.80171548, 1872.27998515]))

In [127]:
test_mean_absolute_error

(4083    1216.4166
 3536    5452.9020
 6222    1282.3308
 6514    2376.9060
 147     1597.9200
           ...    
 851      547.2876
 735     1751.0540
 4703    1901.5248
 1482    4138.6128
 6931    5634.6654
 Name: Item_Outlet_Sales, Length: 1705, dtype: float64,
 array([4316.59072497, 3319.26071065, 2308.05454723, ..., 2584.69524943,
        3193.34227206, 2929.88519038]))

In [128]:
train_r2_score = r2_score(train_y, train_X_pred)
test_r2_score = r2_score(test_y, test_X_pred)

In [129]:
train_r2_score

0.5645510660727269

In [130]:
test_r2_score

0.5590382901601723

In [131]:
test_pred_modified = np.where(test_pred<0,0,test_pred)

In [132]:
test['Item_Outlet_Sales'] = test_pred_modified

In [133]:
Output = test[[ "Item_Identifier", "Outlet_Identifier","Item_Outlet_Sales"]]

In [136]:
Output.to_csv("E:\Big_Mart_Linear_Regression.csv", index = False)

In [137]:
Output.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1880.516418
1,FDW14,OUT017,1500.004553
2,NCN55,OUT010,1954.006784
3,FDQ58,OUT017,2602.871835
4,FDY38,OUT027,5145.567693
