In [2]:
# Installing opendatasets library

!pip install jovian opendatasets --upgrade --quiet


[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
#Importing the required libraries

import opendatasets as od
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation

In [4]:
od.download("https://www.kaggle.com/datasets/thedevastator/bigmart-product-sales-factors")

Skipping, found downloaded files in ".\bigmart-product-sales-factors" (use force=True to force download)


In [5]:
bigmart_df=pd.read_csv("bigmart-product-sales-factors/data.csv")
bigmart_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138000
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.422800
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.270000
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.380000
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.705200
...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,2213.476465
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,2349.547110
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1,1892.981544
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1,4533.564958


In [6]:
bigmart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                11765 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                10188 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          14204 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.3+ MB


In [7]:
bigmart_df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [8]:
bigmart_df.shape

(14204, 12)

## Data Preparation and Cleaning

In [9]:
itemWeight_Mean=bigmart_df['Item_Weight'].mean()
itemWeight_Mean

12.792854228644284

In [10]:
bigmart_df['Item_Weight']=bigmart_df['Item_Weight'].fillna(itemWeight_Mean)

In [11]:
bigmart_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138000
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.422800
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.270000
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.380000
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.705200
...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,2213.476465
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,2349.547110
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1,1892.981544
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1,4533.564958


In [12]:
bigmart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                14204 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                10188 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          14204 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.3+ MB


In [13]:
missing_columns = bigmart_df.columns[bigmart_df.isnull().any()].tolist()
impute_data = bigmart_df[missing_columns]

In [14]:
no_missing_data = bigmart_df.drop(missing_columns, axis = 1)

In [15]:
knn_imputer = KNNImputer(n_neighbors = 5)

In [16]:
imputed_data=pd.DataFrame()
for column in missing_columns:
    if impute_data[column].isnull().any():
        impute_data_numeric = pd.get_dummies(impute_data[column])
        
        imputed_data_numeric = pd.DataFrame(knn_imputer.fit_transform(impute_data_numeric), columns=impute_data_numeric.columns)
        
        imputed_data[column] = imputed_data_numeric.idxmax(axis=1)
    else:
        imputed_data[column] = impute_data[column]

filled_data = pd.concat([no_missing_data, imputed_data], axis=1)

In [17]:
new_bigmart_df=filled_data

In [18]:
new_bigmart_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Size
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Tier 1,Supermarket Type1,3735.138000,Medium
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Tier 3,Supermarket Type2,443.422800,Medium
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Tier 1,Supermarket Type1,2097.270000,Medium
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Tier 3,Grocery Store,732.380000,High
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,Tier 3,Supermarket Type1,994.705200,High
...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Tier 1,Supermarket Type1,2213.476465,Small
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Tier 3,Supermarket Type2,2349.547110,Medium
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,Tier 2,Supermarket Type1,1892.981544,High
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,Tier 2,Supermarket Type1,4533.564958,High


In [19]:
columns = new_bigmart_df.columns.tolist()
columns.remove("Outlet_Size")

target_index = columns.index("Outlet_Establishment_Year")
columns.insert(target_index + 1, "Outlet_Size")

new_bigmart_df=new_bigmart_df[columns]

In [20]:
new_bigmart_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138000
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.422800
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.270000
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,High,Tier 3,Grocery Store,732.380000
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.705200
...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,2213.476465
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,2349.547110
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,High,Tier 2,Supermarket Type1,1892.981544
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,High,Tier 2,Supermarket Type1,4533.564958


In [21]:
new_bigmart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                14204 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                14204 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          14204 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.3+ MB


In [22]:
new_bigmart_df=new_bigmart_df.round(2)

In [23]:
new_bigmart_df.sample(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
13268,FDT04,12.79,Low Fat,0.11,Frozen Foods,38.18,OUT027,1985,Medium,Tier 3,Supermarket Type3,904.43
4490,FDW48,18.0,Low Fat,0.01,Baking Goods,81.26,OUT035,2004,Small,Tier 2,Supermarket Type1,402.81
10782,NCW30,5.21,Low Fat,0.01,Household,258.8,OUT045,2002,High,Tier 2,Supermarket Type1,3181.06
12070,DRC24,17.85,Low Fat,0.02,Soft Drinks,151.8,OUT049,1999,Medium,Tier 1,Supermarket Type1,2297.4
12125,NCW54,12.79,Low Fat,0.1,Household,57.86,OUT027,1985,Medium,Tier 3,Supermarket Type3,1765.03
9809,FDT01,13.65,Regular,0.18,Canned,214.09,OUT035,2004,Small,Tier 2,Supermarket Type1,3374.93
4460,FDM01,7.9,Regular,0.09,Breakfast,100.73,OUT013,1987,High,Tier 3,Supermarket Type1,1230.4
12654,FDZ03,13.65,Regular,0.08,Dairy,187.62,OUT013,1987,High,Tier 3,Supermarket Type1,3228.2
5289,FDM04,9.2,Regular,0.05,Frozen Foods,53.17,OUT046,1997,Small,Tier 1,Supermarket Type1,769.0
9631,FDT50,6.75,Regular,0.18,Dairy,97.38,OUT010,1998,High,Tier 3,Grocery Store,184.8


In [24]:
new_bigmart_df["Item_Fat_Content"].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [25]:
new_bigmart_df["Item_Fat_Content"]=new_bigmart_df["Item_Fat_Content"].replace(['low fat','LF'],'Low Fat')
new_bigmart_df["Item_Fat_Content"]=new_bigmart_df["Item_Fat_Content"].replace(['reg'],'Regular')

In [26]:
new_bigmart_df.sample(20)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
4119,NCA06,20.5,Low Fat,0.14,Household,37.22,OUT045,2002,High,Tier 2,Supermarket Type1,476.05
825,FDF46,7.07,Low Fat,0.09,Snack Foods,113.38,OUT035,2004,Small,Tier 2,Supermarket Type1,1267.02
10247,FDQ60,12.79,Regular,0.11,Baking Goods,118.51,OUT027,1985,Medium,Tier 3,Supermarket Type3,2997.6
1321,NCY41,16.75,Low Fat,0.08,Health and Hygiene,37.65,OUT045,2002,High,Tier 2,Supermarket Type1,575.25
12814,FDY45,17.5,Low Fat,0.04,Snack Foods,252.94,OUT010,1998,High,Tier 3,Grocery Store,399.22
391,FDY32,7.6,Low Fat,0.13,Fruits and Vegetables,164.02,OUT045,2002,High,Tier 2,Supermarket Type1,3914.9
3367,FDT10,16.7,Regular,0.06,Snack Foods,60.96,OUT013,1987,High,Tier 3,Supermarket Type1,355.54
10670,NCG07,12.3,Low Fat,0.05,Household,188.65,OUT049,1999,Medium,Tier 1,Supermarket Type1,3068.58
5244,FDT37,14.15,Low Fat,0.04,Canned,255.5,OUT013,1987,High,Tier 3,Supermarket Type1,5355.03
14102,NCI29,8.6,Low Fat,0.03,Health and Hygiene,140.62,OUT013,1987,High,Tier 3,Supermarket Type1,1559.11
