# Streamlit for Product Prediction
Part A (Core)

*Christina Brockway*

#### Load Data and Imports

In [1]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot  as plt
import seaborn as sns
# Allow up to 100 columns to be shown when a DataFrame is displayed
pd.set_option('display.max_columns',100)

In [2]:
# Load data
# Read the raw sales CSV (the path is relative to the current working
# directory) and preview the first two rows.
fpath = 'data/sales_predictions_2023.csv'
df = pd.read_csv(fpath)
df.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228


#### Clean data

In [3]:
# Remove the two identifier columns and the establishment year,
# then summarize the columns that remain.
cols_to_drop = ['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year']
df = df.drop(columns=cols_to_drop)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           7060 non-null   float64
 1   Item_Fat_Content      8523 non-null   object 
 2   Item_Visibility       8523 non-null   float64
 3   Item_Type             8523 non-null   object 
 4   Item_MRP              8523 non-null   float64
 5   Outlet_Size           6113 non-null   object 
 6   Outlet_Location_Type  8523 non-null   object 
 7   Outlet_Type           8523 non-null   object 
 8   Item_Outlet_Sales     8523 non-null   float64
dtypes: float64(4), object(5)
memory usage: 599.4+ KB


In [4]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

In [5]:
# Count the distinct values in each column
df.nunique()

Item_Weight              415
Item_Fat_Content           5
Item_Visibility         7880
Item_Type                 16
Item_MRP                5938
Outlet_Size                3
Outlet_Location_Type       3
Outlet_Type                4
Item_Outlet_Sales       3493
dtype: int64

In [6]:
# List the labels used in Item_Fat_Content to spot inconsistent spellings
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [7]:
# Map the inconsistent spellings onto the two canonical labels,
# then re-check the counts to confirm only those two remain.
fat_content_fixes = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [8]:
# Count missing values in each column
df.isna().sum()

Item_Weight             1463
Item_Fat_Content           0
Item_Visibility            0
Item_Type                  0
Item_MRP                   0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
Item_Outlet_Sales          0
dtype: int64

In [9]:
# Drop every row that has a missing value in any column. Per the counts
# above, only Item_Weight and Outlet_Size contain NaNs.
# NOTE(review): this keeps 4650 of the 8523 rows -- confirm that discarding
# this much data (rather than imputing) is intended.
# The original row labels are kept (the index is not reset), so the index
# is no longer contiguous after this step.
df = df.dropna(axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4650 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           4650 non-null   float64
 1   Item_Fat_Content      4650 non-null   object 
 2   Item_Visibility       4650 non-null   float64
 3   Item_Type             4650 non-null   object 
 4   Item_MRP              4650 non-null   float64
 5   Outlet_Size           4650 non-null   object 
 6   Outlet_Location_Type  4650 non-null   object 
 7   Outlet_Type           4650 non-null   object 
 8   Item_Outlet_Sales     4650 non-null   float64
dtypes: float64(4), object(5)
memory usage: 363.3+ KB


In [10]:
#save the cleaned file
# index=False keeps the DataFrame index out of the CSV, so the file holds
# only the data columns. load_data() below reads this same path.
path = 'data/sales_prediction_clean.csv'
df.to_csv(path, index=False)

#### Testing Functions

In [11]:
## Function to load data
def load_data(fpath='data/sales_prediction_clean.csv'):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    fpath : str or path-like, optional
        Location of the CSV file to read. Defaults to the cleaned sales
        file saved earlier in this notebook, so calling ``load_data()``
        with no arguments behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV file.
    """
    return pd.read_csv(fpath)

In [12]:
# Reload the cleaned data through load_data() and preview the first rows
df= load_data()
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,Medium,Tier 1,Supermarket Type1,2097.27
3,8.93,Low Fat,0.0,Household,53.8614,High,Tier 3,Supermarket Type1,994.7052
4,10.395,Regular,0.0,Baking Goods,51.4008,Medium,Tier 3,Supermarket Type2,556.6088


In [13]:
# Summarize the reloaded frame's columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4650 entries, 0 to 4649
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           4650 non-null   float64
 1   Item_Fat_Content      4650 non-null   object 
 2   Item_Visibility       4650 non-null   float64
 3   Item_Type             4650 non-null   object 
 4   Item_MRP              4650 non-null   float64
 5   Outlet_Size           4650 non-null   object 
 6   Outlet_Location_Type  4650 non-null   object 
 7   Outlet_Type           4650 non-null   object 
 8   Item_Outlet_Sales     4650 non-null   float64
dtypes: float64(4), object(5)
memory usage: 327.1+ KB
