# <center><u> Streamlit for Project 1 Part A (Core)
- Authored By: Eric N. Valdez
- Date: 4/14/2024

- ### For this assignment, you will start making a streamlit dashboard for your Project 1 Sales Prediction Dataset. This is the link to the [original dataset](https://drive.google.com/file/d/1syH81TVrbBsdymLT_jl2JIf6IjPXtSQw/view) we used in course 1.
- `Note` Remember to change your app settings to Run on Save
    - Correct inconsistencies in the Item_Fat_Content column
    - Drop Item_Identifier, Outlet_Identifier, and Outlet_Establishment_Year

# Imports: 

In [1]:
# Imports
import seaborn as sns
# Missingno
import missingno as msno
# Matplotlib
import matplotlib.pyplot as plt
#Seaborn
import seaborn as sns
#Pandas
import pandas as pd
# Numpy
import numpy as np
#Warnings
import warnings
# Set filter warnings to ignore
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns',100)


In [2]:
# Loading Data
product_sales = ('Data/sales_predictions_2023.csv')
df = pd.read_csv(product_sales)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
# Drop Item_Identifier, Outlet_Identifier, and Outlet_Establishment_Year
df.drop(columns=['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], inplace=True)

In [6]:
df.isna().sum()

Item_Weight             1463
Item_Fat_Content           0
Item_Visibility            0
Item_Type                  0
Item_MRP                   0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
Item_Outlet_Sales          0
dtype: int64

In [7]:
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,,Tier 3,Grocery Store,732.3800
7,,Low Fat,0.127470,Snack Foods,107.7622,Medium,Tier 3,Supermarket Type3,4022.7636
8,16.20,Regular,0.016687,Frozen Foods,96.9726,,Tier 2,Supermarket Type1,1076.5986
9,19.20,Regular,0.094450,Frozen Foods,187.8214,,Tier 2,Supermarket Type1,4710.5350
18,,Low Fat,0.034238,Hard Drinks,113.2834,Medium,Tier 3,Supermarket Type3,2303.6680
...,...,...,...,...,...,...,...,...,...
8504,,Low Fat,0.124111,Household,111.7544,Medium,Tier 3,Supermarket Type3,4138.6128
8508,11.35,Regular,0.043246,Fruits and Vegetables,199.4742,,Tier 2,Supermarket Type1,2587.9646
8509,8.10,Low Fat,0.214306,Fruits and Vegetables,213.9902,,Tier 3,Grocery Store,424.7804
8514,15.00,Regular,0.054489,Canned,57.5904,,Tier 2,Supermarket Type1,468.7232


In [8]:
# # Drop rows with missing values
# df.dropna(inplace=True)

In [9]:
# df.isna().sum()

In [10]:
# Correct inconsistencies in the Item_Fat_Content column
column_data = df['Item_Fat_Content']
column_data

0       Low Fat
1       Regular
2       Low Fat
3       Regular
4       Low Fat
         ...   
8518    Low Fat
8519    Regular
8520    Low Fat
8521    Regular
8522    Low Fat
Name: Item_Fat_Content, Length: 8523, dtype: object

In [11]:
print(df['Item_Fat_Content'].head())

0    Low Fat
1    Regular
2    Low Fat
3    Regular
4    Low Fat
Name: Item_Fat_Content, dtype: object


In [12]:
print(df['Item_Fat_Content'].tail())

8518    Low Fat
8519    Regular
8520    Low Fat
8521    Regular
8522    Low Fat
Name: Item_Fat_Content, dtype: object


In [13]:
print(df['Item_Fat_Content'].describe())

count        8523
unique          5
top       Low Fat
freq         5089
Name: Item_Fat_Content, dtype: object


In [14]:
print(df['Item_Fat_Content'].unique())

['Low Fat' 'Regular' 'low fat' 'LF' 'reg']


In [15]:
# Replace 'reg' with 'Regular, 'LF' & 'low fat' with "Low Fat"
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF':"Low Fat", 'low fat': "Low Fat", 'reg':"Regular" })
df['Item_Fat_Content'].describe()

count        8523
unique          2
top       Low Fat
freq         5517
Name: Item_Fat_Content, dtype: object

In [16]:
print(df['Item_Fat_Content'].unique())

['Low Fat' 'Regular']


In [17]:
# Checking Value Counts
df['Item_Fat_Content'].value_counts

<bound method IndexOpsMixin.value_counts of 0       Low Fat
1       Regular
2       Low Fat
3       Regular
4       Low Fat
         ...   
8518    Low Fat
8519    Regular
8520    Low Fat
8521    Regular
8522    Low Fat
Name: Item_Fat_Content, Length: 8523, dtype: object>

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           7060 non-null   float64
 1   Item_Fat_Content      8523 non-null   object 
 2   Item_Visibility       8523 non-null   float64
 3   Item_Type             8523 non-null   object 
 4   Item_MRP              8523 non-null   float64
 5   Outlet_Size           6113 non-null   object 
 6   Outlet_Location_Type  8523 non-null   object 
 7   Outlet_Type           8523 non-null   object 
 8   Item_Outlet_Sales     8523 non-null   float64
dtypes: float64(4), object(5)
memory usage: 599.4+ KB


In [19]:
### Interactive Pandas dataframe of the prepared dataset from above
#Defining columns to use
columns_to_use = ['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',\
                  'Item_MRP', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']
df = df[columns_to_use]
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,High,Tier 3,Supermarket Type1,994.7052


In [20]:
### A button to trigger the display of a dataframe of Descriptive Statistics
# Obtain summary statistics
df.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0
mean,12.86,0.07,140.99,2181.29
std,4.64,0.05,62.28,1706.5
min,4.56,0.0,31.29,33.29
25%,8.77,0.03,93.83,834.25
50%,12.6,0.05,143.01,1794.33
75%,16.85,0.09,185.64,3101.3
max,21.35,0.33,266.89,13086.96


In [21]:
### A button to trigger the display of the Null values 
# Displaying null values
nulls =df.isna().sum()
nulls

Item_Weight             1463
Item_Fat_Content           0
Item_Visibility            0
Item_Type                  0
Item_MRP                   0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
Item_Outlet_Sales          0
dtype: int64

In [24]:
# Check data type
type(nulls)

pandas.core.series.Series

In [22]:
### A button to trigger the display of the summary information (the output of .info)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           7060 non-null   float64
 1   Item_Fat_Content      8523 non-null   object 
 2   Item_Visibility       8523 non-null   float64
 3   Item_Type             8523 non-null   object 
 4   Item_MRP              8523 non-null   float64
 5   Outlet_Size           6113 non-null   object 
 6   Outlet_Location_Type  8523 non-null   object 
 7   Outlet_Type           8523 non-null   object 
 8   Item_Outlet_Sales     8523 non-null   float64
dtypes: float64(4), object(5)
memory usage: 599.4+ KB


In [23]:
#Saving new data frame
df.to_csv('Data/item_fat_content.csv', index=False)

# Stremlit App

## [Project 1 Part A](https://part1apy-nuzgqglnhhw3pf4vc4zt9b.streamlit.app/)