<a href="https://colab.research.google.com/github/Souha-Kabtni/Food-Sales-Predictions/blob/main/Project_1_Part_5_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab only) so files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import my to be used packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn import set_config
# Show up to 100 columns when a wide DataFrame is displayed
pd.set_option('display.max_columns', 100)
# Have scikit-learn transformers return pandas DataFrames (keeps column names)
set_config(transform_output='pandas')


In [3]:
# Read the sales CSV from the mounted Drive, then summarise and preview it
csv_path = '/content/drive/MyDrive/Coding_Dojo/Stack 2/Core assignments/sales_predictions_2023.csv'
df = pd.read_csv(csv_path)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## **Data Pre-processing**

In [4]:
# Drop 'Item_Identifier': it has high cardinality (1559 unique labels), which
# can negatively impact the models' metrics and greatly increase processing times.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising a KeyError.
df = df.drop(columns=['Item_Identifier'], errors='ignore')
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
# Show every row that is an exact duplicate of an earlier one
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

# ➿ The result is empty: there are no duplicate rows :)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales


In [6]:
# Percentage of missing values in each column
missing_pct = df.isna().sum().div(len(df)).mul(100)
missing_pct

# ➿ Item_Weight and Outlet_Size contain missing values; since we are going to perform Machine Learning, they will be treated later on :)

Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64

### ❌ Finding and Fixing any inconsistent categories of data

In [7]:
# Collect the dtype of every column so the object (string) columns can be located
df_types = df.dtypes
df_types

Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [8]:
# Keep only the entries of df_types whose dtype is object ("O")
is_object = df_types == "O"
str_cols = df_types.loc[is_object]
str_cols

Item_Fat_Content        object
Item_Type               object
Outlet_Identifier       object
Outlet_Size             object
Outlet_Location_Type    object
Outlet_Type             object
dtype: object

In [9]:
# The index of the filtered dtype Series holds the names of the object columns
str_cols.index

Index(['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size',
       'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [10]:
# Preview the first rows of the object columns only
df.loc[:, str_cols.index].head()

Unnamed: 0,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,Low Fat,Dairy,OUT049,Medium,Tier 1,Supermarket Type1
1,Regular,Soft Drinks,OUT018,Medium,Tier 3,Supermarket Type2
2,Low Fat,Meat,OUT049,Medium,Tier 1,Supermarket Type1
3,Regular,Fruits and Vegetables,OUT010,,Tier 3,Grocery Store
4,Low Fat,Household,OUT013,High,Tier 3,Supermarket Type1


In [11]:
# Keep just the names of the object columns, as an Index.
# The names are derived straight from df rather than from the previous value of
# str_cols, so this cell gives the same result however many times it is run
# (re-assigning str_cols from its own `.index` would raise AttributeError on a
# second run, because an Index has no `.index` attribute).
str_cols = df.select_dtypes(include='object').columns

In [12]:
# Display str_cols to confirm it now holds the object-column names
str_cols

Index(['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size',
       'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [13]:
# Print the frequency of every category (NaN included) for each object column,
# to spot inconsistent spellings of the same label
for col, values in df[str_cols].items():
    print(f"{col}:")
    print(values.value_counts(dropna=False))
    print("\n\n")

Item_Fat_Content:
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64



Item_Type:
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64



Outlet_Identifier:
OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT018    928
OUT017    926
OUT010    555
OUT019    528
Name: Outlet_Identifier, dtype: int64



Outlet_Size:
Medium    2793
NaN       2410
Small     2388
High       932
Name: Outlet_Size, dtype: int64



Outlet_Location_Type:
Tier 3  

**❎ Found that Item_Fat_Content column contains both**

1. 'Low Fat', 'LF' and 'low fat', which are the same category written differently
2. 'Regular' and 'reg', which are the same category written differently

✅ Replacing 'LF' and 'low fat' with 'Low Fat', and 'reg' with 'Regular'

In [14]:
# Standardise the inconsistent Item_Fat_Content labels.
# The replacement is applied to this column only: a frame-wide df.replace would
# also rewrite any other column that happened to contain exactly 'LF' or 'reg'.
# Assigning the result (instead of inplace=True) keeps the cell idempotent.
fat_content_map = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_map)

In [15]:
# Re-count the Item_Fat_Content labels to check the result of the replacement
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [16]:
# Summarise the cleaned frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                7060 non-null   float64
 1   Item_Fat_Content           8523 non-null   object 
 2   Item_Visibility            8523 non-null   float64
 3   Item_Type                  8523 non-null   object 
 4   Item_MRP                   8523 non-null   float64
 5   Outlet_Identifier          8523 non-null   object 
 6   Outlet_Establishment_Year  8523 non-null   int64  
 7   Outlet_Size                6113 non-null   object 
 8   Outlet_Location_Type       8523 non-null   object 
 9   Outlet_Type                8523 non-null   object 
 10  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 732.6+ KB


✅ The dataset is of shape: 8523 rows and 11 columns (12 before dropping Item_Identifier)

✅ There are a mixture of datatypes:

6 of type object (after dropping the Item_Identifier column due to its high cardinality)

4 of type float

1 of type integer

### **With Item_Outlet_Sales being the target value to predict with our model, the split will be as follows: the target (y) is Item_Outlet_Sales and the remaining columns are the features (X)**

In [17]:
# Separate the target from the features, then hold out a test set
# (random_state fixed so the split is reproducible)
X = df.drop(columns='Item_Outlet_Sales')
y = df['Item_Outlet_Sales']
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [18]:
# Preprocessing: one pipeline per kind of feature, combined in a ColumnTransformer


# Defining a list of features for each pathway
# Outlet_Establishment_Year is excluded from the numeric group and is not listed
# in any other group, so it reaches the output unchanged (unscaled) through
# remainder='passthrough' below.
numerical_cols = X_train.select_dtypes("number").drop(columns='Outlet_Establishment_Year').columns
ordinal_cols = ['Outlet_Size']
nominal_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Location_Type', 'Outlet_Type']


# Instantiating the Transformers

impute_mean = SimpleImputer(strategy='mean')  # 'mean' is the default, spelled out for clarity
impute_most_frequent = SimpleImputer(strategy='most_frequent')
Outlet_Size_order = ['Small', 'Medium', 'High']  # encoded as 0, 1, 2 in this order
ordinal_encoder = OrdinalEncoder(categories=[Outlet_Size_order])
# handle_unknown='ignore': categories unseen during fit are encoded as all zeros
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# make_pipeline does not clone its steps, so each pipeline gets its own scaler
# instance; sharing one object would let a direct fit of one pipeline overwrite
# the statistics learned by the other.
scaler = StandardScaler()
ordinal_scaler = StandardScaler()


# Instantiating the pipelines

numerical_pipe = make_pipeline(impute_mean, scaler)
ordinal_pipe = make_pipeline(impute_most_frequent, ordinal_encoder, ordinal_scaler)
nominal_pipe = make_pipeline(ohe_encoder)


# Defining a (name, transformer, columns) tuple for each pathway

numerical_tup = ('Numerical', numerical_pipe, numerical_cols)
ordinal_tup = ('Ordinal', ordinal_pipe, ordinal_cols)
nominal_tup = ('Nominal', nominal_pipe, nominal_cols)


# Instantiating the ColumnTransformer
# verbose_feature_names_out=False keeps the output column names free of the
# 'Numerical__' / 'Ordinal__' / 'Nominal__' prefixes.

col_transformer = ColumnTransformer([numerical_tup, ordinal_tup, nominal_tup], remainder='passthrough', verbose_feature_names_out=False)
col_transformer

In [19]:
# Fitting the ColumnTransformer on the training data only, so that everything it
# learns (imputation values, category lists, scaling statistics) comes from X_train

col_transformer.fit(X_train)

In [20]:
# Apply the already-fitted ColumnTransformer to both splits (training first, then testing)

X_train_processed, X_test_processed = (
    col_transformer.transform(split) for split in (X_train, X_test)
)

In [21]:
# Inspect the processed training set: dtypes / non-null counts, then summary statistics
X_train_processed.info()
X_train_processed.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 40 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Item_Weight                      6392 non-null   float64
 1   Item_Visibility                  6392 non-null   float64
 2   Item_MRP                         6392 non-null   float64
 3   Outlet_Size                      6392 non-null   float64
 4   Item_Fat_Content_Low Fat         6392 non-null   float64
 5   Item_Fat_Content_Regular         6392 non-null   float64
 6   Item_Type_Baking Goods           6392 non-null   float64
 7   Item_Type_Breads                 6392 non-null   float64
 8   Item_Type_Breakfast              6392 non-null   float64
 9   Item_Type_Canned                 6392 non-null   float64
 10  Item_Type_Dairy                  6392 non-null   float64
 11  Item_Type_Frozen Foods           6392 non-null   float64
 12  Item_Type_Fruits 

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Establishment_Year
count,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0
mean,4.457566e-16,-6.16945e-17,4.668773e-17,1.2366690000000001e-17,0.645964,0.354036,0.074781,0.027378,0.013141,0.07525,0.079318,0.098874,0.14831,0.026439,0.061014,0.10873,0.047247,0.020338,0.006571,0.14174,0.051783,0.019086,0.064925,0.107791,0.109199,0.110138,0.060075,0.11311,0.11092,0.109355,0.10873,0.105757,0.274562,0.329474,0.395964,0.125,0.651752,0.110138,0.11311,1997.857165
std,1.000078,1.000078,1.000078,1.000078,0.478258,0.478258,0.263058,0.163195,0.113889,0.263815,0.270255,0.298516,0.355435,0.16045,0.239374,0.311324,0.212182,0.141164,0.0808,0.34881,0.221607,0.136839,0.246413,0.310141,0.311913,0.313086,0.237645,0.316752,0.314057,0.312109,0.311324,0.307551,0.446328,0.470059,0.489095,0.330745,0.476452,0.313086,0.316752,8.3923
min,-1.980409,-1.291052,-1.767529,-1.384048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1985.0
25%,-0.8075039,-0.7624234,-0.7638272,-1.384048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1987.0
50%,4.213344e-16,-0.2318711,0.03400912,0.2873737,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1999.0
75%,0.7579511,0.5596016,0.717291,0.2873737,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,2004.0
max,2.003199,5.13205,1.994559,1.958796,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2009.0


## **✅ As per .info(), the Outlet_Size column is now of a numeric dtype**
## **✅ As per .describe(), Item_Weight, Item_Visibility, Item_MRP and the ordinal-encoded Outlet_Size have been scaled (mean ≈ 0, std ≈ 1); the one-hot-encoded columns hold 0/1 values, and Outlet_Establishment_Year was passed through unscaled**

In [22]:
# Percentage of missing values per column in the processed training set.
# The share is taken over the rows of X_train_processed itself (.mean() of the
# boolean mask), not over len(df), which counts the rows of the full dataset.
X_train_processed.isna().mean() * 100

# ✅ As per .isna(), there are no null values remaining in the train set

Item_Weight                        0.0
Item_Visibility                    0.0
Item_MRP                           0.0
Outlet_Size                        0.0
Item_Fat_Content_Low Fat           0.0
Item_Fat_Content_Regular           0.0
Item_Type_Baking Goods             0.0
Item_Type_Breads                   0.0
Item_Type_Breakfast                0.0
Item_Type_Canned                   0.0
Item_Type_Dairy                    0.0
Item_Type_Frozen Foods             0.0
Item_Type_Fruits and Vegetables    0.0
Item_Type_Hard Drinks              0.0
Item_Type_Health and Hygiene       0.0
Item_Type_Household                0.0
Item_Type_Meat                     0.0
Item_Type_Others                   0.0
Item_Type_Seafood                  0.0
Item_Type_Snack Foods              0.0
Item_Type_Soft Drinks              0.0
Item_Type_Starchy Foods            0.0
Outlet_Identifier_OUT010           0.0
Outlet_Identifier_OUT013           0.0
Outlet_Identifier_OUT017           0.0
Outlet_Identifier_OUT018 

In [23]:
# Percentage of missing values per column in the processed testing set.
# The share is taken over the rows of X_test_processed itself (.mean() of the
# boolean mask), not over len(df), which counts the rows of the full dataset.
X_test_processed.isna().mean() * 100

# ✅ As per .isna(), there are no null values remaining in the test set

Item_Weight                        0.0
Item_Visibility                    0.0
Item_MRP                           0.0
Outlet_Size                        0.0
Item_Fat_Content_Low Fat           0.0
Item_Fat_Content_Regular           0.0
Item_Type_Baking Goods             0.0
Item_Type_Breads                   0.0
Item_Type_Breakfast                0.0
Item_Type_Canned                   0.0
Item_Type_Dairy                    0.0
Item_Type_Frozen Foods             0.0
Item_Type_Fruits and Vegetables    0.0
Item_Type_Hard Drinks              0.0
Item_Type_Health and Hygiene       0.0
Item_Type_Household                0.0
Item_Type_Meat                     0.0
Item_Type_Others                   0.0
Item_Type_Seafood                  0.0
Item_Type_Snack Foods              0.0
Item_Type_Soft Drinks              0.0
Item_Type_Starchy Foods            0.0
Outlet_Identifier_OUT010           0.0
Outlet_Identifier_OUT013           0.0
Outlet_Identifier_OUT017           0.0
Outlet_Identifier_OUT018 