<a href="https://colab.research.google.com/github/Souha-Kabtni/Food-Sales-Predictions/blob/main/Project_1_Part_5_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1st: mount Google Drive at /content/drive so files stored there can be read from this Colab session
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# 2nd: Import the packages to be used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames from transform()
set_config(transform_output='pandas')

In [3]:
# 3rd: Read the file with pandas
# The CSV location is kept in one named constant so it is easy to spot and change.
DATA_PATH = '/content/drive/MyDrive/Coding_Dojo/Stack 2/Core assignments/sales_predictions_2023.csv'
df = pd.read_csv(DATA_PATH)

# Column dtypes and non-null counts, followed by a preview of the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## 1/ Before splitting your data, you can drop duplicates and fix inconsistencies in categorical data. (There is a way to do this after the split, but for this project, **you may perform this step before the split**.)

In [4]:
# Rows flagged by duplicated() are exact copies of an earlier row;
# an empty result means the dataset has no duplicate rows.
duplicate_rows = df.loc[df.duplicated()]
duplicate_rows
# There are no duplicate rows :)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales


In [5]:
# Percentage of missing values per column
missing_pct = df.isnull().sum().div(len(df)).mul(100)
missing_pct
# Missing values are left untouched here: since we are to perform Machine Learning,
# they are imputed later, inside the preprocessing pipelines :)

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64

### **❌ Find and fix any inconsistent categories of data (example: fix cat, Cat, and cats so that they are consistent)**

In [6]:
# Locate my Object columns: start from the dtype of every column in df
df_types = df.dtypes
df_types

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [7]:
# Extract my Object columns only: boolean-filter the dtype Series on the "O" (object) dtype
str_cols = df_types[df_types=="O"]
str_cols

Item_Identifier         object
Item_Fat_Content        object
Item_Type               object
Outlet_Identifier       object
Outlet_Size             object
Outlet_Location_Type    object
Outlet_Type             object
dtype: object

In [8]:
# The index of the filtered dtype Series holds the names of the object columns
str_cols.index

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [9]:
# Preview the first rows of the object (string) columns only
df.loc[:, str_cols.index].head()

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDA15,Low Fat,Dairy,OUT049,Medium,Tier 1,Supermarket Type1
1,DRC01,Regular,Soft Drinks,OUT018,Medium,Tier 3,Supermarket Type2
2,FDN15,Low Fat,Meat,OUT049,Medium,Tier 1,Supermarket Type1
3,FDX07,Regular,Fruits and Vegetables,OUT010,,Tier 3,Grocery Store
4,NCD19,Low Fat,Household,OUT013,High,Tier 3,Supermarket Type1


In [10]:
# Keep only the names of the object (string) columns.
# The names are derived directly from df rather than from str_cols itself, so this
# cell is idempotent: re-running it always yields the same Index instead of failing
# once str_cols no longer has an .index attribute to read.
str_cols = df.dtypes[df.dtypes == "O"].index

In [11]:
# Display str_cols to confirm it now holds the object column names
str_cols

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [12]:
# Print the frequency of every distinct value (NaN included) for each object column,
# so that inconsistent spellings of the same category become visible.
for col in str_cols:
    counts = df[col].value_counts(dropna=False)
    print(f"{col}:")
    print(counts)
    print("\n\n")

Item_Identifier:
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64



Item_Fat_Content:
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64



Item_Type:
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64



Outlet_Identifier:
OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT018    928
OUT017    926
OUT01

**❎ Found that Item_Fat_Content column contains both**

1. 'Low Fat', 'LF' and 'low fat', which mean the same thing but are written differently and are therefore read as different values by the program
2. 'Regular' and 'reg', which mean the same thing but are written differently and are therefore read as different values by the program

=> Replacing 'LF' and 'low fat' with 'Low Fat', and 'reg' with 'Regular'

In [13]:
# Standardise the inconsistent Item_Fat_Content labels.
# The mapping is applied to the Item_Fat_Content column only, so an identical string
# in any other column can never be rewritten by accident, and the column is
# reassigned instead of mutating the whole DataFrame in place.
fat_content_map = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_map)

In [14]:
# Verify the fix: count the remaining distinct Item_Fat_Content labels
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [15]:
# Re-check the shape, dtypes and non-null counts after the cleaning step
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


✅ The dataset is of shape: 8523 rows and 12 columns

✅ There is a mixture of datatypes:

7 of type object

4 of type float

1 of type integer

# 2/ Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix.

In [16]:
# The target is the column to be predicted; every other column is kept in the features matrix.
target_col = 'Item_Outlet_Sales'
X = df.drop(columns=target_col)  # X: all columns except the target
y = df[target_col]               # y: the to-be-predicted variable

# 3/ Perform a train test split

In [17]:
# The instructions did not specify the train/test proportions, so the default
# split is used (75% train / 25% test); it is written out explicitly for clarity.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 4/ Create a preprocessing object to prepare the dataset for Machine Learning

In [18]:
# Percentage of missing values per feature (the denominator is taken from X itself).
# Item_Weight and Outlet_Size contain missing values, so both must be imputed
# by the preprocessing object built below.
X.isnull().sum().div(len(X)).mul(100)

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
dtype: float64

In [19]:
# Based on the null check, Item_Weight is the numeric column with missing values that need to be imputed
numerical_col = ['Item_Weight']

# 2/ a/ Impute null values using SimpleImputer with the 'median' strategy
impute_median = SimpleImputer(strategy = 'median')
# The median is chosen because it is less sensitive to skew than the mean.
# NOTE(review): the original note reported mean (12.85) > median (12.6) for Item_Weight and
# called that a left skew; mean > median would rather suggest a slight right skew - confirm with a histogram.

# 3/ Instantiate the Pipeline Using Transformers (imputation is its only step; no scaler is applied here)
numerical_pipe = make_pipeline(impute_median)

# 4/ Create a tuple for the transformer with: the name, the transformer object, and the list of columns
num_tuple = ('numeric', numerical_pipe, numerical_col)

In [20]:
# Based on the null check, Outlet_Size is the categorical (ordinal) column with missing values that need to be imputed
categorical_col = ['Outlet_Size']
# 2/ a/ Impute null values using SimpleImputer with the 'most_frequent' strategy
impute_most_frequent = SimpleImputer(strategy='most_frequent')

# Order of the categories for the ordinal Outlet_Size column
Outlet_Size_order = ['Small', 'Medium', 'High']
# OrdinalEncoder takes a list of category lists, one per encoded column
ordinal_category_orders = [Outlet_Size_order]

# 2/ b/ Ordinal encode the ordinal feature using OrdinalEncoder with the explicit category order
ordinal_encoder = OrdinalEncoder(categories=ordinal_category_orders)

# 2/ c/ Scale the encoded ordinal feature using StandardScaler
scaler_ordinal = StandardScaler()

# 3/ Instantiate the Pipeline Using Transformers: impute -> ordinal encode -> scale
categorical_pipe = make_pipeline(impute_most_frequent, ordinal_encoder, scaler_ordinal)

# 4/ Create a tuple for the transformer with: the name, the transformer object, and the list of columns
categorical_tuple = ('categorical', categorical_pipe, categorical_col)

In [21]:
# 5. Use the tuples to create a ColumnTransformer to preprocess the data.
# remainder='drop' is the default and is written out to make it visible that every
# column not listed in the two tuples is discarded from the transformed output.
# verbose_feature_names_out=False keeps the original column names (no transformer-name prefix).
col_transformer = ColumnTransformer(
    transformers=[num_tuple, categorical_tuple],
    remainder='drop',
    verbose_feature_names_out=False,
)
col_transformer

In [22]:
# Fit the ColumnTransformer on the training data only (the test data is not used for fitting)
col_transformer.fit(X_train)

In [23]:
# Transform the training data with the fitted ColumnTransformer
X_train_processed = col_transformer.transform(X_train)

# Transform the testing data with the same fitted ColumnTransformer
X_test_processed = col_transformer.transform(X_test)

In [24]:
# Inspect the processed training data: dtypes and non-null counts first, then summary statistics

X_train_processed.info()
X_train_processed.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Item_Weight  6392 non-null   float64
 1   Outlet_Size  6392 non-null   float64
dtypes: float64(2)
memory usage: 149.8 KB


Unnamed: 0,Item_Weight,Outlet_Size
count,6392.0,6392.0
mean,12.86039,1.2366690000000001e-17
std,4.217456,1.000078
min,4.555,-1.384048
25%,9.5,-1.384048
50%,12.65,0.2873737
75%,16.1,0.2873737
max,21.35,1.958796


## **✅ As per .info(), the Outlet_Size column is now of a numeric dtype and neither column has missing values left**
## **✅ As per .describe(), Outlet_Size has been ordinal-encoded and scaled (mean ≈ 0, std ≈ 1), while Item_Weight was only median-imputed and keeps its original scale (4.555 to 21.35); no column was one-hot encoded**