# Import Packages

In [12]:
import pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import set_config
set_config(display='diagram')



# Reloading in the Data

In [2]:
df = pd.read_csv('Data/sales_predictions (3).csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Performing Preprocessing Steps

## Data Cleaning Before Data Spliting

In [3]:
# Drop unnecessary columns: these columns don't make impact on the prediction of the sales.

df.drop(columns = ['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'],  axis = 1, inplace = True)

In [4]:
# Drop any duplicates
df.duplicated().sum()

0

In [6]:
# Checking missing values. And display the data information.
print(df.info())
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           7060 non-null   float64
 1   Item_Fat_Content      8523 non-null   object 
 2   Item_Visibility       8523 non-null   float64
 3   Item_Type             8523 non-null   object 
 4   Item_MRP              8523 non-null   float64
 5   Outlet_Size           6113 non-null   object 
 6   Outlet_Location_Type  8523 non-null   object 
 7   Outlet_Type           8523 non-null   object 
 8   Item_Outlet_Sales     8523 non-null   float64
dtypes: float64(4), object(5)
memory usage: 599.4+ KB
None


Item_Weight             1463
Item_Fat_Content           0
Item_Visibility            0
Item_Type                  0
Item_MRP                   0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
Item_Outlet_Sales          0
dtype: int64

In [7]:
# the stats informaiton of the data.
df.describe(include='all')

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
count,7060.0,8523,8523.0,8523,8523.0,6113,8523,8523,8523.0
unique,,5,,16,,3,3,4,
top,,Low Fat,,Fruits and Vegetables,,Medium,Tier 3,Supermarket Type1,
freq,,5089,,1232,,2793,3350,5577,
mean,12.857645,,0.066132,,140.992782,,,,2181.288914
std,4.643456,,0.051598,,62.275067,,,,1706.499616
min,4.555,,0.0,,31.29,,,,33.29
25%,8.77375,,0.026989,,93.8265,,,,834.2474
50%,12.6,,0.053931,,143.0128,,,,1794.331
75%,16.85,,0.094585,,185.6437,,,,3101.2964


In [8]:
# fix any inconsistent categories of data
# filter out all the object dtype columns
dtypes = df.dtypes
cat_cols = dtypes[dtypes=="object"].index
for col in cat_cols:
  print(f'-Column={col}')
  print(df[col].value_counts(dropna=False))
  print('\n\n')

-Column=Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64



-Column=Item_Type
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64



-Column=Outlet_Size
Medium    2793
NaN       2410
Small     2388
High       932
Name: Outlet_Size, dtype: int64



-Column=Outlet_Location_Type
Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64



-Column=Outlet_Type
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3  

In [9]:
# address the inconsistence in Item_Fat_Content column.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF': 'Low Fat',
                                                        'low fat' : 'Low Fat',
                                                        'reg' : 'Regular'})
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

## Validation Split

In [10]:
# Split X and y.
X = df.drop(columns= 'Item_Outlet_Sales')
y = df['Item_Outlet_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Create a ColumnTransformer for Preprocessing

In [13]:
# Instantiate the transformers
scaler = StandardScaler()
mean_imputer = SimpleImputer(strategy='mean') # this used for dealing with the missing values in item_weight column.
freq_imputer = SimpleImputer(strategy='most_frequent') # this is used for dealing with the missing values in outlet_size column.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ordinal = OrdinalEncoder()

In [14]:
# Prepare separate processing pipelines for numeric and categorical data
num_pipe = make_pipeline(mean_imputer, scaler)
nom_pipe = make_pipeline(ohe)
ord_pipe = make_pipeline(freq_imputer, ordinal)

In [15]:
# Create ColumnSelectors for the the numeric and categorical data
nominal_cols = ['Item_Type', 'Outlet_Location_Type', 'Outlet_Type']
ordinal_cols = ['Item_Fat_Content', 'Outlet_Size']
num_selector = make_column_selector(dtype_include='number')

In [16]:
# Combine the Pipelines and ColumnSelectors into tuples for the ColumnTransformer
nom_tuple = (nom_pipe, nominal_cols)
ord_tuple = (ord_pipe, ordinal_cols)
num_tuple = (num_pipe, num_selector)

In [17]:
# Create the preprocessing ColumnTransformer
preprocessor = make_column_transformer(nom_tuple, ord_tuple, num_tuple, remainder='drop')
preprocessor

## Check the Result

In [18]:
# fit the column transformer on the training data.
preprocessor.fit(X_train)
# transform both training and testing data.
X_train_transformed = preprocessor.transform(X_train)

X_test_transformed = preprocessor.transform(X_test)

X_train_transformed.shape

(6392, 28)

In [19]:
# convert the training data to dataframe.
X_train_df = pd.DataFrame(X_train_transformed)
X_train_df.head().round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.82,-0.71,1.83
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.56,-1.29,0.6
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,-0.13,1.81,0.24
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,2.0,-1.17,-1.0,-0.95
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.53,-0.97,-0.34


In [21]:
X_train_df.isna().sum().sum()

0