<a href="https://colab.research.google.com/github/Richard-Shimada/food-sales-predictions/blob/main/Project_1_Part_5_(Core)_RS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
#import libraries and load data
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')

# Read the raw sales CSV once. `df` stays untouched as the pristine original;
# all cleaning below is done on the working copy `sales_df`.
df = pd.read_csv(
    '/content/drive/MyDrive/Coding Dojo 2022/05 Week 5: Intro to Machine Learning and Pre-Processing/sales_predictions_original.csv'
)
sales_df = df.copy()
# Preview the first rows of the working copy.
sales_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [69]:
# Count fully duplicated rows (every column equal to an earlier row).
# A result of 0 means there is nothing to drop.
sales_df.duplicated(keep='first').sum()



0

In [70]:
# Item_Fat_Content is recorded with inconsistent labels ('Low Fat', 'LF', 'low fat',
# 'Regular', 'reg'). Collapse them into a single binary code: 0 = low fat, 1 = regular.
# The column is ordinal-encoded on the assumption that the average consumer is health
# conscious where possible; one-hot encoding would also be defensible, since neither
# level is inherently better than the other for predicting sales.
FAT_CONTENT_CODES = {'Low Fat': 0, 'LF': 0, 'low fat': 0, 'Regular': 1, 'reg': 1}

# Assign the result back instead of calling `.replace(..., inplace=True)` on the column
# selection: that form is chained assignment, which warns in pandas 2.x and does not
# update `sales_df` when Copy-on-Write is enabled.
# Values that are not keys of the mapping (e.g. the 0/1 codes from a previous run of this
# cell) pass through unchanged, so the cell is safe to re-run.
# The explicit int64 cast keeps the column numeric without relying on implicit
# downcasting, and raises if an unexpected label or a missing value is present.
sales_df['Item_Fat_Content'] = (
    sales_df['Item_Fat_Content']
    .map(lambda label: FAT_CONTENT_CODES.get(label, label))
    .astype('int64')
)
sales_df['Item_Fat_Content'].value_counts()

0    5517
1    3006
Name: Item_Fat_Content, dtype: int64

In [71]:
# Number of distinct outlets in the data (missing values are not counted).
sales_df['Outlet_Identifier'].nunique(dropna=True)

10

In [72]:
# Split the frame into the features matrix (X) and the target vector (y).
# Target: Item_Outlet_Sales.
# Item_Identifier and Outlet_Identifier are left out of the features because they are
# only IDs and are not expected to have any predictive power.
y = sales_df['Item_Outlet_Sales']
X = sales_df.drop(columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])

In [73]:
# Hold out a test set; the fixed random_state makes the split reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, random_state=42)

In [74]:
# Column selectors for the preprocessing object: they pick columns by dtype,
# so the numeric and the object (string) columns can be routed to different pipelines.
numeric_columns = make_column_selector(dtype_include=np.number)
categorical_columns = make_column_selector(dtype_include=object)

In [75]:
# Missing values per feature in the training split.
# Item_Weight and Outlet_Size contain nulls that must be imputed.
X_train.isnull().sum()

Item_Weight                  1107
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Establishment_Year       0
Outlet_Size                  1812
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [76]:

#X_train['Item_Weight'].nunique()
#Item_Weight is continuous and has many distinct values, so its missing values will be imputed with the mean.

#Among the rows where Outlet_Size is not null, Outlet_Location_Type 'Tier 2' appears only with Outlet_Size 'Small'; the same is true for Outlet_Type 'Grocery Store'.
#Every row where Outlet_Size is null is either Tier 2 or a Grocery Store, so the missing Outlet_Size values will be imputed as 'Small'.
#X_train['Outlet_Size'].value_counts()
#X_train.groupby(['Outlet_Size','Outlet_Location_Type','Outlet_Type'],dropna=False)['Item_MRP'].sum()



In [77]:
# Instantiate the imputers, the scaler and the one-hot encoder used by the pipelines.
# Imputation: 'mean' strategy for Item_Weight, constant 'Small' for Outlet_Size.
constant_imputer = SimpleImputer(strategy='constant', fill_value='Small')
mean_imputer = SimpleImputer(strategy='mean')
# Scaler
scaler = StandardScaler()
# One-hot encoder returning a dense array; categories not seen during fit are ignored
# instead of raising.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4 removed the
# old name, so try the current spelling first and fall back for older installations.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [78]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipeline = make_pipeline(
    mean_imputer,
    scaler,
)
numeric_pipeline

In [79]:
# Categorical branch: fill missing values with the constant 'Small', then one-hot encode.
categorical_pipeline = make_pipeline(
    constant_imputer,
    ohe,
)
categorical_pipeline

In [80]:
# Pair each pipeline with the selector that chooses its columns.
category_tuple = (categorical_pipeline, categorical_columns)
number_tuple = (numeric_pipeline, numeric_columns)
# Combine both branches into one ColumnTransformer; the numeric branch is listed first,
# so its output columns come first in the transformed array.
preprocessor = make_column_transformer(
    number_tuple,
    category_tuple,
)
preprocessor

In [81]:
# Fit the column transformer on the training split only and transform it in the same
# step, then apply the already-fitted transformer to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [82]:
# Sanity checks on the processed NumPy arrays: no NaNs left, a single numeric dtype,
# and the expected shape.
train_nan_count = np.isnan(X_train_processed).sum()
test_nan_count = np.isnan(X_test_processed).sum()
print(train_nan_count, 'missing values in training data')
print(test_nan_count, 'missing values in testing data')
print('\n')
print('All data in X_train_processed are', X_train_processed.dtype)
print('All data in X_test_processed are', X_test_processed.dtype)
print('\n')
print('shape of data is', X_train_processed.shape)
print('\n')
# Display the processed training array as the cell output.
X_train_processed

0 missing values in training data
0 missing values in testing data


All data in X_train_processed are float64
All data in X_test_processed are float64


shape of data is (6392, 31)




array([[ 0.81724868, -0.7403206 , -0.71277507, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.5563395 ,  1.35076614, -1.29105225, ...,  0.        ,
         1.        ,  0.        ],
       [-0.13151196,  1.35076614,  1.81331864, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.11373638, -0.7403206 , -0.92052713, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.76600931, -0.7403206 , -0.2277552 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.81724868, -0.7403206 , -0.95867683, ...,  1.        ,
         0.        ,  0.        ]])