<a href="https://colab.research.google.com/github/TopData530/Sales-Prediction/blob/main/ML_Sales_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Sales Prediction in Machine Learning
##Eric Saechao

In [118]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn import set_config
set_config(display='diagram')

In [119]:
df = pd.read_csv('/content/sales_predictions (1).csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [120]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


All missing values handled

In [122]:
df.duplicated().sum()

0

##Identify the features (X) and target (y): Assign the "Item_Outlet_Sales" column as your target and the rest of the relevant variables as your features matrix

In [123]:
#Dropping irrelavent features
X = df.drop(columns={'Item_Identifier', 'Item_Fat_Content', 'Item_Visibility',
                     'Outlet_Location_Type','Item_Weight', 'Outlet_Size'})
y = df['Item_Outlet_Sales']

Target assigned to Item_Outlet_Sales

In [124]:
#Performed a train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.shape

(6392, 6)

Train test split applied

In [125]:
#Checking new columns after converting data
display(X_train.info())
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Type                  6392 non-null   object 
 1   Item_MRP                   6392 non-null   float64
 2   Outlet_Identifier          6392 non-null   object 
 3   Outlet_Establishment_Year  6392 non-null   int64  
 4   Outlet_Type                6392 non-null   object 
 5   Item_Outlet_Sales          6392 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 349.6+ KB


None

Unnamed: 0,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Type,Item_Outlet_Sales
4776,Household,256.4646,OUT018,2009,Supermarket Type2,515.3292
7510,Snack Foods,179.766,OUT018,2009,Supermarket Type2,3056.022
5828,Meat,157.2946,OUT049,1999,Supermarket Type1,1577.946
5327,Baking Goods,82.325,OUT035,2004,Supermarket Type1,1331.6
4810,Frozen Foods,120.9098,OUT045,2002,Supermarket Type1,1687.1372


Data is correctly split and ready for preprocessing

##Create a preprocessing object to prepare the dataset for Machine Learning

In [126]:
#Instatiating selectors
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [127]:
#Adding in SimpleImputers, OneHotEncoder and Scaler
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

Imputers, Scalers and OneHotEncoders applied to data set

In [129]:
#Creating numerical pipeline
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

In [130]:
#Creating categorical pipeline
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe

In [131]:
#Combining pipelines with column transformation
num_tuple = (numeric_pipe, num_selector)
cat_tuple = (categorical_pipe, cat_selector)
preprocessor = make_column_transformer(num_tuple, cat_tuple)
preprocessor.fit(X_train)

Preprocess complete and ready to transform

In [132]:
#Transforming final object and displaying results
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)
X_train_processed

array([[ 1.82810922,  1.32784893, -0.98508597, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60336888,  1.32784893,  0.49179752, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.24454056,  0.13618724, -0.36739571, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.52302674,  0.49368575,  2.28758876, ...,  1.        ,
         0.        ,  0.        ],
       [-0.38377708,  1.0895166 , -0.32559712, ...,  1.        ,
         0.        ,  0.        ],
       [-0.73836105, -0.10214509, -0.72345776, ...,  1.        ,
         0.        ,  0.        ]])

Data is Processed and ready for model