In [1]:
## Pandas
import pandas as pd
## Numpy
import numpy as np

## Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
## Models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

## Regression Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

## Set global scikit-learn configuration 
from sklearn import set_config
## Display estimators as a diagram
set_config(display='diagram') # 'text' or 'diagram'}

#Function#

In [2]:
## Create a function to take the true and predicted values
## and print MAE, MSE, RMSE, and R2 metrics for a model
def model_metrics(pipe, x_train, y_train, x_test, y_test, 
                       model_name='Regression Model', ):
  ## Train
  mae = round(mean_absolute_error(y_train, pipe.predict(x_train)),4)
  mse = round(mean_squared_error(y_train, pipe.predict(x_train)),4)
  rmse = round(np.sqrt(mean_squared_error(y_train, pipe.predict(x_train))),4)
  r2 = round(r2_score(y_train, pipe.predict(x_train)),6)
  print(f'{model_name} Train Scores')
  print(f'MAE: {mae:,.4f} \nMSE: {mse:,.4f} \nRMSE: {rmse:,.4f} \nR2: {r2:.4f}\n')

  ## Test
  mae = round(mean_absolute_error(y_test, pipe.predict(x_test)),4)
  mse = round(mean_squared_error(y_test, pipe.predict(x_test)),4)
  rmse = round(np.sqrt(mean_squared_error(y_test, pipe.predict(x_test))),4)
  r2 = round(r2_score(y_test, pipe.predict(x_test)),6)

  # Display the metrics for the model
  print(f'{model_name} Test Scores')
  print(f'MAE: {mae:,.4f} \nMSE: {mse:,.4f} \nRMSE: {rmse:,.4f} \nR2: {r2:.4f}\n')

##**Load Data**

In [3]:
# Load in the data
df = pd.read_csv('/content/sales_predictions - sales_predictions (5).csv')
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [4]:
# make a copy of my database
eda_ml =df.copy()

###**Duplication**

In [5]:
# Drop rows

eda_ml.drop_duplicates(inplace=True)

In [6]:
# duplicated rows
eda_ml.duplicated().sum()

0

In [7]:
# info
eda_ml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 865.6+ KB


In [8]:
eda_ml.replace(["LF"], ["Low Fat"], inplace=True)
eda_ml.replace(["low fat"], ["Low Fat"], inplace=True)
eda_ml.replace(["reg"], ["Regular"], inplace=True)
eda_ml["Item_Fat_Content"].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [9]:
# descriptive for all collumns
eda_ml.describe(include='all')

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
count,8523,7060.0,8523,8523.0,8523,8523.0,8523,8523.0,6113,8523,8523,8523.0
unique,1559,,2,,16,,10,,3,3,4,
top,FDW13,,Low Fat,,Fruits and Vegetables,,OUT027,,Medium,Tier 3,Supermarket Type1,
freq,10,,5517,,1232,,935,,2793,3350,5577,
mean,,12.857645,,0.066132,,140.992782,,1997.831867,,,,2181.288914
std,,4.643456,,0.051598,,62.275067,,8.37176,,,,1706.499616
min,,4.555,,0.0,,31.29,,1985.0,,,,33.29
25%,,8.77375,,0.026989,,93.8265,,1987.0,,,,834.2474
50%,,12.6,,0.053931,,143.0128,,1999.0,,,,1794.331
75%,,16.85,,0.094585,,185.6437,,2004.0,,,,3101.2964


###**Split Data**

In [10]:
X = eda_ml.drop(columns = ['Item_Outlet_Sales'])
y = eda_ml['Item_Outlet_Sales']




In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [12]:
## The target is Item_Outlet_Sales which we try to predict

###**Preprocessing**

In [13]:
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')



## Transformers

In [14]:
# Imputers
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy = 'mean')
# Scaler
scaler = StandardScaler()

# one Hot encoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

- Nominal is one hot-encode
- Numeric is scaler 

###**Numeric Pipeline**

In [15]:
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

In [16]:
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe

###**Instantiate Column Transformer**

In [17]:
# Tuple for column Tranformer 
num_tuple= (numeric_pipe, num_selector)
cat_tuple = (categorical_pipe, cat_selector)

###**ColumnTransformer**

In [18]:
preprocessor = make_column_transformer(num_tuple, 
                                       cat_tuple, 
                                       remainder='passthrough')
preprocessor

###**Fit and Transform Data**

In [19]:
# fit on train 
preprocessor.fit(X_train)


In [20]:
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [21]:
## I hope that I did not have data leakage because I had not run the "Fit" comment with the preprocessor on the column transformer cell

## Data Model

In [22]:
# Create a dummy model 
dummy = DummyRegressor(strategy = 'mean')

# a pipeline model
dummy_pipe = make_pipeline(preprocessor, dummy)

# Fit the model

dummy_pipe.fit(X_train, y_train)

###**Metrics**

In [23]:
import sklearn.metrics as metrics

In [24]:
## Display model performance metrics using a function
model_metrics(dummy_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Dummy Model')

Dummy Model Train Scores
MAE: 1,360.2184 
MSE: 2,959,455.7045 
RMSE: 1,720.3069 
R2: 0.0000

Dummy Model Test Scores
MAE: 1,326.1210 
MSE: 2,772,144.4627 
RMSE: 1,664.9758 
R2: -0.0048



In [25]:
## We got very bad R2 here, because it could not be zero or negative
# On the other hand, we have a rmse function in this dataset, it give use a close range of train and test score.

###**Linear regression model**

In [26]:
# Create an instance of model 
lin_reg = LinearRegression()

# create a pipeline for regression model
lin_reg_pipe = make_pipeline(preprocessor, lin_reg)

# fit on model
lin_reg_pipe.fit(X_train, y_train)

In [27]:
# Metrics for linear regression 
model_metrics(lin_reg_pipe, x_train=X_train, y_train=y_train,
                          x_test=X_test, y_test=y_test,
                            model_name='Linear Regrssion')

Linear Regrssion Train Scores
MAE: 736.1576 
MSE: 972,365.2275 
RMSE: 986.0858 
R2: 0.6714

Linear Regrssion Test Scores
MAE: 558,742,259,972.5935 
MSE: 45,732,477,819,291,585,684,176,896.0000 
RMSE: 6,762,579,228,318.9990 
R2: -16575880497968222208.0000



In [28]:
# The R2 for train score is acceptable but not for test score becaseu the R2 can not be negative, Moreover, it could be underfited by
# reasoning of less amount of data in our dataset.
# The Rmse in the test and train scores are very far aprt from each others, but both keep the same units

#**Regressions' trees model to predict sales**

###**Decision Tree**

In [29]:
## Import the model for regressor
from sklearn.tree import DecisionTreeRegressor

In [30]:
## Create a decsing tree
dec_tree = DecisionTreeRegressor()

## Pipeline

dec_tree_pipe = make_pipeline(preprocessor, dec_tree)

# Fit the model
dec_tree_pipe.fit(X_train, y_train)

In [31]:
# Metrics for Decision tree regressor
model_metrics(dec_tree_pipe, x_train=X_train, y_train=y_train,
                            x_test=X_test, y_test=y_test,
                              model_name='Decision Tree model')

Decision Tree model Train Scores
MAE: 0.0000 
MSE: 0.0000 
RMSE: 0.0000 
R2: 1.0000

Decision Tree model Test Scores
MAE: 996.2008 
MSE: 2,143,795.5852 
RMSE: 1,464.1706 
R2: 0.2230



In [32]:
# We got overfitted model for the reason of R2 based on the test and train set.
# Rmse is zero for the train score but it is not negative which is very effective on our dataset, also the error that happened on the train score
# did not effect on test score 

####**1.Bagged Tree model**

In [33]:
## Import the model
from sklearn.ensemble import BaggingRegressor

In [34]:
# Create a bagging model
bag_tree = BaggingRegressor()

# pipeline model
bag_tree_pipe = make_pipeline(preprocessor, bag_tree)

# fit the model

bag_tree_pipe.fit(X_train, y_train)

In [35]:
## Metrics for bagged Tree
model_metrics(bag_tree_pipe, x_train=X_train, y_train=y_train,
                            x_test=X_test, y_test=y_test,
                              model_name='Bagged Tree Model')

Bagged Tree Model Train Scores
MAE: 317.2069 
MSE: 235,784.6293 
RMSE: 485.5766 
R2: 0.9203

Bagged Tree Model Test Scores
MAE: 792.3817 
MSE: 1,305,845.2042 
RMSE: 1,142.7358 
R2: 0.5267



In [36]:
# We have high variance data here which is overfitted based on the train and test score. Train's R2 is 0.91 and the test's R2 is 0.51.
# Rmse is acceptable here since it's giving us the average predicted values for the test and train data set.

####**2.Random forest model**

In [37]:
## random forest model 
from sklearn.ensemble import RandomForestRegressor

In [38]:
# create a model 
ran_for = RandomForestRegressor()

# pipeline

ran_for_pipe = make_pipeline(preprocessor, ran_for)

# fit my model 

ran_for_pipe.fit(X_train, y_train)

In [39]:
## metrics
model_metrics(ran_for_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Random Forest Model')

Random Forest Model Train Scores
MAE: 298.2228 
MSE: 182,861.8997 
RMSE: 427.6235 
R2: 0.9382

Random Forest Model Test Scores
MAE: 770.4528 
MSE: 1,235,359.7329 
RMSE: 1,111.4674 
R2: 0.5522



In [40]:
## Based on R2 we got overfitted which is high variance data.
##Rmse is acceptable since it is giving us the average of the values and it is not negative  

####**3.K-nearest neighbors model**

In [41]:
## model for the K nearst
from sklearn.neighbors import KNeighborsRegressor

In [44]:
## Create model for kNeighbots
knn = KNeighborsRegressor()


# create a pipeline
knn_pipe = make_pipeline(preprocessor, knn)

## fit my model
knn_pipe.fit(X_train, y_train)

###**I prefer the RMSE model since it gives us a better number in the case of the average than R2, also RMSE does not change the values and never gives us negative results.**