# Import the necessary libraries

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns


from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
import warnings
# NOTE(review): with no category argument this ignores every warning for the
# rest of the session, including pandas parsing/deprecation warnings.
# Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Exploring the dataset

## 1. Freezer data

In [53]:
# Load the freezer catalogue and preview its first five rows.
freezer_data_df = pd.read_csv(filepath_or_buffer="./dataset/freezer_data.csv")
freezer_data_df.head(n=5)

Unnamed: 0,Freezer Name,Model Number,Volume Capacity (Liters),Power and maitainance Cost (LKR) per 100 hours
0,ChillMaster,M001,55,2500
1,FreezeZone,M002,75,2800
2,CoolTech,M003,120,3200
3,mini CoolTech,M004,30,1800
4,IceBlast Pro,M005,150,3500


So there are 10 types of freezers. We need to cluster the shops and recommend the best freezer for each cluster.

In [54]:
# (rows, columns) of the freezer table.
freezer_data_df.shape

(10, 4)

In [55]:
# Show up to the first 10 rows of the freezer table.
freezer_data_df.head(10)

Unnamed: 0,Freezer Name,Model Number,Volume Capacity (Liters),Power and maitainance Cost (LKR) per 100 hours
0,ChillMaster,M001,55,2500
1,FreezeZone,M002,75,2800
2,CoolTech,M003,120,3200
3,mini CoolTech,M004,30,1800
4,IceBlast Pro,M005,150,3500
5,IceBlast,M006,65,2700
6,FreezeTech,M007,100,3000
7,FreezeTech 3.0,M008,25,1600
8,ChillTech,M009,80,2800
9,IceTech,M010,200,4500


In [76]:
# Count missing values per column in the freezer table.
freezer_data_df.isna().sum()

Freezer Name                                      0
Model Number                                      0
Volume Capacity (Liters)                          0
Power and maitainance Cost (LKR) per 100 hours    0
dtype: int64

## 2. Outlet Area data

In [56]:
# Load the outlet floor-area table and preview its first five rows.
outlets_area_df = pd.read_csv(filepath_or_buffer="./dataset/outlets_data.csv")
outlets_area_df.head(n=5)

Unnamed: 0,Outlet_ID,area(sqft)
0,ID7203,5660
1,ID1878,4138
2,ID1740,5546
3,ID4366,5134
4,ID8358,4100


In [57]:
# (rows, columns) of the outlet-area table.
outlets_area_df.shape

(988, 2)

In [58]:
# Number of distinct outlet IDs; a value below the row count means some IDs repeat.
outlets_area_df['Outlet_ID'].nunique()

951

In [59]:
# Collect every Outlet_ID that occurs more than once in the area table.
# `duplicated()` flags the second and later occurrences of an ID, so `unique()`
# yields each repeated ID exactly once.
duplicated_ids = (
    outlets_area_df["Outlet_ID"]
    .loc[lambda ids: ids.duplicated()]
    .unique()
    .tolist()
)

In [60]:
# For each repeated outlet ID, print all of its rows so the differing
# area values can be compared, followed by a separator line.
for dup_id in duplicated_ids:
    print(
        f"Outlet ID: {dup_id}",
        outlets_area_df.loc[outlets_area_df["Outlet_ID"].eq(dup_id)],
        "-" * 20,
        sep="\n",
    )

Outlet ID: ID8278
    Outlet_ID  area(sqft)
320    ID8278        3600
384    ID8278        4980
--------------------
Outlet ID: ID8631
    Outlet_ID  area(sqft)
25     ID8631        4024
406    ID8631        3196
--------------------
Outlet ID: ID4003
    Outlet_ID  area(sqft)
77     ID4003        4841
444    ID4003        2990
--------------------
Outlet ID: ID1365
    Outlet_ID  area(sqft)
98     ID1365        5823
494    ID1365        1291
--------------------
Outlet ID: ID2170
    Outlet_ID  area(sqft)
310    ID2170        8077
525    ID2170        6169
--------------------
Outlet ID: ID1485
    Outlet_ID  area(sqft)
292    ID1485        4535
537    ID1485        9483
--------------------
Outlet ID: ID9465
    Outlet_ID  area(sqft)
441    ID9465        1583
563    ID9465        6650
--------------------
Outlet ID: ID6214
    Outlet_ID  area(sqft)
232    ID6214        1608
564    ID6214        4052
--------------------
Outlet ID: ID3549
    Outlet_ID  area(sqft)
415    ID3549       

In [77]:
# Count missing values per column in the outlet-area table.
outlets_area_df.isna().sum()

Outlet_ID     0
area(sqft)    0
dtype: int64

## 3. Product data

In [61]:
# Load the product catalogue and preview its first five rows.
product_data_df = pd.read_csv(filepath_or_buffer="./dataset/product_data.csv")
product_data_df.head(n=5)

Unnamed: 0,pid,volume,product_name,price
0,IP1,0.2,Vanilla Mini Cone,100
1,IP2,0.2,Chocolate Petite Bar,90
2,IP3,0.2,Strawberry Tiny Cup,110
3,IP4,0.2,Mint Bite-size Pop,100
4,IP5,0.2,Butter Pecan Small Stick,100


In [62]:
# (rows, columns) of the product table.
product_data_df.shape

(10, 4)

In [63]:
# Show up to the first 10 rows of the product table.
product_data_df.head(10)

Unnamed: 0,pid,volume,product_name,price
0,IP1,0.2,Vanilla Mini Cone,100
1,IP2,0.2,Chocolate Petite Bar,90
2,IP3,0.2,Strawberry Tiny Cup,110
3,IP4,0.2,Mint Bite-size Pop,100
4,IP5,0.2,Butter Pecan Small Stick,100
5,BP1,1.0,Chocolate Fudge Big Scoop,1200
6,BP2,1.0,Moose Tracks Jumbo Cup,1100
7,BP3,1.0,Mint Chocolate Chip Family Pack,1150
8,BP4,1.0,Rocky Road Mega Tub,1000
9,BP5,1.0,Cookies and Cream Mega Tub,1100


In [78]:
# Count missing values per column in the product table.
product_data_df.isna().sum()

pid             0
volume          0
product_name    0
price           0
dtype: int64

## 4. Sales data

In [64]:
# Load the sales records and preview the first five rows.
sales_data_df = pd.read_csv(filepath_or_buffer="./dataset/sales_data.csv")
sales_data_df.head(n=5)

Unnamed: 0,Outlet_ID,week,pid,product_name,no_units
0,ID7203,1,IP1,Vanilla Mini Cone,874.0
1,ID7203,1,IP2,Chocolate Petite Bar,105.0
2,ID7203,1,IP3,Strawberry Tiny Cup,1198.0
3,ID7203,1,IP4,Mint Bite-size Pop,502.0
4,ID7203,1,IP5,Butter Pecan Small Stick,710.0


In [65]:
# (rows, columns) of the sales table.
sales_data_df.shape

(81000, 5)

In [66]:
# Number of distinct outlet IDs that appear in the sales records.
sales_data_df['Outlet_ID'].nunique()

951

In [67]:
# Number of distinct product names that appear in the sales records.
sales_data_df['product_name'].nunique()

10

In [68]:
# Number of distinct week values that appear in the sales records.
sales_data_df['week'].nunique()

12

In [69]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
sales_data_df.describe()

Unnamed: 0,week,no_units
count,81000.0,81000.0
mean,6.504432,373.143543
std,3.453426,370.546966
min,1.0,20.0
25%,4.0,79.0
50%,7.0,210.0
75%,10.0,593.0
max,12.0,1499.0


In [79]:
# Count missing values per column in the sales table.
sales_data_df.isna().sum()

Outlet_ID       0
week            0
pid             0
product_name    0
no_units        0
dtype: int64

## 5. Week data

In [70]:
# Load the week calendar (week label with start and end dates) and preview five rows.
week_data_df = pd.read_csv(filepath_or_buffer="./dataset/week_data.csv")
week_data_df.head(n=5)

Unnamed: 0,Week,Start Date,End Date
0,Week 1,1/2/2023,1/8/2023
1,Week 2,1/9/2023,1/15/2023
2,Week 3,1/16/2023,1/22/2023
3,Week 4,1/23/2023,1/29/2023
4,Week 5,1/30/2023,2/5/2023


In [71]:
# (rows, columns) of the week calendar table.
week_data_df.shape

(12, 3)

In [72]:
# Show up to the first 12 rows of the week calendar.
week_data_df.head(12)

Unnamed: 0,Week,Start Date,End Date
0,Week 1,1/2/2023,1/8/2023
1,Week 2,1/9/2023,1/15/2023
2,Week 3,1/16/2023,1/22/2023
3,Week 4,1/23/2023,1/29/2023
4,Week 5,1/30/2023,2/5/2023
5,Week 6,2/6/2023,2/12/2023
6,Week 7,2/13/2023,2/19/2023
7,Week 8,2/20/2023,2/26/2023
8,Week 9,2/27/2023,3/5/2023
9,Week 10,3/6/2023,3/12/2023


### Note:

We have data for three months, <br>
including weekly sales details. <br>

Extract the day and the month from each week's dates and add them to the sales data. <br>
Then we can merge the result with the other dataframes. <br> 

There are some special days in and around these three months. <br>
- 1st Jan 2023 (note: this is one day before Week 1 starts on 2nd Jan 2023)
- 15th Jan 2023
- 14th Feb 2023

### Extracting the day and the month from the date

In [73]:
# Inspect the column dtypes; the date columns are still plain strings (object) at this point.
week_data_df.dtypes

Week          object
Start Date    object
End Date      object
dtype: object

In [74]:
# Parse 'Start Date' into datetimes. The raw values are month/day/year strings
# (e.g. "1/2/2023" is 2 January 2023), so the format is stated explicitly
# rather than left to pandas' format inference.
week_data_df['Start Date'] = pd.to_datetime(week_data_df['Start Date'], format='%m/%d/%Y')

In [75]:
# Parse 'End Date' into datetimes. The raw values are month/day/year strings
# (e.g. "1/15/2023"), so the format is stated explicitly rather than inferred.
week_data_df['End Date'] = pd.to_datetime(week_data_df['End Date'], format='%m/%d/%Y')

# Store the calendar month of each week's End Date in a new column.
# A week that straddles two months (e.g. 30 Jan - 5 Feb) is therefore
# attributed to the later month.
week_data_df['month_of_transaction'] = week_data_df['End Date'].dt.month

week_data_df.head()

Unnamed: 0,Week,Start Date,End Date,month_of_transaction
0,Week 1,2023-01-02,2023-01-08,1
1,Week 2,2023-01-09,2023-01-15,1
2,Week 3,2023-01-16,2023-01-22,1
3,Week 4,2023-01-23,2023-01-29,1
4,Week 5,2023-01-30,2023-02-05,2
