# Chocolate Dataset Exploration

## Import & Setup

In [5]:
import pandas as pd
import torch
import numpy as np

# Load the raw chocolate sales records from CSV and preview the first rows.
df = pd.read_csv(
    'data/chocolate_sales.csv',
)
df.head(n=5)

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04-Jan-22,"$5,320",180
1,Van Tuxwell,India,85% Dark Bars,01-Aug-22,"$7,896",94
2,Gigi Bohling,India,Peanut Butter Cubes,07-Jul-22,"$4,501",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27-Apr-22,"$12,726",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24-Feb-22,"$13,685",184


## Data Preprocessing

In [8]:
# Per-column flag: True when the column contains at least one missing value.
df.isnull().any(axis=0)

Sales Person     False
Country          False
Product          False
Date             False
Amount           False
Boxes Shipped    False
dtype: bool

There are no NaN values (the dataset is already clean).

In [31]:
# Work on an independent copy so the raw frame `df` stays untouched.
df_encoded = df.copy(deep=True)

In [32]:
# Parse strings such as '04-Jan-22' (day, abbreviated month name, 2-digit year)
# into proper datetime values.
df_encoded['Date'] = pd.to_datetime(
    df_encoded['Date'],
    format='%d-%b-%y',
)

In [33]:
# Derive calendar features from the parsed Date column. The keyword order
# fixes the order in which the new columns are appended.
df_encoded = df_encoded.assign(
    Day=lambda frame: frame['Date'].dt.day,
    Month=lambda frame: frame['Date'].dt.month,
    Year=lambda frame: frame['Date'].dt.year,
    Weekday=lambda frame: frame['Date'].dt.day_name(),
)

In [34]:
# Display the frame to confirm the Day/Month/Year/Weekday columns were added.
df_encoded

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped,Day,Month,Year,Weekday
0,Jehu Rudeforth,UK,Mint Chip Choco,2022-01-04,"$5,320",180,4,1,2022,Tuesday
1,Van Tuxwell,India,85% Dark Bars,2022-08-01,"$7,896",94,1,8,2022,Monday
2,Gigi Bohling,India,Peanut Butter Cubes,2022-07-07,"$4,501",91,7,7,2022,Thursday
3,Jan Morforth,Australia,Peanut Butter Cubes,2022-04-27,"$12,726",342,27,4,2022,Wednesday
4,Jehu Rudeforth,UK,Peanut Butter Cubes,2022-02-24,"$13,685",184,24,2,2022,Thursday
...,...,...,...,...,...,...,...,...,...,...
1089,Karlen McCaffrey,Australia,Spicy Special Slims,2022-05-17,"$4,410",323,17,5,2022,Tuesday
1090,Jehu Rudeforth,USA,White Choc,2022-06-07,"$6,559",119,7,6,2022,Tuesday
1091,Ches Bonnell,Canada,Organic Choco Syrup,2022-07-26,$574,217,26,7,2022,Tuesday
1092,Dotty Strutley,India,Eclairs,2022-07-28,"$2,086",384,28,7,2022,Thursday


In [35]:
# True only if every record is dated 2022, i.e. Year carries no information.
df_encoded['Year'].eq(2022).all()

True

In [36]:
# Year is constant (all rows are 2022) and Date has already been split into
# Day/Month/Weekday, so both columns are dropped.
# Reassign the result instead of using inplace=True: it offers no performance
# benefit, prevents method chaining, and hides state changes between cells.
df_encoded = df_encoded.drop(columns=['Year', 'Date'])

In [None]:
# One-hot encode the calendar features as 0/1 integer columns.
# TODO: verify that every expected dummy column was actually generated.
df_encoded = pd.get_dummies(
    df_encoded,
    columns=['Day', 'Month', 'Weekday'],
    prefix=['Day', 'Month', 'Weekday'],
    dtype=int,
)

In [38]:
# Preview the first rows after one-hot encoding.
df_encoded.head(n=5)

Unnamed: 0,Sales Person,Country,Product,Amount,Boxes Shipped,Day_1,Day_2,Day_3,Day_4,Day_5,...,Month_4,Month_5,Month_6,Month_7,Month_8,Weekday_Friday,Weekday_Monday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,Jehu Rudeforth,UK,Mint Chip Choco,"$5,320",180,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,Van Tuxwell,India,85% Dark Bars,"$7,896",94,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,Gigi Bohling,India,Peanut Butter Cubes,"$4,501",91,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,Jan Morforth,Australia,Peanut Butter Cubes,"$12,726",342,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,Jehu Rudeforth,UK,Peanut Butter Cubes,"$13,685",184,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [40]:
# Strip the currency formatting ('$' and the thousands separator ',') from
# Amount and convert the result to integers, e.g. "$5,320" -> 5320.
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged; in a normal string literal '\$' is an invalid escape sequence,
# which newer Python versions warn about.
df_encoded['Amount'] = df_encoded['Amount'].replace(r'[\$,]', '', regex=True).astype(int)

In [41]:
# Display the frame to confirm Amount now holds plain integers.
df_encoded

Unnamed: 0,Sales Person,Country,Product,Amount,Boxes Shipped,Day_1,Day_2,Day_3,Day_4,Day_5,...,Month_4,Month_5,Month_6,Month_7,Month_8,Weekday_Friday,Weekday_Monday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,Jehu Rudeforth,UK,Mint Chip Choco,5320,180,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,Van Tuxwell,India,85% Dark Bars,7896,94,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,Gigi Bohling,India,Peanut Butter Cubes,4501,91,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,Jan Morforth,Australia,Peanut Butter Cubes,12726,342,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,Jehu Rudeforth,UK,Peanut Butter Cubes,13685,184,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089,Karlen McCaffrey,Australia,Spicy Special Slims,4410,323,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1090,Jehu Rudeforth,USA,White Choc,6559,119,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1091,Ches Bonnell,Canada,Organic Choco Syrup,574,217,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1092,Dotty Strutley,India,Eclairs,2086,384,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


## Statistics

In [42]:
# Mean of the Amount column over all rows.
df_encoded.loc[:, 'Amount'].mean()

5652.308043875685

In [46]:
# Distinct countries, in order of first appearance.
df_encoded.loc[:, 'Country'].unique()

array(['UK', 'India', 'Australia', 'New Zealand', 'USA', 'Canada'],
      dtype=object)

In [49]:
# Count how many records each product has within every country. The result is
# a Series indexed by Country first, then Product.
prod_by_countries = (
    df_encoded
    .groupby('Country')['Product']
    .value_counts()
)

In [50]:
# Product counts for the UK, most frequent first.
prod_by_countries.loc['UK']

Product
50% Dark Bites          12
Drinking Coco           12
White Choc              12
99% Dark & Pure         11
Smooth Sliky Salty      11
Baker's Choco Chips      9
Caramel Stuffed Bars     9
Choco Coated Almonds     9
Milk Bars                9
Peanut Butter Cubes      9
Almond Choco             8
Eclairs                  8
Manuka Honey Choco       8
70% Dark Bites           6
85% Dark Bars            6
Mint Chip Choco          6
Orange Choco             6
Organic Choco Syrup      6
Raspberry Choco          6
After Nines              5
Fruit & Nut Bars         5
Spicy Special Slims      5
Name: count, dtype: int64

In [None]:
# Product counts for India, most frequent first.
prod_by_countries.loc['India']

Product
Eclairs                 15
Spicy Special Slims     14
Smooth Sliky Salty      11
After Nines             10
Mint Chip Choco         10
85% Dark Bars            9
Almond Choco             9
Caramel Stuffed Bars     9
Peanut Butter Cubes      9
50% Dark Bites           8
99% Dark & Pure          8
Drinking Coco            8
Organic Choco Syrup      8
70% Dark Bites           7
Choco Coated Almonds     7
Orange Choco             7
Raspberry Choco          7
White Choc               7
Fruit & Nut Bars         6
Milk Bars                6
Baker's Choco Chips      5
Manuka Honey Choco       4
Name: count, dtype: int64

In [52]:
# Product counts for Australia, most frequent first.
prod_by_countries.loc['Australia']

Product
50% Dark Bites          16
Drinking Coco           12
Organic Choco Syrup     12
99% Dark & Pure         11
Almond Choco            11
Milk Bars               11
Raspberry Choco         11
White Choc              11
Eclairs                 10
70% Dark Bites           9
Baker's Choco Chips      9
Fruit & Nut Bars         9
85% Dark Bars            8
Manuka Honey Choco       8
Orange Choco             8
Smooth Sliky Salty       8
Spicy Special Slims      8
After Nines              7
Caramel Stuffed Bars     7
Mint Chip Choco          7
Choco Coated Almonds     6
Peanut Butter Cubes      6
Name: count, dtype: int64

In [53]:
# Product counts for New Zealand, most frequent first.
prod_by_countries.loc['New Zealand']

Product
Mint Chip Choco         14
85% Dark Bars           12
After Nines             12
Organic Choco Syrup     10
Eclairs                  9
Smooth Sliky Salty       9
White Choc               9
Baker's Choco Chips      8
Fruit & Nut Bars         8
Manuka Honey Choco       8
Peanut Butter Cubes      8
Spicy Special Slims      8
50% Dark Bites           7
Orange Choco             7
Raspberry Choco          7
70% Dark Bites           6
Caramel Stuffed Bars     6
Drinking Coco            6
Milk Bars                6
99% Dark & Pure          5
Almond Choco             5
Choco Coated Almonds     3
Name: count, dtype: int64

In [54]:
# Product counts for the USA, most frequent first.
prod_by_countries.loc['USA']

Product
Fruit & Nut Bars        12
50% Dark Bites          11
Orange Choco            11
Raspberry Choco         11
Milk Bars               10
White Choc              10
85% Dark Bars            9
After Nines              9
Drinking Coco            9
Eclairs                  9
Manuka Honey Choco       9
99% Dark & Pure          8
Smooth Sliky Salty       8
Spicy Special Slims      8
Almond Choco             7
Peanut Butter Cubes      7
70% Dark Bites           6
Caramel Stuffed Bars     6
Organic Choco Syrup      6
Baker's Choco Chips      5
Choco Coated Almonds     4
Mint Chip Choco          4
Name: count, dtype: int64

In [55]:
# Product counts for Canada, most frequent first.
prod_by_countries.loc['Canada']

Product
Smooth Sliky Salty      12
Spicy Special Slims     11
Choco Coated Almonds    10
Fruit & Nut Bars        10
Organic Choco Syrup     10
Peanut Butter Cubes     10
Drinking Coco            9
Eclairs                  9
White Choc               9
70% Dark Bites           8
Almond Choco             8
Manuka Honey Choco       8
Orange Choco             8
After Nines              7
Milk Bars                7
50% Dark Bites           6
85% Dark Bars            6
99% Dark & Pure          6
Caramel Stuffed Bars     6
Raspberry Choco          6
Baker's Choco Chips      5
Mint Chip Choco          4
Name: count, dtype: int64