# Chocolate Dataset Exploration

## Import & Setup

In [5]:
import pandas as pd
import torch
import numpy as np

# Read the raw chocolate sales table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/chocolate_sales.csv')
df.head(n=5)

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04-Jan-22,"$5,320",180
1,Van Tuxwell,India,85% Dark Bars,01-Aug-22,"$7,896",94
2,Gigi Bohling,India,Peanut Butter Cubes,07-Jul-22,"$4,501",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27-Apr-22,"$12,726",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24-Feb-22,"$13,685",184


## Data Preprocessing

In [8]:
# Per-column flag: True if the column contains at least one missing value.
df.isnull().any(axis=0)

Sales Person     False
Country          False
Product          False
Date             False
Amount           False
Boxes Shipped    False
dtype: bool

There are no NaN values (the dataset is already clean).

In [31]:
# Work on an independent copy so the raw frame `df` stays untouched.
df_encoded = df.copy(deep=True)

In [32]:
# Parse day-month-year strings such as "04-Jan-22" into pandas datetimes.
df_encoded = df_encoded.assign(
    Date=pd.to_datetime(df_encoded['Date'], format='%d-%b-%y')
)

In [33]:
# Split the parsed date into separate calendar features.
# Day / Month / Year are numeric; Weekday holds the day name as text.
df_encoded = df_encoded.assign(
    Day=lambda frame: frame['Date'].dt.day,
    Month=lambda frame: frame['Date'].dt.month,
    Year=lambda frame: frame['Date'].dt.year,
    Weekday=lambda frame: frame['Date'].dt.day_name(),
)

In [34]:
# Display the frame to inspect the newly added Day / Month / Year / Weekday columns.
df_encoded

Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped,Day,Month,Year,Weekday
0,Jehu Rudeforth,UK,Mint Chip Choco,2022-01-04,"$5,320",180,4,1,2022,Tuesday
1,Van Tuxwell,India,85% Dark Bars,2022-08-01,"$7,896",94,1,8,2022,Monday
2,Gigi Bohling,India,Peanut Butter Cubes,2022-07-07,"$4,501",91,7,7,2022,Thursday
3,Jan Morforth,Australia,Peanut Butter Cubes,2022-04-27,"$12,726",342,27,4,2022,Wednesday
4,Jehu Rudeforth,UK,Peanut Butter Cubes,2022-02-24,"$13,685",184,24,2,2022,Thursday
...,...,...,...,...,...,...,...,...,...,...
1089,Karlen McCaffrey,Australia,Spicy Special Slims,2022-05-17,"$4,410",323,17,5,2022,Tuesday
1090,Jehu Rudeforth,USA,White Choc,2022-06-07,"$6,559",119,7,6,2022,Tuesday
1091,Ches Bonnell,Canada,Organic Choco Syrup,2022-07-26,$574,217,26,7,2022,Tuesday
1092,Dotty Strutley,India,Eclairs,2022-07-28,"$2,086",384,28,7,2022,Thursday


In [35]:
# Check whether every row falls in 2022.
df_encoded['Year'].eq(2022).all()

True

In [36]:
# Year is constant (every row is 2022, as checked above), so it carries no information,
# and Date has already been decomposed into Day / Month / Weekday.
# Re-assign instead of using `inplace=True`: it keeps the data lineage explicit and
# avoids silently mutating an object that other names might still reference.
df_encoded = df_encoded.drop(columns=['Year', 'Date'])

In [37]:
# One-hot encode the calendar columns into 0/1 integer indicator columns.
# With no explicit prefix, each indicator is named "<column>_<value>" (e.g. Day_1, Weekday_Monday).
df_encoded = pd.get_dummies(
    data=df_encoded,
    columns=['Day', 'Month', 'Weekday'],
    prefix_sep='_',
    dtype=int,
)

In [38]:
# Preview the first five rows of the encoded frame.
df_encoded.iloc[:5]

Unnamed: 0,Sales Person,Country,Product,Amount,Boxes Shipped,Day_1,Day_2,Day_3,Day_4,Day_5,...,Month_4,Month_5,Month_6,Month_7,Month_8,Weekday_Friday,Weekday_Monday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,Jehu Rudeforth,UK,Mint Chip Choco,"$5,320",180,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,Van Tuxwell,India,85% Dark Bars,"$7,896",94,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,Gigi Bohling,India,Peanut Butter Cubes,"$4,501",91,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,Jan Morforth,Australia,Peanut Butter Cubes,"$12,726",342,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,Jehu Rudeforth,UK,Peanut Butter Cubes,"$13,685",184,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Statistics