# Importing Data

In [1]:
import pandas as pd
import numpy as np
# Read the cookie-business workbook into the raw (not yet cleaned) DataFrame.
# The openpyxl engine is requested explicitly for the .xlsx file.
path = 'cookie_business.xlsx'
dirty_df = pd.read_excel(path, engine='openpyxl')

In [2]:
# Preview the first 15 rows of the raw data.
dirty_df.head(n=15)

Unnamed: 0,Customer ID,Age,Age Group,Postcode,Gender,Favourite Cookie,Cookies bought each week
0,1001,60,60-69,2000,M,Choc chip,1
1,1002,53,50-59,2010,M,Choc chip,1
2,1003,22,20-29,2010,F,Choc chip,2
3,1004,30,30-39,2010,F,Choc chip,6
4,1005,52,50-59,2010,F,Macadamia,3
5,1006,22,20-29,2022,F,Macadamia,3
6,1007,26,20-29,2010,F,Macadamia,8
7,1008,40,40-49,2022,F,Triple choc,2
8,1009,42,40-49,2022,F,Granola,1
9,1010,22,20-29,2000,M,Granola,3


In [3]:
# Column overview: non-null counts and dtypes, to spot missing values and
# columns that still need converting.
dirty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 7 columns):
Customer ID                 46 non-null int64
Age                         46 non-null int64
Age Group                   46 non-null object
Postcode                    46 non-null int64
Gender                      46 non-null object
Favourite Cookie            46 non-null object
Cookies bought each week    46 non-null int64
dtypes: int64(4), object(3)
memory usage: 2.6+ KB


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
dirty_df.describe()

Unnamed: 0,Customer ID,Age,Postcode,Cookies bought each week
count,46.0,46.0,46.0,46.0
mean,1023.5,34.173913,2136.217391,3.978261
std,13.422618,16.189577,204.315433,3.666601
min,1001.0,12.0,2000.0,1.0
25%,1012.25,20.25,2000.0,1.25
50%,1023.5,31.5,2014.5,3.0
75%,1034.75,44.75,2296.25,5.75
max,1046.0,68.0,2873.0,20.0


# Why we chose this dataset

We chose this dataset because it is a simplified example of market research for a start-up cookie business. Once organized, the data shows which areas, age groups, and flavors we should focus on.

# Cleaning Data

Note: Male = 0 and Female = 1

In [5]:
# Encode gender numerically: Male ('M') -> 0, Female ('F') -> 1.
#
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `dirty_df["Gender"]`. The in-place form
# modifies an intermediate Series (chained assignment), which triggers a
# FutureWarning in pandas 2.x and does not update `dirty_df` at all once
# copy-on-write is enabled.
#
# `regex=True` is kept from the original: a cell whose text matches the
# pattern is replaced as a whole by the numeric code.
dirty_df["Gender"] = (
    dirty_df["Gender"]
    .replace('M', 0, regex=True)
    .replace('F', 1, regex=True)
)

In [8]:
# Make sure the encoded Gender column has a numeric dtype.
dirty_df["Gender"] = pd.to_numeric(dirty_df["Gender"])

# Take a real copy so `clean_df` is an independent frame: a plain
# `clean_df = dirty_df` would only create a second name for the same object,
# and later edits to one would silently change the other.
clean_df = dirty_df.copy()
clean_df

Unnamed: 0,Customer ID,Age,Age Group,Postcode,Gender,Favourite Cookie,Cookies bought each week
0,1001,60,60-69,2000,0,Choc chip,1
1,1002,53,50-59,2010,0,Choc chip,1
2,1003,22,20-29,2010,1,Choc chip,2
3,1004,30,30-39,2010,1,Choc chip,6
4,1005,52,50-59,2010,1,Macadamia,3
5,1006,22,20-29,2022,1,Macadamia,3
6,1007,26,20-29,2010,1,Macadamia,8
7,1008,40,40-49,2022,1,Triple choc,2
8,1009,42,40-49,2022,1,Granola,1
9,1010,22,20-29,2000,0,Granola,3


# Describing Data

Group by favourite cookie, take the mean of each numeric column, and sort by the average number of cookies bought each week (highest first).

In [17]:
# Per-cookie averages of the numeric columns, ranked by weekly purchases
# (highest first).
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError when it tries to average text columns
# such as "Age Group".
modified_df = (
    clean_df
    .groupby("Favourite Cookie")
    .mean(numeric_only=True)
    .sort_values("Cookies bought each week", ascending=False)
)
modified_df[['Cookies bought each week']]

Unnamed: 0_level_0,Cookies bought each week
Favourite Cookie,Unnamed: 1_level_1
Choc chip,4.833333
Granola,4.5
Macadamia,4.4375
Mint,2.75
Triple choc,2.333333
Salted caramel,1.0


Group by age group, take the mean of each numeric column, and sort the groups from youngest to oldest (by mean age).

In [19]:
# Per-age-group averages of the numeric columns, ordered from the youngest
# group to the oldest by sorting on each group's mean "Age".
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError when it tries to average text columns
# such as "Favourite Cookie".
modified_df = (
    clean_df
    .groupby("Age Group")
    .mean(numeric_only=True)
    .sort_values("Age", ascending=True)
)
modified_df[['Cookies bought each week']]

Unnamed: 0_level_0,Cookies bought each week
Age Group,Unnamed: 1_level_1
10-19,5.818182
20-29,3.7
30-39,3.333333
40-49,3.166667
50-59,3.6
60-69,3.0


Group by postcode, take the mean of each numeric column, and sort by the average number of cookies bought each week (highest first).

In [None]:
# Per-postcode averages of the numeric columns, ranked by weekly purchases
# (highest first).
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError when it tries to average text columns
# such as "Age Group" and "Favourite Cookie".
(
    clean_df
    .groupby("Postcode")
    .mean(numeric_only=True)
    .sort_values("Cookies bought each week", ascending=False)
)