# Importing Data

In [2]:
import pandas as pd
import numpy as np
# Location of the raw survey workbook, relative to this notebook.
path = '../data/cookie_business.xlsx'

# Load the sheet as-is; the first spreadsheet column (Customer ID) becomes
# the row index.
dirty_df = pd.read_excel(io=path, index_col=0)

In [5]:
# Preview the first 15 rows of the raw frame to sanity-check the import.
dirty_df.head(15)

Unnamed: 0_level_0,Age,Age Group,Postcode,Gender,Favourite Cookie,Cookies bought each week
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1001,60,60-69,2000,M,Choc chip,1
1002,53,50-59,2010,M,Choc chip,1
1003,22,20-29,2010,F,Choc chip,2
1004,30,30-39,2010,F,Choc chip,6
1005,52,50-59,2010,F,Macadamia,3
1006,22,20-29,2022,F,Macadamia,3
1007,26,20-29,2010,F,Macadamia,8
1008,40,40-49,2022,F,Triple choc,2
1009,42,40-49,2022,F,Granola,1
1010,22,20-29,2000,M,Granola,3


In [6]:
# Show the index range, column dtypes, and non-null counts to spot missing
# values and columns that still need type conversion.
dirty_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 1001 to 1046
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       46 non-null     int64 
 1   Age Group                 46 non-null     object
 2   Postcode                  46 non-null     int64 
 3   Gender                    46 non-null     object
 4   Favourite Cookie          46 non-null     object
 5   Cookies bought each week  46 non-null     int64 
dtypes: int64(3), object(3)
memory usage: 2.5+ KB


In [7]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
# NOTE(review): Postcode is summarised here as well; it looks like an
# identifier rather than a quantity, so its mean/std are probably not
# meaningful -- confirm before interpreting them.
dirty_df.describe()

Unnamed: 0,Age,Postcode,Cookies bought each week
count,46.0,46.0,46.0
mean,34.173913,2136.217391,3.978261
std,16.189577,204.315433,3.666601
min,12.0,2000.0,1.0
25%,20.25,2000.0,1.25
50%,31.5,2014.5,3.0
75%,44.75,2296.25,5.75
max,68.0,2873.0,20.0


# Why we chose this dataset

We chose this dataset because it is a simplified example of market research for a start-up cookie business. Once the data is cleaned and organized, it shows which areas, age groups, and flavors we should focus on.

# Cleaning Data

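Note: Gender is encoded numerically as Male = 0 and Female = 1, so the mean of `Gender` within any group is the proportion of female customers in that group.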

In [8]:
# Recode Gender to integers: 'M' -> 0, then 'F' -> 1.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on dirty_df["Gender"]. An in-place call on a
# selected column is chained mutation: it is not guaranteed to reach
# dirty_df, and under pandas Copy-on-Write it never does.
#
# regex=True is kept from the original, so 'M' and 'F' are treated as
# regular-expression patterns rather than exact values.
# NOTE(review): confirm that pattern matching (rather than exact equality)
# is intended for the raw Gender values.
gender_codes = dirty_df["Gender"].replace('M', 0, regex=True)
gender_codes = gender_codes.replace('F', 1, regex=True)
dirty_df["Gender"] = gender_codes

In [9]:
# Give the recoded Gender column a numeric dtype so that it is included in
# numeric aggregations. pd.to_numeric raises if any value cannot be parsed,
# which doubles as a check that the recode left no stray text behind.
dirty_df["Gender"] = pd.to_numeric(dirty_df["Gender"])

# Take an independent copy for the analysis. The original bound clean_df to
# the Gender Series first (via a chained assignment) and then rebound it to
# dirty_df itself, leaving both names pointing at the same object.
clean_df = dirty_df.copy()

# Describing Data

Grouping by favourite cookie, taking the mean of each numeric column, and then sorting by average cookies bought each week (highest first)

In [10]:
# Average every numeric column per favourite cookie, highest weekly purchases
# first. numeric_only=True excludes the remaining text columns explicitly;
# without it, newer pandas versions raise a TypeError instead of silently
# dropping them.
(
    clean_df
    .groupby("Favourite Cookie")
    .mean(numeric_only=True)
    .sort_values("Cookies bought each week", ascending=False)
)

Unnamed: 0_level_0,Age,Postcode,Gender,Cookies bought each week
Favourite Cookie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Choc chip,34.75,2139.166667,0.666667,4.833333
Granola,25.333333,2129.666667,0.5,4.5
Macadamia,34.9375,2119.375,0.5625,4.4375
Mint,38.5,2111.0,0.75,2.75
Triple choc,39.833333,2116.833333,0.666667,2.333333
Salted caramel,25.5,2381.5,0.0,1.0


Grouping by age group, taking the mean of each numeric column, and then sorting by average age (youngest group first)

In [11]:
# Average every numeric column per age group, youngest group first.
# numeric_only=True excludes the remaining text columns explicitly; without
# it, newer pandas versions raise a TypeError instead of silently dropping
# them.
(
    clean_df
    .groupby("Age Group")
    .mean(numeric_only=True)
    .sort_values("Age", ascending=True)
)

Unnamed: 0_level_0,Age,Postcode,Gender,Cookies bought each week
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10-19,16.636364,2282.181818,0.636364,5.818182
20-29,23.8,2057.6,0.6,3.7
30-39,33.111111,2217.888889,0.555556,3.333333
40-49,43.166667,2095.833333,1.0,3.166667
50-59,55.0,2010.0,0.2,3.6
60-69,63.8,2000.0,0.4,3.0


Grouping by postcode, taking the mean of each numeric column, and then sorting by average cookies bought each week (highest first)

In [12]:
# Average every numeric column per postcode, highest weekly purchases first.
# numeric_only=True excludes the remaining text columns explicitly; without
# it, newer pandas versions raise a TypeError instead of silently dropping
# them.
(
    clean_df
    .groupby("Postcode")
    .mean(numeric_only=True)
    .sort_values("Cookies bought each week", ascending=False)
)

Unnamed: 0_level_0,Age,Gender,Cookies bought each week
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2344,18.0,1.0,20.0
2331,17.0,1.0,10.5
2210,31.0,0.5,10.0
2873,33.0,0.0,6.0
2030,57.0,0.0,6.0
2010,36.6,0.8,4.0
2333,17.0,0.0,4.0
2321,25.333333,0.666667,3.333333
2000,44.0,0.470588,3.235294
2019,29.0,1.0,3.0
