In [69]:
#https://preppindata.blogspot.com/2021/01/2021-week-2.html

import pandas as pd
import numpy as np
from datetime import datetime as dt

### Input the data

In [59]:
# Read the week 2 input CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer=r'data/PD 2021 Wk 2 Input - Bike Model Sales.csv')
df

Unnamed: 0,Bike Type,Store,Order Date,Quantity,Value per Bike,Shipping Date,Model
0,Mountain,Manchester,15/05/2020,4,1543,01/06/2020,GIA31292/003
1,Gravel,Manchester,16/06/2020,2,2076,24/06/2020,GIA21312/001
2,Road,Birmingham,04/05/2020,1,2616,13/05/2020,GIA94221/129
3,Gravel,York,05/09/2020,2,1359,19/09/2020,GIA12442/120
4,Gravel,Birmingham,28/03/2020,4,1599,04/04/2020,GIA12492/123
...,...,...,...,...,...,...,...
1995,Road,Manchester,02/06/2020,2,3504,20/06/2020,102SPEC84233
1996,Mountain,York,07/09/2020,2,1109,21/09/2020,012SPEC93591
1997,Road,London,10/12/2020,3,1032,11/12/2020,943SPEC24922
1998,Gravel,London,10/02/2020,2,2303,11/02/2020,429SPEC21322


### Clean up the Model field to leave only the letters to represent the Brand of the bike

In [89]:
# Keep only the first run of ASCII letters in each model code
# (e.g. 'GIA31292/003' -> 'GIA', '102SPEC84233' -> 'SPEC').
# expand=False makes extract() return a Series rather than a one-column DataFrame.
df['Model'] = df['Model'].str.extract(r'([a-zA-Z]+)', expand=False)
df

Unnamed: 0,Bike Type,Store,Order Date,Quantity,Value per Bike,Shipping Date,Model,Order Value,Days to Ship
0,Mountain,Manchester,2020-05-15,4,1543,2020-06-01,GIA,6172,17
1,Gravel,Manchester,2020-06-16,2,2076,2020-06-24,GIA,4152,8
2,Road,Birmingham,2020-05-04,1,2616,2020-05-13,GIA,2616,9
3,Gravel,York,2020-09-05,2,1359,2020-09-19,GIA,2718,14
4,Gravel,Birmingham,2020-03-28,4,1599,2020-04-04,GIA,6396,7
...,...,...,...,...,...,...,...,...,...
1995,Road,Manchester,2020-06-02,2,3504,2020-06-20,SPEC,7008,18
1996,Mountain,York,2020-09-07,2,1109,2020-09-21,SPEC,2218,14
1997,Road,London,2020-12-10,3,1032,2020-12-11,SPEC,3096,1
1998,Gravel,London,2020-02-10,2,2303,2020-02-11,SPEC,4606,1


### Work out the Order Value using Value per Bike and Quantity.

In [61]:
# Order Value = price of a single bike x number of bikes on the order (element-wise per row).
df['Order Value'] = df['Value per Bike'].mul(df['Quantity'])

### Aggregate Value per Bike, Order Value and Quantity by Brand and Bike Type to form:
- Quantity Sold
- Order Value
- Average Value Sold per Brand, Type

In [62]:
# Summarise by brand ('Model') and bike type: total bikes sold and total order value.
# avg_Value_Sold is total order value / total bikes sold, i.e. a quantity-weighted
# average price per bike rather than a plain mean of 'Value per Bike'.
df_grouped = (
    df.groupby(['Model', 'Bike Type'])
      .agg(
          Quantity_Sold=('Quantity', 'sum'),
          Order_Value=('Order Value', 'sum'),
      )
      .reset_index()
      .assign(avg_Value_Sold=lambda totals: totals['Order_Value'] / totals['Quantity_Sold'])
)
df_grouped

Unnamed: 0,Model,Bike Type,Quantity_Sold,Order_Value,avg_Value_Sold
0,BROM,Gravel,186,433885,2332.715054
1,BROM,Mountain,277,674770,2435.99278
2,BROM,Road,257,656539,2554.626459
3,GIA,Gravel,323,733087,2269.619195
4,GIA,Mountain,425,1021329,2403.127059
5,GIA,Road,407,896695,2203.181818
6,KONA,Gravel,324,791841,2443.953704
7,KONA,Mountain,330,820537,2486.475758
8,KONA,Road,273,647684,2372.468864
9,ORRO,Gravel,151,411644,2726.119205


### Calculate Days to ship by measuring the difference between when an order was placed and when it was shipped as 'Days to Ship'

In [92]:
# Parse both date columns from day-first strings (dd/mm/YYYY) into datetimes.
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d/%m/%Y')
df['Shipping Date'] = pd.to_datetime(df['Shipping Date'], format='%d/%m/%Y')

# Subtracting two datetime columns yields timedeltas; `.dt.days` extracts the
# whole-day component of each timedelta as an integer.
df['Days to Ship'] = df['Shipping Date'].sub(df['Order Date']).dt.days

df


Unnamed: 0,Bike Type,Store,Order Date,Quantity,Value per Bike,Shipping Date,Model,Order Value,Days to Ship
0,Mountain,Manchester,2020-05-15,4,1543,2020-06-01,GIA,6172,17
1,Gravel,Manchester,2020-06-16,2,2076,2020-06-24,GIA,4152,8
2,Road,Birmingham,2020-05-04,1,2616,2020-05-13,GIA,2616,9
3,Gravel,York,2020-09-05,2,1359,2020-09-19,GIA,2718,14
4,Gravel,Birmingham,2020-03-28,4,1599,2020-04-04,GIA,6396,7
...,...,...,...,...,...,...,...,...,...
1995,Road,Manchester,2020-06-02,2,3504,2020-06-20,SPEC,7008,18
1996,Mountain,York,2020-09-07,2,1109,2020-09-21,SPEC,2218,14
1997,Road,London,2020-12-10,3,1032,2020-12-11,SPEC,3096,1
1998,Gravel,London,2020-02-10,2,2303,2020-02-11,SPEC,4606,1


### Aggregate Order Value, Quantity and Days to Ship by Brand and Store to form:
- Total Quantity Sold
- Total Order Value
- Average Days to Ship

In [95]:
# Summarise by brand and store: total bikes sold, total order value and the
# mean number of days between order and shipping.
# 'Model' holds the brand letters after the clean-up step. The grouping keys are
# brand + store (not brand + bike type, which is what the first summary uses).
df_grouped2 = (
    df.groupby(['Model', 'Store'])
      .agg(
          Total_Quantity_Sold=('Quantity', 'sum'),
          Total_Order_Value=('Order Value', 'sum'),
          Avg_Days_to_Ship=('Days to Ship', 'mean'),
      )
      .reset_index()
)



### Round any averaged values to one decimal place to make the values easier to read

In [98]:
# Round only the averaged columns to one decimal place; columns not named in the
# dict are left as they are.
df_grouped = df_grouped.round({'avg_Value_Sold': 1})
df_grouped2 = df_grouped2.round({'Avg_Days_to_Ship': 1})

### Output both data sets

In [99]:
# Write both summaries to CSV.
# index=False: after reset_index() the row index is only a 0..n-1 counter, and
# writing it would add a meaningless unnamed first column to each output file.
df_grouped.to_csv(r'output/2021-week2-output1.csv', index=False)
df_grouped2.to_csv(r'output/2021-week2-output2.csv', index=False)