### Prepping Data Challenge:  C&BSCo Summary Stats (week 35)

### Requirements
 - Input data
 - Merge km's and min's as Minutes
 - Split the unnamed column into
    - Coach
    - Calories
    - Music Type
 -  Convert the Dates to Years
 - Create a parameter to let the user select any speed as the average riding speed (KPH)
    - Your values may differ depending on my average speed
 - Create the following aggregations
    - Total Minutes
    - Total Minutes per Coach (find the most minutes per Coach)
    - Calories per Minute per Coach (find the max calories per minute per Coach)
    - Avg. Calories per Ride
    - Total Rides
    - Total Distance ((Mins/60)*Speed Parameter)
    - Avg. Calories per Minute 
- Combine all the answers and restructure your data if necessary
- Output the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Input the data
# Merge km's and min's as Minutes
df =  ( pd.read_csv(r"\Dataprep\2022\WK35 CEO Cycling.csv")
              .rename(columns={'Value' : 'Minutes'})
              .rename(columns=lambda c: 'Unnamed' if 'Unnamed' in c else c)
              .drop(columns=['Units', 'Type']))

df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)

In [3]:
df.head(10)

Unnamed: 0,Date,Minutes,Unnamed
0,2020-12-16,10,Bakari - 125 - everything rock
1,2020-12-16,10,Kym - 134 - everything rock
2,2020-12-17,30,Gregg - 375 - everything rock
3,2020-12-18,20,Kym - 232 - everything rock
4,2020-12-19,45,Bakari - 565 - latest hits
5,2020-12-21,20,Kym - 271 - hiphop
6,2020-12-23,20,Emily - 279 - latest hits
7,2020-12-24,45,Sherica - 588 - latest hits
8,2020-12-28,30,Emily - 401 - everything rock
9,2020-12-29,30,Kym - 445 - upbeat anthems


In [4]:
# Split the unnamed column into (Coach , Calories, Music Type)
df[['Coach', 'Calories', 'Music Type']] = df['Unnamed'].str.split('-', expand=True)

In [5]:
#Convert the Dates to Years
df['Year'] = df['Date'].dt.year

In [6]:
#Create a parameter to let the user select any speed as the average riding speed (KPH)
#    - Your values may differ depending on my average speed (I used 30 kph)
average_speed_kph = float(input("Enter the average riding speed (KPH): "))

In [7]:
#- Create the following aggregations (Total Minutes, Total Minutes per Coach (find the most minutes per Coach), Calories per Minute per Coach (find the max calories per minute per Coach),
#                                     Avg. Calories per Ride, Total Rides, Total Distance ((Mins/60)*Speed Parameter), Avg. Calories per Minute )

df['Distance'] = average_speed_kph * df['Minutes'] / 60
df['Calories'] = df['Calories'].astype(int)
    

df2 = ( df.groupby(['Year']).agg(Total_Mins=('Minutes', 'sum'),Total_Calories=('Calories', 'sum'),Total_Rides=('Minutes', 'count'), Total_Distance=('Distance', 'sum')))


df2['Avg. Calories per Ride'] = (df2['Total_Calories'] / df2['Total_Rides']).round(1)
df2['Avg. Calories per Minute'] = (df2['Total_Calories'] / df2['Total_Mins']).round(1)


# aggregate by year and coach
dfcoach = ( df.groupby(['Year', 'Coach'], as_index=False).agg(Mins=('Minutes', 'sum'),Calories=('Calories', 'sum')))
    
dfcoach['Calories per Min'] = dfcoach['Calories'] / dfcoach['Mins']
dfcoach['Calories per Minute per Coach'] = ( dfcoach['Coach'] + ' (' + (dfcoach['Calories per Min']).round(1).astype(str) + ')' )
dfcoach['Total Mins per Coach'] = dfcoach['Coach'] + ' (' + dfcoach['Mins'].astype(str) + ')'

In [8]:
df2.head()

Unnamed: 0_level_0,Total_Mins,Total_Calories,Total_Rides,Total_Distance,Avg. Calories per Ride,Avg. Calories per Minute
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020.0,305,4078,11,152.5,370.7,13.4
2021.0,2145,31323,75,1072.5,417.6,14.6


In [9]:
dfcoach.head()

Unnamed: 0,Year,Coach,Mins,Calories,Calories per Min,Calories per Minute per Coach,Total Mins per Coach
0,2020.0,Bakari,55,690,12.545455,Bakari (12.5),Bakari (55)
1,2020.0,Emily,50,680,13.6,Emily (13.6),Emily (50)
2,2020.0,Gregg,30,375,12.5,Gregg (12.5),Gregg (30)
3,2020.0,Kym,80,1082,13.525,Kym (13.5),Kym (80)
4,2020.0,Sherica,90,1251,13.9,Sherica (13.9),Sherica (90)


In [10]:
#Combine all the answers and restructure your data if necessary
    # add the coach aggregations to the total df
dfcombine = ( pd.concat([df2.drop(columns=['Total_Calories']), dfcoach.iloc[dfcoach.groupby('Year')['Mins'].idxmax()]
                                                .set_index('Year')['Total Mins per Coach'],
                          dfcoach.iloc[dfcoach.groupby('Year')['Calories per Min'].idxmax()]
                                                .set_index('Year')['Calories per Minute per Coach']],
                          axis=1).rename(columns=lambda c: c.replace('_', ' ')))    
    
 
# reshape the df so measures are in rows and years are in cols, then output the data  
(dfcombine.melt(ignore_index=False, var_name='Measure')
            .pivot_table(index='Measure', values='value', columns='Year', aggfunc='first')
            .reset_index()
            .rename(columns=lambda c: str(c).replace('-', '')))

Year,Measure,2020.0,2021.0
0,Avg. Calories per Minute,13.4,14.6
1,Avg. Calories per Ride,370.7,417.6
2,Calories per Minute per Coach,Sherica (13.9),Bakari (15.2)
3,Total Distance,152.5,1072.5
4,Total Mins,305,2145
5,Total Mins per Coach,Sherica (90),Kym (470)
6,Total Rides,11,75


In [11]:
#output the data
dfcombine.to_csv('wk35-output.csv', index=False)