In [24]:
#https://preppindata.blogspot.com/2021/02/2021-week-6-comparing-prize-money-for.html

import pandas as pd

### Input the data

In [25]:
# Read the 'OfficialMoney' worksheet of the challenge workbook into the
# notebook-wide player table `df`.
df = pd.read_excel(
    r'data/PD 2021 Wk 6 Input.xlsx',
    sheet_name='OfficialMoney',
)

# Show the loaded table.
df

Unnamed: 0,PLAYER NAME,MONEY,EVENTS,TOUR
0,Brooks Koepka,9684006,21,PGA
1,Rory McIlroy,7785286,19,PGA
2,Matt Kuchar,6294690,22,PGA
3,Patrick Cantlay,6121488,21,PGA
4,Gary Woodland,5690965,24,PGA
...,...,...,...,...
195,Lindy Duncan,130654,25,LPGA
196,Daniela Darquea,129608,21,LPGA
197,Maria Fassi,129164,11,LPGA
198,Stephanie Meadow,127796,21,LPGA


### Answer these questions:


In [26]:
# Q1: What's the Total Prize Money earned by players for each tour?
# Sum the MONEY column within each TOUR; TOUR is restored as an ordinary column.
df_Q1 = (
    df.groupby(['TOUR'])
      .agg(total_prize_money=pd.NamedAgg(column='MONEY', aggfunc='sum'))
      .reset_index()
)
df_Q1

Unnamed: 0,TOUR,total_prize_money
0,LPGA,58410411
1,PGA,256726356


In [27]:
# Q2. How many players are in this dataset for each tour?
# Count the non-null PLAYER NAME entries within each TOUR.
df_Q2 = (
    df.groupby(['TOUR'])
      .agg(number_of_players=pd.NamedAgg(column='PLAYER NAME', aggfunc='count'))
      .reset_index()
)
df_Q2

Unnamed: 0,TOUR,number_of_players
0,LPGA,100
1,PGA,100


In [28]:
# Q3. How many events in total did players participate in for each tour?
# Sum the per-player EVENTS values within each TOUR.
df_Q3 = (
    df.groupby(['TOUR'])
      .agg(number_of_events=pd.NamedAgg(column='EVENTS', aggfunc='sum'))
      .reset_index()
)
df_Q3


Unnamed: 0,TOUR,number_of_events
0,LPGA,2266
1,PGA,2282


In [29]:
# Q4. How much do players win per event? What's the average of this for each tour?
# Step 1: per-player winnings per event (adds a column to `df`).
df['WIN PER EVENT'] = df['MONEY'].div(df['EVENTS'])

# Step 2: mean of those per-player ratios within each tour. This is the mean
# of the ratios, not total money divided by total events.
df_Q4 = (
    df.groupby(['TOUR'])
      .agg(avg_win_per_event=pd.NamedAgg(column='WIN PER EVENT', aggfunc='mean'))
      .reset_index()
)
df_Q4

Unnamed: 0,TOUR,avg_win_per_event
0,LPGA,25525.30112
1,PGA,120281.569273


In [30]:
# Q5. How do players rank by prize money for each tour? What about overall?
# Rank 1 is the highest MONEY value (descending rank); both ranks are stored
# as new columns on `df`.
money_within_tour = df.groupby('TOUR')['MONEY']
df['rank_by_tour'] = money_within_tour.rank(ascending=False)
df['rank_overalll'] = df['MONEY'].rank(ascending=False)

# What is the average difference between where they are ranked within their
# tour compared to the overall rankings where both tours are combined?
# difference = overall rank minus within-tour rank, then averaged per tour.
df['difference'] = df['rank_overalll'].sub(df['rank_by_tour'])
df_Q5 = (
    df.groupby(['TOUR'])
      .agg(avg_difference=pd.NamedAgg(column='difference', aggfunc='mean'))
      .reset_index()
)
df_Q5

Unnamed: 0,TOUR,avg_difference
0,LPGA,96.13
1,PGA,3.87


### Combine the answers to these questions into one dataset

In [40]:
# Combine the five per-tour answer tables into one wide table with one row
# per TOUR and one column per measure.
df_Qcombined = df_Q1
Qs = [df_Q2, df_Q3, df_Q4, df_Q5]

# The loop variable must not be called `df`: that name holds the raw player
# table, and rebinding it here would leave `df` pointing at df_Q5, so any
# re-run of the earlier question cells would operate on the wrong frame.
for answer in Qs:
    df_Qcombined = df_Qcombined.merge(answer, on='TOUR', how='inner')

df_Qcombined

Unnamed: 0,TOUR,total_prize_money,number_of_players,number_of_events,avg_win_per_event,avg_difference
0,LPGA,58410411,100,2266,25525.30112,96.13
1,PGA,256726356,100,2282,120281.569273,3.87


### Pivot the data so that we have a column for each tour, with each row representing an answer to the above questions

In [55]:
# Display floats without decimals. This is a global pandas display option, so
# it also affects every table rendered after this cell.
pd.set_option('display.float_format', '{:.0f}'.format)

# Long format: one row per (TOUR, measure) pair.
df_melt = pd.melt(
    df_Qcombined,
    id_vars=['TOUR'],
    var_name='Measures',
    value_name='Values',
)

# Wide format: one row per measure, one column per tour.
measures_by_tour = df_melt.pivot(index='Measures', columns='TOUR', values='Values')
df_output = measures_by_tour.reset_index()
df_output

TOUR,Measures,LPGA,PGA
0,avg_difference,96,4
1,avg_win_per_event,25525,120282
2,number_of_events,2266,2282
3,number_of_players,100,100
4,total_prize_money,58410411,256726356


### Clean up the Measure field and create a new column showing the difference between the tours for each measure

In [59]:
# Reorder the columns so PGA comes before LPGA. The explicit .copy() makes the
# result an independent frame, so adding a column below cannot trigger
# pandas' SettingWithCopyWarning.
df_output = df_output[['Measures', 'PGA', 'LPGA']].copy()

# Difference is LPGA minus PGA for every measure.
df_output['Difference between tours'] = df_output['LPGA'] - df_output['PGA']

# NOTE(review): the section heading also asks to clean up the Measure field,
# but the 'Measures' values are left as the raw aggregate names here;
# confirm whether they should be renamed before export.
df_output

TOUR,Measures,PGA,LPGA,Difference between tours
0,avg_difference,4,96,92
1,avg_win_per_event,120282,25525,-94756
2,number_of_events,2282,2266,-16
3,number_of_players,100,100,0
4,total_prize_money,256726356,58410411,-198315945


### Output the data

In [60]:
# Write the final table to CSV. index=False keeps the positional row index
# (0..n-1, left over from reset_index) out of the file; otherwise it is
# written as an extra unnamed first column.
df_output.to_csv(r'output/2021-week6-output.csv', index=False)