In [193]:
# https://preppindata.blogspot.com/2021/03/2021-week-13.html

import pandas as pd
import numpy as np
import glob

### Input all the files

In [194]:
file_path = glob.glob(r'data\PD 2021 Wk 13 Input\\' + '*.csv')
file_path

df = pd.concat((pd.read_csv(file) for file in file_path), ignore_index=True)
df.info

<bound method DataFrame.info of        Season                    Name    Position  Appearances  Clean sheets  \
0     2015-16         Rolando Aarons   Midfielder           10           NaN   
1     2015-16             Almen Abdi   Midfielder           32           NaN   
2     2015-16      Abdul Rahman Baba     Defender           15           2.0   
3     2015-16            Mehdi Abeid   Midfielder            0           NaN   
4     2015-16          Tammy Abraham      Forward            2           NaN   
...       ...                     ...         ...          ...           ...   
4242  2019-20   Christoph Zimmermann     Defender           17           3.0   
4243  2019-20    Oleksandr Zinchenko     Defender           19           NaN   
4244  2019-20      Richairo Zivkovic      Forward            5           NaN   
4245  2019-20  Nabili Zoubdi Touaizi      Forward            0           NaN   
4246  2019-20             Kurt Zouma     Defender           28           7.0   

      G

### Remove all goalkeepers from the data set & Remove all records where appearances = 0	


In [195]:
df = df[df['Position'] != 'Goalkeeper']
df = df[df['Appearances'] != 0]
df

Unnamed: 0,Season,Name,Position,Appearances,Clean sheets,Goals conceded,Tackles,Tackle success %,Last man tackles,Blocked shots,...,Shooting accuracy %,Big chances missed,Saves,Penalties saved,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks
0,2015-16,Rolando Aarons,Midfielder,10,,,13.0,77%,,0.0,...,50%,0.0,,,,,,,,
1,2015-16,Almen Abdi,Midfielder,32,,,83.0,78%,,10.0,...,26%,1.0,,,,,,,,
2,2015-16,Abdul Rahman Baba,Defender,15,2.0,13.0,47.0,83%,0.0,1.0,...,,,,,,,,,,
4,2015-16,Tammy Abraham,Forward,2,,,0.0,,,1.0,...,0%,0.0,,,,,,,,
5,2015-16,Charlie Adam,Midfielder,22,,,18.0,78%,,9.0,...,14%,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4237,2019-20,Wilfried Zaha,Forward,38,,,43.0,,,22.0,...,28%,5.0,,,,,,,,
4242,2019-20,Christoph Zimmermann,Defender,17,3.0,27.0,32.0,66%,1.0,1.0,...,,,,,,,,,,
4243,2019-20,Oleksandr Zinchenko,Defender,19,,,31.0,61%,,7.0,...,19%,0.0,,,,,,,,
4244,2019-20,Richairo Zivkovic,Forward,5,,,2.0,,,1.0,...,0%,0.0,,,,,,,,


### In this challenge we are interested in the goals scored from open play
- Create a new “Open Play Goals” field (the goals scored from open play is the number of goals scored that weren’t penalties or freekicks)
- Note some players will have scored free kicks or penalties with their left or right foot
- Be careful how Prep handles null fields! (have a look at those penalty and free kick fields) 
- Rename the original Goals scored field to Total Goals Scored

In [196]:
#Create a new “Open Play Goals” field (the goals scored from open play is the number of goals scored that weren’t penalties or freekicks)
df['Open Play Goals'] = df['Goals'] - (df['Penalties scored'] + df['Freekicks scored'])

#Rename the original Goals scored field to Total Goals Scored
df.rename(columns={'Goals':'Total Goals'}, inplace=True)

#substitute na with 0
df = df.fillna(0)
df

Unnamed: 0,Season,Name,Position,Appearances,Clean sheets,Goals conceded,Tackles,Tackle success %,Last man tackles,Blocked shots,...,Big chances missed,Saves,Penalties saved,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Open Play Goals
0,2015-16,Rolando Aarons,Midfielder,10,0.0,0.0,13.0,77%,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2015-16,Almen Abdi,Midfielder,32,0.0,0.0,83.0,78%,0.0,10.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2015-16,Abdul Rahman Baba,Defender,15,2.0,13.0,47.0,83%,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2015-16,Tammy Abraham,Forward,2,0.0,0.0,0.0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2015-16,Charlie Adam,Midfielder,22,0.0,0.0,18.0,78%,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4237,2019-20,Wilfried Zaha,Forward,38,0.0,0.0,43.0,0,0.0,22.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4242,2019-20,Christoph Zimmermann,Defender,17,3.0,27.0,32.0,66%,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4243,2019-20,Oleksandr Zinchenko,Defender,19,0.0,0.0,31.0,61%,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4244,2019-20,Richairo Zivkovic,Forward,5,0.0,0.0,2.0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculate the totals for each of the key metrics across the whole time period for each player, (be careful not to lose their position)

In [197]:
df_agg = df.groupby(['Name', 'Position']).sum().reset_index()
df_agg


Unnamed: 0,Name,Position,Season,Appearances,Clean sheets,Goals conceded,Tackles,Tackle success %,Last man tackles,Blocked shots,...,Big chances missed,Saves,Penalties saved,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Open Play Goals
0,Aaron Connolly,Forward,2019-20,24,0.0,0.0,7.0,0,0.0,13.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,Aaron Cresswell,Defender,2015-162016-172017-182018-192019-20,150,31.0,222.0,165.0,61%71%62%55%60%,1.0,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aaron Lennon,Midfielder,2015-162016-172017-182018-192019-20,97,0.0,0.0,125.0,71%92%65%62%57%,0.0,14.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,Aaron Mooy,Midfielder,2017-182018-192019-20,96,0.0,0.0,201.0,59%68%65%,0.0,37.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,Aaron Ramsey,Midfielder,2015-162016-172017-182018-19,106,0.0,0.0,168.0,76%48%77%53%,0.0,58.0,...,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,Álvaro Arbeloa,Defender,2016-17,3,0.0,9.0,5.0,80%,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,Álvaro Morata,Forward,2017-182018-19,47,0.0,0.0,21.0,0,0.0,25.0,...,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
1017,Álvaro Negredo,Forward,2016-17,36,0.0,0.0,18.0,0,0.0,19.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
1018,Çaglar Söyüncü,Defender,2018-192019-20,40,12.0,37.0,65.0,40%67%,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create an open play goals per appearance field across the whole time period

In [198]:
df_agg['Open Play Goals / Game'] = df_agg['Open Play Goals'] / df_agg['Appearances']
df_agg.head(3)

Unnamed: 0,Name,Position,Season,Appearances,Clean sheets,Goals conceded,Tackles,Tackle success %,Last man tackles,Blocked shots,...,Saves,Penalties saved,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Open Play Goals,Open Play Goals / Game
0,Aaron Connolly,Forward,2019-20,24,0.0,0.0,7.0,0,0.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.125
1,Aaron Cresswell,Defender,2015-162016-172017-182018-192019-20,150,31.0,222.0,165.0,61%71%62%55%60%,1.0,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aaron Lennon,Midfielder,2015-162016-172017-182018-192019-20,97,0.0,0.0,125.0,71%92%65%62%57%,0.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.061856


### Rank the players for the amount of open play goals scored across the whole time period, we are only interested in the top 20 (including those that are tied for position) - Output1

In [199]:
df_output1 = df_agg.copy()
df_output1
df_output1['Rank'] = df_output1['Open Play Goals'].rank(ascending=False)
df_output1['Rank'] = df_output1['Rank'].astype(int)

df_output1 = df_output1[df_output1['Rank']<=20]
df_output1.sort_values(by='Rank', ascending=True, inplace=True)

df_output1 = df_output1[['Open Play Goals','Goals with right foot','Goals with left foot','Position','Appearances','Rank','Total Goals','Open Play Goals / Game','Headed goals','Name']]
df_output1.head(3)

Unnamed: 0,Open Play Goals,Goals with right foot,Goals with left foot,Position,Appearances,Rank,Total Goals,Open Play Goals / Game,Headed goals,Name
885,86.0,68.0,17.0,Forward,143,1,102,0.601399,16.0,Sergio Agüero
428,80.0,61.0,25.0,Forward,177,2,98,0.451977,12.0,Jamie Vardy
854,74.0,44.0,20.0,Forward,164,3,74,0.45122,10.0,Sadio Mané
837,68.0,18.0,37.0,Forward,140,4,71,0.485714,15.0,Romelu Lukaku
719,66.0,11.0,58.0,Forward,108,5,73,0.611111,4.0,Mohamed Salah
832,55.0,29.0,14.0,Forward,175,6,57,0.314286,13.0,Roberto Firmino
902,53.0,29.0,21.0,Forward,160,7,53,0.33125,3.0,Son Heung-Min
83,49.0,36.0,11.0,Forward,145,8,51,0.337931,4.0,Anthony Martial
236,48.0,31.0,10.0,Midfielder,156,9,50,0.307692,9.0,Dele Alli
368,48.0,32.0,15.0,Forward,75,9,55,0.64,7.0,Harry Kane


### Rank the players for the amount of open play goals scored across the whole time period by position, we are only interested in the top 20 (including those that are tied for position) – Output 2

In [202]:
df_output2 = df_agg.copy()

df_output2['Rank'] = df_output2.groupby('Position')['Open Play Goals'].rank(ascending=False)
df_output2['rank'] = df_output2['Rank'].astype(int)
df_output2 = df_output2[df_output2['Rank']<=20]
df_output2.sort_values(by='Rank', ascending=True, inplace=True)

df_output2 = df_output2[['Open Play Goals','Goals with right foot','Goals with left foot','Position','Appearances','Rank','Total Goals','Open Play Goals / Game','Headed goals','Name']]
df_output2.head(3)

Unnamed: 0,Open Play Goals,Goals with right foot,Goals with left foot,Position,Appearances,Rank,Total Goals,Open Play Goals / Game,Headed goals,Name
885,86.0,68.0,17.0,Forward,143,1.0,102,0.601399,16.0,Sergio Agüero
236,48.0,31.0,10.0,Midfielder,156,1.0,50,0.307692,9.0,Dele Alli
577,1.0,1.0,0.0,Defender,18,1.0,1,0.055556,0.0,Kortney Hause


### Output the data


In [201]:
df_output1.to_csv(r'output/2021-week13-output1.csv')
df_output2.to_csv(r'output/2021-week13-output2.csv')