In [20]:
import pandas as pd

In [21]:
# Path of the raw (uncleaned) 2020 table, relative to this notebook.
file = '../data/unclean2020.csv'
# The file is decoded as Latin-1 rather than pandas' default UTF-8.
nhl2020_data = pd.read_csv(filepath_or_buffer=file, encoding="latin-1")

In [22]:
# Preview the first five raw rows to check which columns were loaded.
nhl2020_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Rk,Player,Age,Tm,Pos,GP,G,A,PTS,...,SH_A,S,S%,TOI,ATOI,BLK,HIT,FOW,FOL,FO%
0,0,1,Justin Abdelkader,32,DET,LW,49,0,3,3,...,0,40,0.0,565,11:32,26,103,38,31,55.1
1,1,2,Pontus Aberg,26,TOR,LW,5,0,1,1,...,0,4,0.0,44,8:42,1,1,0,0,
2,2,3,Vitaly Abramov,21,OTT,RW,2,1,0,1,...,0,3,33.3,12,5:47,0,0,0,0,
3,3,4,Noel Acciari,28,FLA,C,66,20,7,27,...,0,108,18.5,1054,15:58,103,121,381,414,47.9
4,4,5,Andrew Agozzino,29,TOT,LW,22,1,2,3,...,0,10,10.0,162,7:21,7,25,38,37,50.7


In [23]:
# Discard the leftover saved-index column and the "Rk" column; a new frame
# is returned, so the raw `nhl2020_data` stays untouched.
df = nhl2020_data.drop(["Unnamed: 0", "Rk"], axis="columns")
df.head(n=5)

Unnamed: 0,Player,Age,Tm,Pos,GP,G,A,PTS,+/-,PIM,...,SH_A,S,S%,TOI,ATOI,BLK,HIT,FOW,FOL,FO%
0,Justin Abdelkader,32,DET,LW,49,0,3,3,-14,25,...,0,40,0.0,565,11:32,26,103,38,31,55.1
1,Pontus Aberg,26,TOR,LW,5,0,1,1,0,0,...,0,4,0.0,44,8:42,1,1,0,0,
2,Vitaly Abramov,21,OTT,RW,2,1,0,1,0,2,...,0,3,33.3,12,5:47,0,0,0,0,
3,Noel Acciari,28,FLA,C,66,20,7,27,2,21,...,0,108,18.5,1054,15:58,103,121,381,414,47.9
4,Andrew Agozzino,29,TOT,LW,22,1,2,3,3,4,...,0,10,10.0,162,7:21,7,25,38,37,50.7


In [24]:
# "TOT" in the team column marks players that were traded mid-season;
# keep only the rows that belong to an actual team.
df = df.loc[df["Tm"].ne("TOT")]
# List the remaining team codes to confirm "TOT" is gone.
df["Tm"].unique()

array(['DET', 'TOR', 'OTT', 'FLA', 'PIT', 'ANA', 'CAR', 'MTL', 'LAK',
       'NJD', 'CBJ', 'NYR', 'CGY', 'PHI', 'WPG', 'EDM', 'NSH', 'BUF',
       'BOS', 'WSH', 'VAN', 'NYI', 'STL', 'COL', 'CHI', 'DAL', 'SJS',
       'VEG', 'TBL', 'MIN', 'ARI'], dtype=object)

In [25]:
# Orders the rows by team (A-Z) and, within each team, by points from
# highest to lowest. The per-team "top N" cells below rely on this order.
df = df.sort_values(by=['Tm', 'PTS'], ascending=[True, False])

In [26]:
# Top 3 defensemen by points for each team: because `df` is already sorted
# by team and descending points, the first 3 "D" rows per team are the leaders.
d_df = df.loc[df["Pos"].eq("D")].groupby('Tm').head(3)

In [27]:
# Top 7 forwards by points for each team: every position other than "D" is
# treated as a forward, and the first 7 such rows per team are taken from the
# already-sorted `df`.
f_df = df.loc[df["Pos"].ne("D")].groupby('Tm').head(7)

In [28]:
# Builds a df of every player who is NOT among the top 7 forwards or the
# top 3 defensemen of their respective team.
def _rows_not_in(left, right):
    """Return the rows of `left` that have no identical row in `right` (an anti-join).

    The frames are merged on all of their shared columns, so a row only counts
    as a match when every column value agrees. The result keeps the merge's
    positional index, with gaps where matched rows were removed.
    """
    # A left join is sufficient because only "left_only" rows are kept, and it
    # preserves the row order of `left`.
    merged = left.merge(right, how='left', indicator='i')
    # `columns=` replaces the positional axis argument (`drop('i', 1)`), which
    # pandas 2.0 no longer accepts.
    return merged.query('i == "left_only"').drop(columns='i')


# Remove the top forwards first, then the top defensemen.
all_pos_df = _rows_not_in(_rows_not_in(df, f_df), d_df)
all_pos_df.head(50)

Unnamed: 0,Player,Age,Tm,Pos,GP,G,A,PTS,+/-,PIM,...,SH_A,S,S%,TOI,ATOI,BLK,HIT,FOW,FOL,FO%
2,Nick Ritchie,24,ANA,LW,41,8,11,19,3,78,...,0,70,11.4,590,14:23,13,79,1,7,12.5
3,Carter Rowney,30,ANA,RW,71,8,11,19,5,14,...,2,63,12.7,910,12:49,42,189,47,56,45.6
5,Troy Terry,22,ANA,C,47,4,11,15,-5,6,...,0,73,5.5,682,14:31,9,3,3,5,37.5
6,Nicolas Deslauriers,28,ANA,LW,59,7,6,13,-1,92,...,0,64,10.9,577,9:47,32,137,0,0,
7,Max Jones,21,ANA,LW,59,8,4,12,-6,36,...,0,102,7.8,842,14:16,18,82,11,16,40.7
8,Max Comtois,21,ANA,LW,29,5,6,11,-4,24,...,0,32,15.6,405,13:58,8,61,3,13,18.8
9,Jacob Larsson,22,ANA,D,60,2,9,11,-9,12,...,0,58,3.4,969,16:09,57,34,0,0,
10,Devin Shore,25,ANA,C,39,4,6,10,-8,8,...,0,47,8.5,468,11:59,13,47,39,42,48.1
11,Erik Gudbranson,28,ANA,D,44,4,5,9,0,91,...,1,76,5.3,882,20:02,56,76,0,0,
12,Josh Manson,28,ANA,D,50,1,8,9,-10,37,...,1,64,1.6,1032,20:38,65,113,0,0,


In [29]:
# Drops players with too few games played. The usual 25-game cutoff is scaled
# by 68/82 because the 19-20 season was cut short to only 68 games, giving a
# threshold of about 20.7 -- i.e. players with 21 or more GP are kept.
gp_cutoff = (68/82) * 25
all_pos_df = all_pos_df.loc[all_pos_df["GP"].gt(gp_cutoff)]
all_pos_df

Unnamed: 0,Player,Age,Tm,Pos,GP,G,A,PTS,+/-,PIM,...,SH_A,S,S%,TOI,ATOI,BLK,HIT,FOW,FOL,FO%
2,Nick Ritchie,24,ANA,LW,41,8,11,19,3,78,...,0,70,11.4,590,14:23,13,79,1,7,12.5
3,Carter Rowney,30,ANA,RW,71,8,11,19,5,14,...,2,63,12.7,910,12:49,42,189,47,56,45.6
5,Troy Terry,22,ANA,C,47,4,11,15,-5,6,...,0,73,5.5,682,14:31,9,3,3,5,37.5
6,Nicolas Deslauriers,28,ANA,LW,59,7,6,13,-1,92,...,0,64,10.9,577,9:47,32,137,0,0,
7,Max Jones,21,ANA,LW,59,8,4,12,-6,36,...,0,102,7.8,842,14:16,18,82,11,16,40.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722,Brendan Leipsic,25,WSH,LW,61,3,8,11,3,13,...,0,49,6.1,559,9:10,20,57,9,12,42.9
723,Travis Boyd,26,WSH,C,24,3,7,10,9,2,...,0,26,11.5,282,11:45,9,14,50,69,42.0
724,Jonas Siegenthaler,22,WSH,D,64,2,7,9,11,43,...,0,49,4.1,1007,15:44,105,51,0,0,
725,Nick Jensen,29,WSH,D,68,0,8,8,1,13,...,0,61,0.0,1212,17:49,71,59,0,0,


In [41]:
# Age filter with a position-dependent limit: defensemen must be younger
# than 31, every other position younger than 30.
is_young_defense = (all_pos_df["Pos"] == "D") & (all_pos_df["Age"] < 31)
is_young_other = (all_pos_df["Pos"] != "D") & (all_pos_df["Age"] < 30)
age_adj_df = all_pos_df[is_young_defense | is_young_other]
# Sanity check: the oldest remaining player should be at most 30.
age_adj_df["Age"].max()

30

In [43]:
# Preview the first five rows of the age-filtered frame.
age_adj_df.head(n=5)

Unnamed: 0,Player,Age,Tm,Pos,GP,G,A,PTS,+/-,PIM,...,SH_A,S,S%,TOI,ATOI,BLK,HIT,FOW,FOL,FO%
2,Nick Ritchie,24,ANA,LW,41,8,11,19,3,78,...,0,70,11.4,590,14:23,13,79,1,7,12.5
5,Troy Terry,22,ANA,C,47,4,11,15,-5,6,...,0,73,5.5,682,14:31,9,3,3,5,37.5
6,Nicolas Deslauriers,28,ANA,LW,59,7,6,13,-1,92,...,0,64,10.9,577,9:47,32,137,0,0,
7,Max Jones,21,ANA,LW,59,8,4,12,-6,36,...,0,102,7.8,842,14:16,18,82,11,16,40.7
8,Max Comtois,21,ANA,LW,29,5,6,11,-4,24,...,0,32,15.6,405,13:58,8,61,3,13,18.8


In [44]:
# Persist the cleaned table. The DataFrame index is written as the first
# (unnamed) column because `index` is left at its default of True.
age_adj_df.to_csv(path_or_buf="../data/clean2020.csv")