In [34]:
#imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#load the dataset we scraped from Elite Prospects
df = pd.read_csv('../hockey_final_project/hockey_data.csv')

#saving the dataframe and loading it back in wrote the old index out as its own column ('Unnamed: 0'), which we don't need
#the 'player' column is dropped as well because the same information already lives in separate columns (playername and position)
df = df.drop(columns=['Unnamed: 0', 'player'])

#every name in the dataset carries trailing whitespace, so strip it once here
df['playername'] = df['playername'].str.strip()

#the stat columns are read in as objects; turn them into real numbers so we can calculate with them
#anything that cannot be parsed becomes NaN (errors='coerce')
numeric_cols = ['gp', 'g', 'a', 'tp', 'ppg', 'pim', '+/-']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

#replace every NaN with 0: these rows are still important, the players are potentially LTIR types of players
#or were listed as available without ever actually playing
df = df.fillna(0)

#the conversion above turned every stat column into a float, so put the counting stats back to int
#(ppg is a rate, so it stays a float)
count_cols = [col for col in numeric_cols if col != 'ppg']
df[count_cols] = df[count_cols].astype(int)

#add goals, assists, penalty minutes and plus/minus per game to better classify what type of player the individual is
#a player whose goals per game is well above their assists per game is more of a scorer
#per game averages beat raw totals because an elite player may have had a season ending injury half way through the season
#pro-rating stats like this is common in the analytics community
#players with 0 games get a divisor of infinity, so their per game value comes out as 0 instead of a division by zero
games_divisor = df['gp'].replace({0: np.inf})
for new_col, source_col in [('gpg', 'g'), ('apg', 'a'), ('pmpg', 'pim'), ('+/-pg', '+/-')]:
    df[new_col] = df[source_col] / games_divisor

#keep only the first four characters of the season string and parse them as a year
df['season'] = pd.to_datetime(df['season'].str[:4], format='%Y')
#df['season'] = df['season'].dt.year

#create a new df that will contain the features that we would like to pass to the model
#df_model = df[['gp', 'tp', 'ppg', 'gpg', 'apg', 'pmpg', '+/-pg', 'season', 'link']]
#df_model
#the code below creates the train and test splits using pandas' sample
#this runs faster than using np rand
#roughly 85 percent train and 15 percent test
#UNCOMMENT THE TWO LINES BELOW TO CREATE SPLITS FOR THE MODEL WHEN READY
#train = df_model.sample(frac=0.85, random_state=100)
#test = df_model.drop(train.index)

Unnamed: 0,gp,tp,ppg,gpg,apg,pmpg,+/-pg,season,link
0,17,32,1.88,1.294118,0.588235,2.352941,0.000000,1918-01-01,https://www.eliteprospects.com/player/23540/ne...
1,18,28,1.56,1.055556,0.500000,1.666667,0.000000,1918-01-01,https://www.eliteprospects.com/player/24049/fr...
2,18,27,1.50,1.166667,0.333333,1.666667,0.000000,1918-01-01,https://www.eliteprospects.com/player/189452/o...
3,18,24,1.33,1.000000,0.333333,3.055556,0.000000,1918-01-01,https://www.eliteprospects.com/player/23973/cy...
4,17,19,1.12,0.823529,0.294118,0.882353,0.000000,1918-01-01,https://www.eliteprospects.com/player/122581/d...
...,...,...,...,...,...,...,...,...,...
45190,18,0,0.00,0.000000,0.000000,0.111111,-0.277778,2021-01-01,https://www.eliteprospects.com/player/312218/m...
45191,24,0,0.00,0.000000,0.000000,3.208333,-0.250000,2021-01-01,https://www.eliteprospects.com/player/64567/ma...
45192,0,0,0.00,0.000000,0.000000,0.000000,0.000000,2021-01-01,https://www.eliteprospects.com/player/220059/f...
45193,0,0,0.00,0.000000,0.000000,0.000000,0.000000,2021-01-01,https://www.eliteprospects.com/player/245108/z...


In [37]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#load our NHL API dataset and drop the 'Unnamed: 0' column in the same step
df_nhl_api = pd.read_csv('../hockey_final_project/nhl_api_dataset.csv').drop(columns=['Unnamed: 0'])

#the height gets converted to plain inches so it can be passed into the model, as it may be useful
#every stat column also gets converted to a per game basis, because raw totals are not very useful when comparing players
#someone who played at a 1.5 ppg pace over 50 games is better than someone who played at a 1 ppg pace over 82 games,
#since pro-rated over a full season the first player would most likely have more points

#this function will convert our height string into two separate values and remove the inch and ft symbols
#it will then return the actual integer value (total inches) to the dataframe column
def single_inch_value(value):
    """Convert a feet-and-inches height string into a total number of inches.

    The expected form is feet, an apostrophe, inches and an optional closing
    double quote. Whitespace around either number is ignored, so the height
    may be written with or without a space after the apostrophe.

    Parameters
    ----------
    value : str
        The height string to convert. None and NaN are treated as missing.

    Returns
    -------
    int or float
        Total inches (feet * 12 + inches) as an int, or NaN when value is
        missing, so a missing height stays missing instead of raising.

    Raises
    ------
    ValueError
        If the string has no apostrophe separating feet from inches, or if
        either part is not a whole number.
    """
    #missing heights (None or a float NaN, which is the only value not equal to itself) pass through as NaN
    if value is None or (isinstance(value, float) and value != value):
        return float('nan')

    #drop the inch symbol, then split on the feet symbol instead of relying on exactly one space between the parts
    feet, separator, inches = value.replace('"', '').partition("'")
    if not separator:
        raise ValueError("height {!r} is not in feet' inches\" format".format(value))

    #int() ignores surrounding whitespace, so any amount of padding around the numbers is accepted
    return int(feet) * 12 + int(inches)

#turn the height column into total inches
df_nhl_api['height'] = df_nhl_api['height'].apply(single_inch_value)

#the stats are put on a per game basis; games played itself is divided by the 82 games in a season
#neural networks function better with values between -1 and 1, so we try to get as close to that range as possible
#NOTE(review): plus_minus_1 / plus_minus_2 are not divided by games anywhere below and reach the model as raw totals - confirm this is intended
GAMES_PER_SEASON = 82
PER_GAME_STATS = ['goals', 'assists', 'hits', 'shots', 'pim']


def _whole_minutes(ice_time):
    """Return the part of the time value before the first ':' as an int (e.g. '1501:48' gives 1501)."""
    return int(str(ice_time).split(':')[0])


#the first and second season columns share the same layout and differ only in their _1 / _2 suffix
for season in ('1', '2'):
    games_col = 'games_' + season

    #points per game must be computed before goals and assists are overwritten with their per game values
    df_nhl_api['ppg_' + season] = (
        df_nhl_api['goals_' + season] + df_nhl_api['assists_' + season]
    ) / df_nhl_api[games_col]

    for stat in PER_GAME_STATS:
        stat_col = stat + '_' + season
        df_nhl_api[stat_col] = df_nhl_api[stat_col] / df_nhl_api[games_col]

    #only the part of the time string in front of the ':' is used, then it is averaged per game
    time_col = 'time_' + season
    df_nhl_api[time_col] = df_nhl_api[time_col].apply(_whole_minutes) / df_nhl_api[games_col]

    #games played is rescaled last because every line above divides by the raw games count
    df_nhl_api[games_col] = df_nhl_api[games_col] / GAMES_PER_SEASON


#the features we want for the model
feature_columns = [
    'games_1', 'games_2', 'goals_1', 'goals_2', 'assists_1', 'assists_2',
    'hits_1', 'hits_2', 'pim_1', 'pim_2', 'plus_minus_1', 'plus_minus_2',
    'shots_1', 'shots_2', 'time_1', 'time_2', 'ppg_1', 'ppg_2', 'ppg_3',
    'weight', 'height', 'position',
]

#the position a player plays has to be one hot encoded before it can be passed to the model
df_model = pd.get_dummies(df_nhl_api[feature_columns], columns=['position'])

#finally deal with the null values: they arise because some players never registered the stats we requested, yet we still received a row for them
#this happens because any player listed on long term injury reserve is still an eligible player, even though most are injury retired
#removing them could introduce some selection bias, so the nulls are replaced with zeros instead
df_model = df_model.fillna(0)

df_model

Unnamed: 0,games_1,games_2,goals_1,goals_2,assists_1,assists_2,hits_1,hits_2,pim_1,pim_2,...,time_2,ppg_1,ppg_2,ppg_3,weight,height,position_C,position_D,position_L,position_R
0,0.804878,1.000000,0.348485,0.317073,0.378788,0.426829,2.348485,2.195122,0.712121,0.597561,...,18.304878,0.727273,0.743902,1.083333,188,72,0,0,1,0
1,1.000000,0.585366,0.317073,0.458333,0.426829,0.625000,2.195122,2.166667,0.597561,0.812500,...,18.020833,0.743902,1.083333,0.871795,188,72,0,0,1,0
2,0.585366,0.951220,0.458333,0.448718,0.625000,0.423077,2.166667,2.012821,0.812500,0.846154,...,19.153846,1.083333,0.871795,0.540541,188,72,0,0,1,0
3,0.951220,0.902439,0.448718,0.229730,0.423077,0.310811,2.012821,2.851351,0.846154,0.756757,...,17.878378,0.871795,0.540541,0.500000,188,72,0,0,1,0
4,0.902439,0.975610,0.229730,0.212500,0.310811,0.287500,2.851351,3.300000,0.756757,0.512500,...,16.812500,0.540541,0.500000,0.408451,188,72,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7979,0.073171,0.817073,0.333333,0.164179,0.000000,0.268657,0.333333,0.492537,0.333333,0.328358,...,14.880597,0.333333,0.432836,0.475000,185,70,1,0,0,0
7980,0.817073,0.975610,0.164179,0.187500,0.268657,0.287500,0.492537,0.650000,0.328358,0.350000,...,15.600000,0.432836,0.475000,0.469136,185,70,1,0,0,0
7981,0.975610,0.987805,0.187500,0.160494,0.287500,0.308642,0.650000,0.234568,0.350000,0.296296,...,13.802469,0.475000,0.469136,0.426471,185,70,1,0,0,0
7982,0.987805,0.829268,0.160494,0.147059,0.308642,0.279412,0.234568,0.176471,0.296296,0.147059,...,14.544118,0.469136,0.426471,0.302326,185,70,1,0,0,0


In [21]:
#create an LSTM to predict, for each player that has played in the league, their ppg for the next season if they were to play
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras import Input
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.preprocessing.sequence import pad_sequences

#display df_nhl_api, the dataframe read from nhl_api_dataset.csv in another cell; this cell does not define it itself
df_nhl_api

Unnamed: 0,name,position,country,birthday,id,height,weight,goals_1,assists_1,pim_1,...,games_2,hits_2,shots_2,time_2,plus_minus_2,team_2,season_1,season_2,season_3,ppg_3
0,Chris Kunitz,L,CAN,1979-09-26,8470543,72,188,23,25,47,...,82,180,230,1501:48,16,Pittsburgh Penguins,20102011,20112012,20122013,1.083333
1,Chris Kunitz,L,CAN,1979-09-26,8470543,72,188,26,35,49,...,48,104,113,865:04,30,Pittsburgh Penguins,20112012,20122013,20132014,0.871795
2,Chris Kunitz,L,CAN,1979-09-26,8470543,72,188,22,30,39,...,78,157,218,1494:16,25,Pittsburgh Penguins,20122013,20132014,20142015,0.540541
3,Chris Kunitz,L,CAN,1979-09-26,8470543,72,188,35,33,66,...,74,211,170,1323:47,2,Pittsburgh Penguins,20132014,20142015,20152016,0.500000
4,Chris Kunitz,L,CAN,1979-09-26,8470543,72,188,17,23,56,...,80,264,150,1345:58,29,Pittsburgh Penguins,20142015,20152016,20162017,0.408451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7979,Derek Ryan,C,USA,1986-12-29,8478585,70,185,2,0,2,...,67,33,73,997:44,-8,Carolina Hurricanes,20152016,20162017,20172018,0.475000
7980,Derek Ryan,C,USA,1986-12-29,8478585,70,185,11,18,22,...,80,52,135,1248:29,-15,Carolina Hurricanes,20162017,20172018,20182019,0.469136
7981,Derek Ryan,C,USA,1986-12-29,8478585,70,185,15,23,28,...,81,19,103,1118:07,21,Calgary Flames,20172018,20182019,20192020,0.426471
7982,Derek Ryan,C,USA,1986-12-29,8478585,70,185,13,25,24,...,68,12,102,989:14,9,Calgary Flames,20182019,20192020,20202021,0.302326
