In [140]:
#imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#turn off pandas' chained-assignment warning for the rest of the notebook (pandas' own default is 'warn')
pd.options.mode.chained_assignment = None


#load the dataset we scraped from Elite Prospects
df = pd.read_csv('../hockey_final_project/hockey_data.csv')

#saving the dataframe and loading it back in left the old index behind as the 'Unnamed: 0' column, and the
#'player' column repeats information already held in separate columns (playername and position) - drop both
df = df.drop(columns=['Unnamed: 0', 'player'])

#tidy up the names in two passes: first strip the trailing whitespace every name carries, then replace accented
#letters with their plain variant (NFKD splits the accent off the letter, the ascii encode with errors='ignore' discards it)
stripped_names = df['playername'].str.strip()
df['playername'] = stripped_names.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

#the stat columns are read in as objects; coerce them to real numbers so we can calculate with them
#(errors='coerce' turns anything that cannot be parsed into NaN)
stat_columns = ['gp', 'g', 'a', 'tp', 'ppg', 'pim', '+/-']
df[stat_columns] = df[stat_columns].apply(pd.to_numeric, errors='coerce')

#replace every NaN in the frame with 0 - those rows still matter, the players are potentially ltir types
#or were listed as available without ever actually playing
df = df.fillna(0)

#the numeric conversion left every stat as a double; everything except ppg is a whole number, so cast those back to int
count_columns = [name for name in stat_columns if name != 'ppg']
df[count_columns] = df[count_columns].astype(int)

#add per game versions of goals, assists, penalty minutes and plus minus to better classify what type of player someone is
#e.g. a goals per game well above the assists per game points to a scorer rather than a playmaker
#per game rates are preferred over raw totals because an elite player may have lost half a season to injury
#pro rating stats like this is common in the analytics community

#rows with 0 games played get an infinite denominator, so their per game value comes out as 0 instead of a division by zero
games_played = df['gp'].replace({0 : np.inf})

#new per game column -> the raw total it is derived from
for per_game_column, total_column in {'gpg': 'g', 'apg': 'a', 'pmpg': 'pim', '+/-pg': '+/-'}.items():
    df[per_game_column] = df[total_column] / games_played

#keep only the first 4 characters of season and parse them as a year, giving a datetime column
#(follow up with .dt.year if a plain integer year is wanted instead of a timestamp)
df['season'] = pd.to_datetime(df['season'].str[:4], format='%Y')

#create a new df that will contain the features that we would like to pass to the model
#an explicit copy is taken because the position column is overwritten below; assigning into a bare
#column selection of df is exactly what pandas' chained-assignment warning is about
df_model = df[['gp', 'tp', 'ppg', 'gpg', 'apg', 'pmpg', '+/-pg', 'season', 'playername', 'link', 'position']].copy()

#create a primary position for all players so we can reduce the amount of
#columns created from pd.get_dummies, the first letter is the primary position the player
#played so taking character 0 works fine in this case
df_model['position'] = df_model['position'].str[0]

#one-hot encode the primary position so it can be passed into the model
#some letters overlap in meaning (position_F stands for forward and position_W stands for wing),
#which could potentially cause issues however that remains to be seen
df_model = pd.get_dummies(df_model, columns=['position'])

#create a dict which links each playername to the link of their Elite Prospects page
#the name and EP link of the closest comparable player found by the KNN can then be returned together,
#so someone can take a look at their page
#this is built once, here, after df_model has reached its final shape
#NOTE(review): the dict is keyed by playername, so players who share a name collapse into a single entry
#(the last row for that name wins) - the EP link is what actually tells such players apart, so key on link
#instead if same-name players need to be kept separate
players_and_ep_links = pd.Series(df_model['link'].values, index=df_model['playername']).to_dict()


In [141]:
#create a KNN to find the closest comparable player given their aggregated career stats
from sklearn.neighbors import NearestNeighbors

#create a new df for the KNN - season and link are not used as features, so they are left out
df_knn = df_model.drop(columns=['season', 'link'])

#collapse every player's seasons into one career row: games played and total points are summed,
#the per game rates and the position dummies are averaged
#handing agg plain function names (instead of one-item lists) keeps the columns single level,
#so there is no multiindex to flatten afterwards
career_totals = ['gp', 'tp']
career_averages = ['ppg', 'gpg', 'apg', 'pmpg', '+/-pg',
                   'position_C', 'position_D', 'position_F',
                   'position_L', 'position_R', 'position_W']
how_to_aggregate = {**dict.fromkeys(career_totals, 'sum'), **dict.fromkeys(career_averages, 'mean')}
group = df_knn.groupby('playername').agg(how_to_aggregate)

#create a nearest neighbours for our dataset and fit it
#NOTE(review): the features go in unscaled, so the summed gp/tp columns are on a far larger scale than the
#per game rates - confirm that is intended; radius is also set here although only kneighbors is used further down
nn = NearestNeighbors(radius=0.5, algorithm='auto')
KNN_model = nn.fit(group)


#function that hands back our closest comparable player(s) along with their stats, pulled directly from the knn dataframe
def compareable(dataset, model, player, n_neighbors=2):
    """Return the rows of ``dataset`` that ``model`` ranks nearest to ``player``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table whose index holds the player labels. The positions that
        ``model.kneighbors`` returns are looked up with ``dataset.iloc``, so this
        is assumed to be the same frame, in the same row order, that ``model``
        was fitted on.
    model : object
        Fitted estimator exposing
        ``kneighbors(X, n_neighbors=..., return_distance=...)``.
    player : hashable
        Index label of the row to find neighbours for.
    n_neighbors : int, default 2
        How many rows to return. When the queried row is part of the fitted
        data it is normally its own nearest neighbour, so the default gives
        the player plus their single closest comparable.

    Returns
    -------
    pandas.DataFrame
        The ``n_neighbors`` nearest rows, nearest first. The frame is returned
        rather than printed so it can be reused; as the last expression of a
        notebook cell it is displayed automatically.

    Raises
    ------
    KeyError
        If ``player`` is not a label in ``dataset.index``.
    """
    #query with a one-row DataFrame instead of a list holding a Series, so the column names
    #the model was fitted with travel along with the query
    query = dataset.loc[[player]]

    #only one row was queried, so take the first (and only) row of neighbour positions
    neighbour_positions = model.kneighbors(query, n_neighbors=n_neighbors, return_distance=False)[0]

    return dataset.iloc[list(neighbour_positions)]


#example lookup: find the rows of group nearest to Sidney Crosby using the fitted KNN_model
compareable(group, KNN_model, 'Sidney Crosby')

                 gp    tp       ppg       gpg       apg      pmpg     +/-pg  \
playername                                                                    
Sidney Crosby  1103  1405  1.306471  0.466808  0.838021  0.639822  0.222552   
Guy Lafleur    1126  1353  1.136471  0.466961  0.669305  0.356242  0.341604   

               position_C  position_D  position_F  position_L  position_R  \
playername                                                                  
Sidney Crosby         1.0         0.0         0.0         0.0         0.0   
Guy Lafleur           0.0         0.0         0.0         0.0         1.0   

               position_W  
playername                 
Sidney Crosby           0  
Guy Lafleur             0  


In [238]:
#create an LSTM to predict, for every player who has played in the league, the ppg of their next season if they were to play
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras import Input
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.preprocessing.sequence import pad_sequences

#small helper that moves the season column into the index so the frame can be passed to the LSTM
#it does not convert anything itself, it only re-indexes; not really important, it just helps keep the code nice
def set_datetime(df):
    """Return a new frame indexed by the 'season' column of ``df``; ``df`` itself is left unchanged."""
    return df.set_index('season')



#create a new dataframe with a timeseries (season) index to use with our LSTM model
df_date = set_datetime(df_model)

#the stat columns that take part in the shift; playername, link and the position dummies stay where they are
columns_to_shift = ['gp','tp','ppg','gpg','apg','pmpg','+/-pg']

#shift our dataset to predict ppg for any given player
#within each player (grouped on link) every stat is pulled up by one row, so a row ends up holding the stats
#of the row that follows it; the final row of each player has nothing after it and becomes NaN
#intent from the original notes: use the two previous seasons to predict the next season, and use the last row
#entry of each link (like [-1] on what the LSTM hands back) to predict the next season if they were to play
#NOTE(review): this relies on each player's rows already being ordered oldest season first - nothing here sorts them, confirm
#NOTE(review): the shifted values overwrite the stat columns in df_date; the unshifted values remain only in df_model
following_row_stats = df_date.groupby('link')[columns_to_shift].shift(-1)
df_date[columns_to_shift] = following_row_stats


#create LSTM model: an LSTM layer with 32 units feeding a single-unit Dense output
lstm_model = Sequential([LSTM(32), Dense(1)])


In [239]:
#inspect the shifted rows of a single player to sanity check the result
df_date.loc[df_date['playername']=='Sidney Crosby']


Unnamed: 0_level_0,gp,tp,ppg,gpg,apg,pmpg,+/-pg,playername,link,position_C,position_D,position_F,position_L,position_R,position_W
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2005-01-01,79.0,120.0,1.52,0.455696,1.063291,0.759494,0.126582,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2006-01-01,53.0,72.0,1.36,0.45283,0.90566,0.735849,0.339623,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2007-01-01,77.0,103.0,1.34,0.428571,0.909091,0.987013,0.038961,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2008-01-01,81.0,109.0,1.35,0.62963,0.716049,0.876543,0.185185,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2009-01-01,41.0,66.0,1.61,0.780488,0.829268,0.756098,0.487805,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2010-01-01,22.0,37.0,1.68,0.363636,1.318182,0.636364,0.681818,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2011-01-01,36.0,56.0,1.56,0.416667,1.138889,0.444444,0.722222,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2012-01-01,80.0,104.0,1.3,0.45,0.85,0.575,0.225,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2013-01-01,77.0,84.0,1.09,0.363636,0.727273,0.61039,0.064935,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
2014-01-01,80.0,85.0,1.06,0.45,0.6125,0.525,0.2375,Sidney Crosby,https://www.eliteprospects.com/player/6146/sid...,1,0,0,0,0,0
