<a href="https://colab.research.google.com/github/SeanBoyd13/Similarity-Weighted-Translation-Projection-Model/blob/main/code/colab_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import json

from datetime import datetime

all_translations = []


years = [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]

def calc_age(season, year, month, day):
    age = int(season)-int(year) + (1-((int(month)-9)/12 + (int(day)-15)/365)) - 1
    return round(age,2)

def get_primary_league(player, year):
    gp = 0
    for season in player.json()['seasonTotals']:
        if season['season'] == year and season['points'] > 0 and season['gamesPlayed'] > gp:
            league = season['leagueAbbrev']
            gp = season['gamesPlayed']
            g = season['goals']
            pts = season['points']
            ppg = pts/gp

    if gp == 0:
        return None, None, None, None, None
    else:
        return league, gp, g, pts, ppg

for draft_year in years:
    draft = requests.get('https://records.nhl.com/site/api/draft?include=draftProspect.id&cayenneExp=%20draftYear%20=%20'+str(draft_year))

    data = draft.json()

    for player in data['data']:
        position = player.get('position')
        pick = player.get('overallPickNumber')
        id = player.get('playerId')

        if (id is None) or (position == 'G'):
            continue

        if position != 'D':
            position = 'F'

        first = player.get('firstName')
        last = player.get('lastName')

        try:
            player = requests.get('https://api-web.nhle.com/v1/player/' + str(id) + '/landing')

        except:
            continue

        height = player.json()['heightInInches']
        weight = player.json()['weightInPounds']

        bd_year, bd_month, bd_day = player.json()['birthDate'].split('-')

        these_years = []
        checked_years = []

        start = draft_year - 3
        for i in range(15):
            y = start + i
            these_years.append(y)

        for j in range(len(these_years)):
            season = these_years[j]

            if season in checked_years:
                continue

            else:
                try:
                    league, gp, g, pts, ppg = get_primary_league(player, season*10000 + season + 1)
                    league2, gp2, g2, pts2, ppg2 = get_primary_league(player, season*10000 + season + 10002)

                    if league is None or league2 is None:
                        continue

                    age = calc_age(round(season), bd_year, bd_month, bd_day)

                    translation = round(ppg2/ppg,2)

                    print(f'{draft_year} ({pick}) -  {first} {last}')

                    for l in range(len(all_translations)+1):
                        try:
                            if all_translations[l][0][1] == league: #Check one translation in that set to see what the source league is, if same:
                                all_translations[l].append([first, last, position, league, league2, height, weight, age, gp, g, pts, round(ppg,2), translation, season])
                                checked_years.append(season)
                                break

                        except IndexError:
                            all_translations.append([[first, last, position, league, league2, height, weight, age, gp, g, pts, round(ppg, 2), translation, season]])
                            checked_years.append(season)
                            break

                except KeyError:
                    continue

def save_translations(all_translations, csv_file='all_translations_2015_to_2024.csv'):
  import csv
  with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['First', 'Last', 'Position', 'Source League', 'Target League', 'Height', 'Weight', 'Age', 'Games Played', 'Goals', 'Points', 'PPG', 'Translation', 'Year'])

    for translation_set in all_translations:
      for translation in translation_set:
        writer.writerow(translation)
  print(f'Saved translations to {csv_file}')

save_translations(all_translations) # - Last Saved: 2005-2014 Draft Data

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
from google.colab import files
files.download('all_translations_2015_to_2024.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
from tempfile import tempdir
from google.colab import files

uploaded = files.upload()

def load_translations(csv_file='all_translations_2015_to_2024.csv'):
  import csv
  translations = []
  with open(csv_file, mode='r') as file:
    reader = csv.reader(file)
    next(reader)
    leagues_dict = {}

    for row in reader:
      first, last, position, league, league2, height, weight, age, gp, g, pts, ppg, translation, year = row
      record = [first, last, position, league, league2, int(height), int(weight), float(age), int(gp), int(g), int(pts), float(ppg), float(translation), int(year)]

      if league in leagues_dict:
        leagues_dict[league].append(record)

      else:
        leagues_dict[league] = [record]

    all_translations = list(leagues_dict.values())

  return all_translations

all_translations = load_translations()

Saving all_translations_2015_to_2024.csv to all_translations_2015_to_2024.csv


In [4]:
# From the dataset contained in the csv file, we select the subset of players of the same position who played in the same league.
# We use this to produce a dictionary of all leagues transferred into by ten or greater players, each league as a key and each value
# as the number of players who made this move. For non-NHL leagues, we check if this "intermediate" league has had ten or greater
# players transfer to the NHL. If it does, we simulate the player's season in said "intermediate" league using a similarity based
# translation factor. We repeat this similarity based translation process for the "intermediate" league to the NHL. The same is
# performed for direct to NHL translation, bypassing the intermediate league entirely. For paths which include an intermediate league
# an age back-tracing is performed to approximate the translation if the player had gone directly to the NHL and bypassed the
# intermediate league. All valid paths to an NHL approximate translation are then combined via a weighted average using the dictionary
# to construct an overall projected translation. Multiplying this value by the player's current points per game played in their current
# league played by 82 games (total in an NHL season) will produce an approximation for the players NHL point total in the suceeding season.


# Index:         0         1      2            3         4         5         6         7      8     9    10     11        12             13
# Player:       [{first}, {last}, {position}, {league},  None,     {height}, {weight}, {age}, {gp}, {g}, {pts}, {pts/gp}, {year}               ]
# Translation:  [{first}, {last}, {position}, {league}, {league2}, {height}, {weight}, {age}, {gp}, {g}, {pts}, {pts/gp}, {translation}, {year}]

from scipy.stats import percentileofscore

FORWARD_CONST_1 = -0.03114819361
FORWARD_CONST_2 = -0.0001900332976
FORWARD_CONST_3 = 1.803926041

DEFENSE_CONST_1 = -0.02790804243
DEFENSE_CONST_2 = -0.0001089869711
DEFENSE_CONST_3 = 1.701113596

def age_factor(position, age, height):
  # Returns an expected improvement factor (derived from a linear regression) filtered by position and based on age and height
    if position == 'F':
        return FORWARD_CONST_1 * age + FORWARD_CONST_2 * height + FORWARD_CONST_3
    elif position == 'D':
        return DEFENSE_CONST_1 * age + DEFENSE_CONST_2 * height + DEFENSE_CONST_3
    else:
        return None

def get_weighted_avg(player, succeeding, top_translations, feature):
  # Returns the weighted average of a specific feature favoring more similar player profiles
  try:
    for league_set in all_translations:
      sample_translation = league_set[0]
      if player[3] == sample_translation[3]:
        source_set = league_set

    neighbors = []
    for source_translation in source_set:
      if player[2] == source_translation[2] and (succeeding is None or (succeeding is not None and succeeding == source_translation[4])) and (source_translation[0]+source_translation[1] != player[0]+player[1]) and (source_translation[13] < (player[12]-1)):
        neighbors.append(source_translation)

    distances = []

    for neighbor in neighbors:
      sum = 0

      for i in range(7):
        weights = [5, 5, 5, 5, 5, 5, 5]
        all_feature = []

        for n in neighbors:
          if n == 2:
            all_feature.append(int(n[i+5]))
          else:
            all_feature.append(n[i+5])

        if i == 2:
          player_percentile = percentileofscore(all_feature, int(player[i+5]))
          neighbor_percentile = percentileofscore(all_feature, int(neighbor[i+5]))
          sum += abs(weights[i]*(player_percentile - neighbor_percentile)**2)

        else:
          player_percentile = percentileofscore(all_feature, player[i+5])
          neighbor_percentile = percentileofscore(all_feature, neighbor[i+5])
          sum += abs(weights[i]*(player_percentile - neighbor_percentile)**2)

      distances.append([neighbor, round(sum/204,2)])

    distances.sort(key=lambda x: x[1])

    verbose = False

    if verbose is True:
      if succeeding is not None:
        cutoff = 0
        print(f'{player[3]} > {succeeding}')
        for n in distances:
          cutoff += 1
          if cutoff > 3:
            break
          else:
            print(f'{cutoff}. {n[0]} - {n[1]} - {n[0][12]}')

    weighted_sum = 0
    weights = 0

    for n in range(top_translations):
      neighbor = distances[n][0]
      distance = distances[n][1]

      weighted_sum += neighbor[feature] * (1/(distance+0.00001))
      weights += (1/(distance+0.00001))

    return weighted_sum/weights
  except:
    return None


def predict_paths(player):
  # Returns the potential paths for a player next season in a dictionary with their historical frequency
  paths = {}

  try:
    for league_set in all_translations:
      sample_translation = league_set[0]
      if player[3] == sample_translation[3]:
        source_set = league_set
        break

    neighbors = []
    for source_translation in source_set:
      if player[2] == source_translation[2] and (source_translation[0]+source_translation[1] != player[0]+player[1]) and (source_translation[13] < (player[12]-1)):
          neighbors.append(source_translation)

  except:
    current = player[3]
    paths[current] = 1
    return paths

  for translation in neighbors:

    next_league = translation[4]

    if next_league in paths:
      paths[next_league] += 1
    else:
      paths[next_league] = 1

  sorted_paths = dict(sorted(paths.items(), key=lambda item: item[1], reverse=True))

  new_paths = {}

  for league in sorted_paths:
    if sorted_paths[league] >= 0:
      new_paths[league] = sorted_paths[league]

  return new_paths


def update_player(player, path):
  # Returns an updated player profile (and their translation factor) by simulating their next season in a specific league
  try:
    first, last, position, league, league2, height, weight, age, gp, g, pts, ppg, year = player

    update_paths = predict_paths(player)

    this_translation = get_weighted_avg(player, path, update_paths[path], 12)

    translated_ppg = round(ppg * this_translation,2)

    updated_player = [first, last, position, path, None, height, weight, age+1, 70, (this_translation * g/gp * 70), (translated_ppg * 70), translated_ppg, year+1]

    predicted_gp = get_weighted_avg(updated_player, None, update_paths[path], 8)

    predicted_g = this_translation * g / gp * predicted_gp
    predicted_pts = translated_ppg * predicted_gp

    updated_player[8] = int(predicted_gp)
    updated_player[9] = int(predicted_g)
    updated_player[10] = int(predicted_pts)

    return updated_player, this_translation

  except:
    return None, None

def project_translation(player):
  # Returns projected translation to the NHL
  chance_of_league = predict_paths(player)

  translations = {}
  for path in chance_of_league:

    if path == 'NHL':
      if chance_of_league['NHL'] >= 10:
        weighted_translation = get_weighted_avg(player, 'NHL', 10, 12)
      else:
        weighted_translation = get_weighted_avg(player, 'NHL', chance_of_league['NHL'], 12)

      if weighted_translation is None:
        continue

      else:
        return weighted_translation
        translations['NHL'] = weighted_translation

    else:
      updated_player, weighted_translation_to_intermediate = update_player(player, path)

      if updated_player is None:
        continue

      new_paths = predict_paths(updated_player)

      if 'NHL' in new_paths:
        if new_paths['NHL'] >= 10:
          weighted_translation_to_NHL = get_weighted_avg(updated_player, 'NHL', 10, 12)
        else:
          weighted_translation_to_NHL = get_weighted_avg(updated_player, 'NHL', new_paths['NHL'], 12)

        if weighted_translation_to_NHL is None:
          continue

        else:
          age_adjustment = (1/age_factor(player[2], player[7], player[5]))
          translations[path] = weighted_translation_to_intermediate * weighted_translation_to_NHL * age_adjustment

      else:
        continue

  try:
    w_sum = 0
    weights = 0
    for league in translations.keys():
      w_sum += translations[league] * chance_of_league[league]
      weights += chance_of_league[league]

    return w_sum / weights

  except:
    return 0


def project_NHL_points(player):
  # Projects NHL points in suceeding season (assuming 82 games played)
  translation = project_translation(player)
  return round(player[11] * translation * 82,1)



In [6]:
current_class = [
    # Input Player Profile Data in form:
    # ['First', 'Last', 'F/D', 'League Abbrev', None, Height (in), Weight (lbs), Age (as of start date of season), GP, Goals, Points, Points/GP, Season]

  ["Artyom","Levshunov","D","AHL",None,74,208,18.88,33,2,11,0.33,2024],["Beckett","Sennecke","F","OHL",None,75,190,18.63,37,28,64,1.73,2024],["Ivan","Demidov","F","KHL",None,73,192,18.76,45,16,37,0.82,2024],["Tij","Iginla","F","WHL",None,72,182,18.11,21,14,32,1.52,2024],["Carter","Yakemchuk","D","WHL",None,75,209,18.96,31,14,34,1.1,2024],["Berkly","Catton","F","WHL",None,71,170,18.67,33,20,62,1.88,2024],["Zayne","Parekh","D","OHL",None,72,179,18.58,36,16,51,1.42,2024],["Anton","Silayev","D","KHL",None,79,211,18.43,41,0,9,0.22,2024]
]

for prospect in current_class:
  print(prospect[0], prospect[1],", ",project_NHL_points(prospect))

Artyom Levshunov ,  24.9
Beckett Sennecke ,  30.8
Ivan Demidov ,  44.7
Tij Iginla ,  42.5
Carter Yakemchuk ,  18.2
Berkly Catton ,  48.1
Zayne Parekh ,  35.6
Anton Silayev ,  12.8
