In [0]:
"""
This notebook has two functions of note, predict and compute_score:

(squad, candidates, captain, vice_captain, priorities) = predict(2019, 1)
compute_score(squad, candidates, captain, vice_captain, priorities, 2019, 1)

"""

In [0]:
# Season-long evaluation. For every gameweek the pre-trained GRU for that week is loaded
# and three strategies are scored:
#   * actual      - the squad is carried over from week to week and transfers are penalised
#   * max         - the best squad that could have been picked with hindsight
#   * single week - the model's pick with no previous squad, i.e. no transfer penalty
# NOTE: this cell uses names defined in the cells below it (FIELDS, device, GRUPredictor,
# predict, compute_score, best_gameweek_team), so those cells have to be run first.
HIDDEN_DIM, BATCH_SIZE, EPOCHS, LR, EMBEDDING_DIM, SEASON = 512, 512, 100, 1e-3, len(FIELDS) - 1, 2019
SAVE_DIR = '/content/drive/My Drive/CIS 522/CIS-522-project/2019'
actual_scores, max_scores, single_week_scores = {}, {}, {}
previous_previous_squad, previous_squad, squad, saved_transfer = [], [], [], False

for ROUND in range(1, 30):
  save_path = '{0}/gru-{1}-{2}-{3}-{4}-{5}-{6}.pt'.format(SAVE_DIR, SEASON, ROUND, BATCH_SIZE, HIDDEN_DIM, EPOCHS, LR)
  model = GRUPredictor(EMBEDDING_DIM, HIDDEN_DIM).to(device)
  model.load_state_dict(torch.load(save_path, map_location=torch.device('cpu')))
  model.eval()

  # `squad` is only ever assigned the squad that was actually fielded, so shifting it
  # here gives last week's squad and the squad from two weeks ago.
  previous_previous_squad = previous_squad
  previous_squad = squad
  if len(previous_previous_squad) > 0 and len(previous_squad) > 0:
    # Transfers made going into last week. Comparing previous_squad with squad here
    # would always give 0 because they are the same object at this point.
    num_transfers = len(set(previous_previous_squad) - set(previous_squad))
    saved_transfer = (num_transfers == 0) or (num_transfers == 1 and saved_transfer)

  # Strategy 1: carry the squad over and pay for transfers
  (squad, candidates, captain, vice_captain, priorities) = \
  predict(SEASON, ROUND, model, previous_squad, saved_transfer)
  actual_score = compute_score(squad, candidates, captain, vice_captain, priorities,
                  SEASON, ROUND, previous_squad, saved_transfer)
  actual_scores[ROUND] = actual_score

  # Strategy 2: hindsight optimum. Separate names keep `squad` intact for next week.
  (best_squad, best_candidates, best_captain, best_vice_captain, best_priorities) = \
  best_gameweek_team(SEASON, ROUND)
  max_score = compute_score(best_squad, best_candidates, best_captain, best_vice_captain,
                  best_priorities, SEASON, ROUND)
  max_scores[ROUND] = max_score

  # Strategy 3: the model's pick for this week in isolation
  (single_squad, single_candidates, single_captain, single_vice_captain, single_priorities) = \
  predict(SEASON, ROUND, model=model)
  single_week_score = compute_score(single_squad, single_candidates, single_captain,
                  single_vice_captain, single_priorities, SEASON, ROUND)
  single_week_scores[ROUND] = single_week_score

  print('Max: {0}, Single: {1}, Actual: {2}'.format(max_score, single_week_score, actual_score))


In [0]:
!pip install pulp

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from google.colab import drive
import os
import random
from torch.utils.tensorboard import SummaryWriter
import pulp
import sys
from functools import cmp_to_key

# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Mount Google Drive; DRIVE_DIR below is resolved relative to this mount point
drive.mount('./drive')
# Clone the data/code repository (with its submodules) only if it is not already present
repo_path = "./fpl_prediction/"
if not os.path.isdir(repo_path):
  !git clone --recurse-submodules https://github.com/SolomonAduolMaina/fpl_prediction

# TensorBoard is pointed at the log directory on Drive; spaces in the path are
# backslash-escaped because the path is interpolated into a command line
%load_ext tensorboard
DRIVE_DIR = "./drive/My Drive/CIS 522/CIS-522-project/"
ROOT_LOG_DIR = DRIVE_DIR + "logs/"
%tensorboard --logdir {ROOT_LOG_DIR.replace(" ", "\\ ")}

In [0]:
"""
Evaluates to (squad, candidates, captain, vice_captain, priorities) picked by model + optimization
to play in season SEASON and round ROUND, assuming that the squad from last week is
previous_squad and that saved_transer (sic, the parameter name is misspelled) is
True if you have a saved transfer. If model is None, a new model is trained and saved first.
"""
def predict(SEASON, ROUND, model=None, previous_squad=[], saved_transer=False):
  """
  Pick the squad for round ROUND of season SEASON from model predictions plus optimization.

  Args:
    SEASON, ROUND: the gameweek to pick a team for. The model is fed each player's
      history up to the previous round (round 38 of max(2016, SEASON - 1) when ROUND is 1).
    model: the GRUPredictor to predict with. If None, a new one is trained on that
      history, logged and saved to disk first.
    previous_squad: last week's squad; players outside of it incur the transfer cost
      in the optimization.
    saved_transer: True if a saved transfer is available. The keyword is misspelled but
      is left unchanged so that existing callers keep working.

  Returns:
    (squad, candidates, captain, vice_captain, priorities), where priorities maps every
    squad member to their predicted points.
  """
  # Bug fix: the body used to read `saved_transfer`, which is not a parameter of this
  # function and therefore resolved to a notebook global (or raised a NameError).
  saved_transfer = saved_transer
  HIDDEN_DIM, BATCH_SIZE, EPOCHS, LR, EMBEDDING_DIM = 512, 512, 100, 1e-3, len(FIELDS) - 1

  name_mapping = name_conversions()
  season, previous_week = (SEASON, ROUND - 1) if ROUND > 1 else (max([2016, SEASON - 1]), 38)
  players_data = get_players_data(season, previous_week, name_mapping)
  train_dataset = PlayerDataset(players_data, batch_size=BATCH_SIZE, embedding_dim=EMBEDDING_DIM)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True)

  if model is None:
    model = GRUPredictor(EMBEDDING_DIM, HIDDEN_DIM).to(device)
    optimizer = torch.optim.Adam(model.parameters(), LR)
    criterion = nn.MSELoss()
    log_path = '{0}/gru-{1}-{2}-{3}-{4}-{5}-{6}'.format(ROOT_LOG_DIR, SEASON, ROUND, BATCH_SIZE, HIDDEN_DIM, EPOCHS, LR)
    SAVE_DIR = '/content/drive/My Drive/CIS 522/CIS-522-project/2019'
    save_path = '{0}/gru-{1}-{2}-{3}-{4}-{5}-{6}.pt'.format(SAVE_DIR, SEASON, ROUND, BATCH_SIZE, HIDDEN_DIM, EPOCHS, LR)
    train_model(model, optimizer, criterion, train_dataloader, EPOCHS, log=True, save=True, log_path=log_path, save_path=save_path)

  ps_and_ts = positions_and_teams(SEASON, name_mapping)
  values = get_gameweek_data(SEASON, ROUND, ps_and_ts, name_mapping) # Are player values posted here before the deadline?

  rankings = {}
  with torch.no_grad(): # Inference only: no autograd graph is needed for the predictions
    for name in ps_and_ts: # All players registered this season
      position, team = ps_and_ts[name]
      # 1001 is above the 1000 budget cap in optimize, so the optimizer excludes this player
      value = 1001 if name not in values else values[name]['value']

      prediction = 0 # I prefer for this to be -infinity but pulp refuses to solve such
      if name in players_data:
        history = train_dataset.player_data(name).to(device)
        length = history.shape[0]
        output = model(history.view(length, 1, EMBEDDING_DIM), [length])
        # Plain float rather than a tensor so that sorting and pulp see ordinary numbers
        prediction = output.view(-1)[0].item()

      rankings[name] = { 'team' : team, 'position' : position, 'value': value, 'total_points' : prediction }

  (name_mapping, fifteen) = optimize(rankings, previous_squad, saved_transfer, cost=12) # magic number
  (squad, candidates, captain, vice_captain) = pick_team(rankings, name_mapping, fifteen, ps_and_ts)

  priorities = { player : float(rankings[player]['total_points']) for player in set(squad) }

  return (squad, candidates, captain, vice_captain, priorities)


"""
Evaluates to (squad, candidates, captain, vice_captain) given predictions, which
is a dictionary whose keys are player_names and whose values contain the expected
points of the player, name_mapping which maps from the names of the linear programming variables in fifteen
to player_names - these two are expected to be the output of optimize - and 
positions_and_teams is expected to be the output of positions_and_teams.
"""
def pick_team(predictions, name_mapping, fifteen, positions_and_teams):
  """
  Choose the starting eleven, captain and vice captain from an optimized squad.

  Args:
    predictions: maps player name to a dictionary holding 'total_points'.
    name_mapping: maps the name of each variable in fifteen to a player name.
    fifteen: solved linear programming variables; a player is in the squad when the
      varValue of their variable is not 0.
    positions_and_teams: maps player name to a (position, team) pair with positions 1 to 4.

  Returns:
    (squad, candidates, captain, vice_captain) where squad and candidates are sets of
    player names and candidates are the eleven starters.
  """
  by_points = lambda entry: entry[1]
  minimum_starters = { 1 : 1, 2 : 3, 3 : 0, 4 : 1 } # position -> starters that must be fielded

  squad = { name_mapping[variable.name] for variable in fifteen if variable.varValue != 0 }

  # Group the squad by position
  ranked = { 1 : [], 2 : [], 3 : [], 4 : [] }
  for player in squad:
    ranked[positions_and_teams[player][0]].append((player, predictions[player]['total_points']))

  # Take the best players for the mandatory spots, everyone else competes for the rest
  starters, leftovers = [], []
  for position, minimum in minimum_starters.items():
    ordered = sorted(ranked[position], key=by_points, reverse=True)
    starters.extend(ordered[:minimum])
    if position != 1: # The goalkeeper spot is already taken
      leftovers.extend(ordered[minimum:])
  leftovers.sort(key=by_points, reverse=True)

  # Top up to eleven with the best of the rest
  for _ in range(11 - len(set(starters))):
    starters.append(leftovers.pop(0))

  # The two highest ranked starters get the armbands
  starters.sort(key=by_points, reverse=True)
  captain, vice_captain = starters[0][0], starters[1][0]
  candidates = { player for (player, _) in starters }

  return (squad, set(candidates), captain, vice_captain)


"""
Comparator for (name, played, points) triples. Evaluates to 1 if x played and y didn't,
to -1 if y played and x didn't, and otherwise to the points of x minus the points of y,
which is positive when x is ranked higher than y.
"""
def play_priority_sort(x, y):
  """
  Comparator for (name, played, points) triples: whoever played comes first, and when
  both or neither played the difference in points decides.

  Returns 1 or -1 when exactly one of the two played, otherwise points of x minus points of y.
  """
  _, x_played, x_points = x
  _, y_played, y_points = y
  if bool(x_played) != bool(y_played):
    return 1 if x_played else -1
  return x_points - y_points

"""
Evaluates to True if player played in the gameweek whose results are reflected in
gameweek_data. Otherwise evaluates to False.
"""
def played(player, gameweek_data):
  """
  True if player has minutes, a yellow card or a red card recorded in gameweek_data,
  False otherwise.
  """
  evidence_of_playing = ('minutes', 'yellow_cards', 'red_cards')
  return any(gameweek_data[player][stat] > 0 for stat in evidence_of_playing)

"""
Evaluates to a number, the score of the team that actually played. The captain's score is doubled
if they played, otherwise the vice_captain's score is doubled if they played. If more than one
transfer was made from previous_squad to squad, subtract 4 points for every transfer beyond the
free one that everyone gets each week; a saved transfer (saved_transfer is True) covers one more.
"""
def compute_score(squad, candidates, captain, vice_captain, priorities, 
                  SEASON, ROUND, previous_squad=[], saved_transfer=False):
  """
  Score a team selection against the real results of round ROUND in season SEASON.

  Args:
    squad, candidates: the full squad and the eleven picked to start.
    captain, vice_captain: the captain's points count twice if they played, otherwise
      the vice captain's do if they played.
    priorities: maps squad members to their substitution priority.
    previous_squad: last week's squad. When empty no transfer penalty is applied.
    saved_transfer: True if a saved transfer was available for this week, which makes
      a second transfer free.

  Returns:
    The points earned minus 4 points for every transfer beyond the free ones.
  """
  name_mapping = name_conversions()
  ps_and_ts = positions_and_teams(SEASON, name_mapping)
  gameweek_data = get_gameweek_data(SEASON, ROUND, ps_and_ts, name_mapping)
  scored = team_scored(squad, candidates, priorities, gameweek_data)

  points_earned = sum(gameweek_data[player]['total_points'] for player in scored)

  # Adding the armband holder's points once more doubles them
  if played(captain, gameweek_data):
    points_earned += gameweek_data[captain]['total_points']
  elif played(vice_captain, gameweek_data):
    points_earned += gameweek_data[vice_captain]['total_points']

  if len(previous_squad) == 0:
    return points_earned

  num_transfers = len(set(previous_squad) - set(squad))
  # Bug fix: saved_transfer used to be recomputed from num_transfers right here, which
  # made it False whenever a penalty applied, so the saved transfer was never credited.
  free_transfers = 2 if saved_transfer else 1
  paid_transfers = max(0, num_transfers - free_transfers)

  return points_earned - 4 * paid_transfers


"""
Evaluates to an iterable of the team that actually played given squad of 15 and 11 candidates.
Some players might have to be substituted if they didn't play according to gameweek_data.
In this case use priorities[player] as substitution priorities.
"""
def team_scored(squad, candidates, priorities, gameweek_data):
  """
  Work out which players actually count towards the score.

  Starters that did not play are replaced: first so that there is a goalkeeper, three
  defenders and a striker, then by whoever played and has the highest priority.

  Args:
    squad: the full squad.
    candidates: the eleven picked to start.
    priorities: maps every squad member to their substitution priority, higher is better.
    gameweek_data: maps player name to that player's results and 'position' for the gameweek.

  Returns:
    The set of players that make up the scoring team.
  """
  playing = [ player for player in candidates if played(player, gameweek_data) ]

  playing_size = len(set(playing))
  if playing_size >= 11:
    return set(playing)

  dropped = set(candidates) - set(playing)
  substitutes = set(squad) - set(candidates)
  by_play_priority = cmp_to_key(play_priority_sort)

  # Rank the available players within each position: those that played come first,
  # and within each group the higher priority comes first
  remaining = { 1 : [], 2 : [], 3 : [], 4 : [] }
  for player in dropped.union(substitutes):
    position = int(gameweek_data[player]['position'])
    remaining[position].append((player, played(player, gameweek_data), priorities[player]))
  for position in remaining:
    remaining[position] = sorted(remaining[position], key=by_play_priority, reverse=True)

  # Compute the positions that need to be filled
  positions = [0, 0, 0, 0] # Goalkeeper, Defenders, Midfielders, Strikers
  for name in playing:
    positions[int(gameweek_data[name]['position']) - 1] += 1
  to_fill = [max(0, 1 - positions[0]), max(0, 3 - positions[1]), 0, max(0, 1 - positions[3])]

  # Fill the positions that need to be filled
  for position in range(len(positions)):
    still_need = to_fill[position]
    playing += [ player for (player, _, _) in remaining[position + 1][:still_need] ]
    # Only one goalkeeper ever plays, so the other one is never a candidate below
    remaining[position + 1] = remaining[position + 1][still_need:] if position != 0 else []

  playing_size = len(set(playing))
  if playing_size < 11:
    last_batch = []
    for position in remaining:
      last_batch += remaining[position]
    # Bug fix: this used to sort on x[1] - y[1], i.e. on the played flag alone, so the
    # priorities were ignored across positions. Sort on (played, priority) instead.
    last_batch = sorted(last_batch, key=by_play_priority, reverse=True)

    # If nobody else played the highest priority ones are chosen, and those were
    # candidates to begin with. Stop early rather than crash if we run out of players.
    while playing_size < 11 and last_batch:
      playing.append(last_batch.pop(0)[0])
      playing_size += 1

  return set(playing)


"""
Evaluates to (squad, candidates, captain, vice_captain, priorities) for the best team
in season SEASON and round ROUND.
"""
def best_gameweek_team(SEASON, ROUND):
  """
  Pick the best team for round ROUND of season SEASON with hindsight, by optimizing
  over the points the players really scored that gameweek.

  Returns (squad, candidates, captain, vice_captain, priorities), where priorities maps
  each squad member to the points they scored.
  """
  conversions = name_conversions()
  ps_and_ts = positions_and_teams(SEASON, conversions)
  rankings = get_gameweek_data(SEASON, ROUND, ps_and_ts, conversions)

  variable_names, fifteen = optimize(rankings)
  squad, candidates, captain, vice_captain = pick_team(rankings, variable_names, fifteen, ps_and_ts)

  priorities = { player : float(rankings[player]['total_points']) for player in set(squad) }

  return (squad, candidates, captain, vice_captain, priorities)

In [0]:
"""
Evaluates to (name_mapping, fifteen) where fifteen is the list of all linear programming variables
after solving, one per player in predictions; a variable's value is non-zero if that player is in
the chosen squad. The squad is optimized using predictions, assuming that last week's squad was
previous_squad and saved_transfer is True if we have a saved transfer at this point. cost is the
penalty charged in the objective for each player not in previous_squad. name_mapping maps the name
of each linear programming variable to the player associated to that variable.

NOTE: We do not optimize for the eleven as this causes a maximum recursion depth error.
I believe there are just too many variables if we solve for that. But the problems are reasonably
similar.
"""
def optimize(predictions, previous_squad=[], saved_transfer=False, cost=0):
  """
  Solve the squad selection problem as an integer program.

  Args:
    predictions: maps player name to a dictionary with 'position' (1 to 4), 'team',
      'value' and 'total_points'.
    previous_squad: last week's squad; every selected player outside of it costs `cost`.
    saved_transfer: when True, `cost` is added back to the objective once.
    cost: objective penalty per player that is not in previous_squad.

  Returns:
    (name_mapping, fifteen): fifteen is the list of solved variables, one per player,
    and name_mapping maps each variable's name to the player it stands for.
  """
  variables = { name : pulp.LpVariable(name, lowBound=0, upBound=1, cat="Integer") for name in predictions }

  def in_position(position):
    # Variables of every player in the given position
    return [ variables[name] for name in predictions if predictions[name]['position'] == position ]

  goal_keepers, defenders = in_position(1), in_position(2)
  mid_fielders, strikers = in_position(3), in_position(4)

  model = pulp.LpProblem("Fantasy Premier League", pulp.LpMaximize)

  # 2 goalkeepers, 5 defenders, 5 midfielders and 3 strikers in the whole squad
  for group, quota in ((goal_keepers, 2), (defenders, 5), (mid_fielders, 5), (strikers, 3)):
    model += pulp.lpSum(group) == quota

  # Cost cap
  model += pulp.lpSum( [variables[name] * predictions[name]['value'] for name in predictions] ) <= 1000

  # At most three players in the squad per team
  for team in set([predictions[name]['team'] for name in predictions]):
    team_members = [ variables[name] for name in predictions if int(predictions[name]['team']) == int(team) ]
    model += pulp.lpSum(team_members) <= 3

  # Objective: squad points minus the penalty for new players. The saved transfer only
  # adds a constant, so it does not influence which squad is chosen.
  squad_points = pulp.lpSum( [variables[name] * predictions[name]['total_points'] for name in variables] )
  new_players = pulp.lpSum( [variables[name] * (1 if name not in previous_squad else 0) for name in predictions] )
  model += squad_points - (cost * new_players) + (cost if saved_transfer else 0)

  model.solve()

  # Read back through variable.name, which is what callers look players up by
  name_mapping = { variable.name : name for (name, variable) in variables.items() }
  fifteen = list(variables.values())

  return (name_mapping, fifteen)

In [0]:
class GRUPredictor(torch.nn.Module):
  """
  A GRU over a padded batch of sequences followed by a linear layer that maps the final
  hidden state of each sequence to a single number.
  """
  def __init__(self, embedding_dim, hidden_dim):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.gru = nn.GRU(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, 1)

  def forward(self, features, lengths):
    """
    features: padded tensor of shape (max sequence length, batch, embedding_dim).
    lengths: the true length of each sequence in the batch, as a list or a tensor.
    Returns a tensor of shape (1, batch, 1).
    """
    # pack_padded_sequence only accepts tensor lengths that live on the CPU, so a
    # tensor that was moved to the GPU together with the features is brought back
    if isinstance(lengths, torch.Tensor):
      lengths = lengths.cpu()
    packed_features = torch.nn.utils.rnn.pack_padded_sequence(features, lengths, enforce_sorted=False)
    _, hidden = self.gru(packed_features)
    return self.fc(hidden)


def train_model(model, optimizer, criterion, train_dataloader, EPOCHS, log_path="", save_path="", save=False, log=False):
  """
  Train model for EPOCHS passes over train_dataloader.

  Args:
    model, optimizer, criterion: the network, its optimizer and the loss function.
    train_dataloader: yields (features, lengths, points) batches; the leading dimension
      of features is squeezed away before the batch is fed to the model.
    EPOCHS: number of passes over the data.
    log_path: TensorBoard log directory, used when log is True.
    save_path: where the state dict is written, used when save is True.
    save: save the model's state dict after training.
    log: print and record the average loss of every epoch.
  """
  writer = SummaryWriter(log_path) if log else None

  for epoch in range(EPOCHS):
    running_loss = 0.0
    items = 0

    for features, lengths, points in train_dataloader:
      features = features.squeeze(0).to(device)
      # The lengths stay on the CPU: pack_padded_sequence rejects lengths on the GPU
      lengths = lengths.view(-1)
      points = points.view(-1).to(device)

      model.zero_grad()
      pred = model(features, lengths).view(-1)
      loss = criterion(pred, points)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()
      items = items + 1

    # An empty dataloader has no loss to average; report nan instead of dividing by zero
    avg_loss = running_loss / items if items else float('nan')
    if log:
      print('Epoch {0}: average loss {1}'.format(epoch + 1, avg_loss))
      writer.add_scalar("Loss", avg_loss, epoch + 1)

  if writer is not None:
    writer.close() # Flush pending events to disk

  if save:
    torch.save(model.state_dict(), save_path)

In [0]:
# Columns kept for every player's history, including the 'player' id column that
# get_players_data adds. 'total_points' is split off as the prediction target in
# PlayerDataset and the other columns are the model inputs, hence len(FIELDS) - 1
# being used as EMBEDDING_DIM elsewhere.
FIELDS = ['assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes', 'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved', 'player', 'red_cards', 'round', 'saves', 'selected', 'team_a_score', 'team_h_score', 'threat',  'total_points', 'transfers_balance', 'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards']


"""
Some players have different names at different points of the data
e.g. Isaac Success and Isaac Success Ajayi. 

This function evaluates to a dictionary mapping different representations
to a single representation using name_conversions.csv, which is manually generated.
"""
def name_conversions():
  """
  Load name_conversions.csv and return a dictionary that maps every lower-cased
  bad_name to its lower-cased good_name.
  """
  table = pd.read_csv('./fpl_prediction/name_conversions.csv', encoding = "UTF-8")
  return { row.bad_name.lower() : row.good_name.lower() for row in table.itertuples() }


"""
Evaluates to a dictionary whose keys are player names and whose values
are the entire player's history up till season SEASON and round ROUND.
name_mapping is expected to be the result of name_conversions.
"""
def get_players_data(SEASON, ROUND, name_mapping):
  """
  Collect every player's match history from the 2016 season up to and including round
  ROUND of season SEASON.

  Args:
    SEASON: one of 2016, 2017, 2018, 2019; anything else raises a KeyError.
    ROUND: last round of season SEASON to include.
    name_mapping: maps alternative player names to the canonical one.

  Returns:
    A dictionary mapping player name to a float DataFrame with the columns in FIELDS and
    at most one row per round. Rounds are numbered across seasons (38 * season index +
    round) and 'player' holds a numeric id. Players without any rows are left out.
  """
  SEASON = { 2016 : 0, 2017 : 1, 2018 : 2, 2019 : 3 }[SEASON]
  directory_string = './fpl_prediction/Fantasy-Premier-League/data/20{0}-{1}/players/'
  histories = {} # name -> one frame per season directory, in reading order
  players = {}   # name -> numeric id, handed out in order of first appearance

  for season in range(SEASON + 1):
    formatted_string = directory_string.format(season + 16, season + 16 + 1)

    for file in os.listdir(os.fsencode(formatted_string)):
      filename = os.fsdecode(file)
      # The first two underscore separated parts of the directory name are used as the name
      name = " ".join(filename.split('_')[:2]).lower()
      name = name_mapping.get(name, name)
      player_id = players.setdefault(name, len(players))

      csv = pd.read_csv(formatted_string + filename + '/gw.csv', encoding = "UTF-8")
      if season == SEASON:
        csv = csv[csv['round'] <= ROUND]
      # assign returns a new frame, so nothing is written into a filtered slice, and the
      # scalar id is broadcast to every row instead of being aligned on the index
      csv = csv.assign(round=38 * season + csv['round'], player=player_id)
      histories.setdefault(name, []).append(csv[FIELDS].astype('float'))

  # Concatenate once per player rather than once per file
  players_data = {}
  for name, frames in histories.items():
    df = pd.concat(frames)
    if len(df) > 0:
      players_data[name] = df.drop_duplicates(subset=['round'], keep='last')

  return players_data


"""
Evaluates to a dictionary whose keys are player names and whose values are
pairs containing the player's position and team for the season SEASON. Ideally we
would have these computed week-by-week but I don't think we have that data.
name_mapping is expected to be the output of name_conversions.
"""
def positions_and_teams(SEASON, name_mapping):
  """
  Read players_raw.csv of season SEASON and return a dictionary that maps each player's
  lower-cased, canonical name to an (element_type, team_code) pair, i.e. position and team.

  SEASON has to be one of 2016, 2017, 2018, 2019; name_mapping maps alternative names to
  the canonical one.
  """
  first_year = { 2016 : 0, 2017 : 1, 2018 : 2, 2019 : 3 }[SEASON] + 16
  season_directory = './fpl_prediction/Fantasy-Premier-League/data/20{0}-{1}/'.format(first_year, first_year + 1)
  raw_players = pd.read_csv(season_directory + 'players_raw.csv', encoding = "UTF-8")

  result = {}
  for row in raw_players.itertuples(index=False):
    full_name = (row.first_name + ' ' + row.second_name).lower()
    result[name_mapping.get(full_name, full_name)] = (row.element_type, row.team_code)

  return result


"""
Evaluates to a dictionary where the keys are the player names and the values
are dictionaries mapping attributes used for optimization and computing
team scores in gameweek ROUND of season SEASON. 
These attributes are ['team', 'position', 'value', 'total_points', 'minutes', 'yellow_cards', 'red_cards'].
positions_and_teams is expected to be the output of positions_and_teams(SEASON, name_mapping)
name_mapping is expected to be the output of name_conversions
"""
def get_gameweek_data(SEASON, ROUND, positions_and_teams, name_mapping):
  """
  Read every player's result for round ROUND of season SEASON.

  Args:
    SEASON: one of 2016, 2017, 2018, 2019; anything else raises a KeyError.
    ROUND: the round to read.
    positions_and_teams: maps player name to a (position, team) pair.
    name_mapping: maps alternative player names to the canonical one.

  Returns:
    A dictionary mapping player name to a dictionary with the keys 'team', 'position',
    'value', 'total_points', 'minutes', 'yellow_cards' and 'red_cards'. Players without
    a row for the round are left out. If a player has several rows for the round, the
    last one is used.
  """
  SEASON = { 2016 : 0, 2017 : 1, 2018 : 2, 2019 : 3 }[SEASON]  
  directory_string = './fpl_prediction/Fantasy-Premier-League/data/20{0}-{1}/players/'
  players_data = {}
  fields = ['round', 'value', 'total_points', 'minutes', 'yellow_cards', 'red_cards']

  # Fetch each player's performance for round ROUND and season SEASON
  formatted_string = directory_string.format(SEASON + 16, SEASON + 16 + 1)
  for file in os.listdir(os.fsencode(formatted_string)):
    filename = os.fsdecode(file)
    name = " ".join(filename.split('_')[:2]).lower()
    name = name_mapping.get(name, name)
    csv = pd.read_csv(formatted_string + filename + '/gw.csv', encoding = "UTF-8")
    players_data[name] = csv[csv['round'] == ROUND][fields].astype('float')

  gameweek_data = {}
  for name, df in players_data.items():
    if len(df) == 0:
      continue # No match recorded for this player in this round

    # All rows share the same round, so the last row is the one that
    # drop_duplicates(subset=['round'], keep='last') would keep. Reading scalars from
    # that row avoids calling float() on a Series, which pandas has deprecated.
    row = df.iloc[-1]
    position = int(positions_and_teams[name][0])
    team = int(positions_and_teams[name][1])
    gameweek_data[name] = {
      'team' : team, 'position' : position,
      'value' : float(row['value']), 'total_points' : float(row['total_points']),
      'minutes' : float(row['minutes']), 'yellow_cards' : float(row['yellow_cards']),
      'red_cards' : float(row['red_cards'])
    }

  return gameweek_data


class PlayerDataset(torch.utils.data.Dataset):
  """
  Pre-batched training set built from the players' histories.

  Every prefix of a player's history becomes one training sample whose label is the
  total_points of the row right after the prefix. Samples are shuffled and packed into
  zero-padded batches, and each item of the dataset is one whole batch:
  (features of shape (longest prefix, batch, embedding_dim), lengths, points).
  """
  def __init__(self, players_data, batch_size, embedding_dim):
    self.batch_size = batch_size
    self.embedding_dim = embedding_dim

    stacked = pd.concat(list(players_data.values()))
    raw_features = stacked.drop(columns=['total_points']).to_numpy()
    targets = stacked[['total_points']].to_numpy() # One column, so each label is a length 1 array

    # Only the features are scaled, the points are left as they are
    scaled = preprocessing.RobustScaler().fit_transform(raw_features)

    # Walk through the stacked rows player by player. Keep the full scaled history for
    # prediction later on, and turn every proper prefix into a training sample.
    self.data = {}
    samples = []
    offset = 0
    for name, frame in players_data.items():
      rows = len(frame)
      for target_row in range(1, rows): # The last row has no following row to predict
        prefix = torch.Tensor(scaled[offset : offset + target_row, :])
        samples.append((prefix, targets[offset + target_row]))
      self.data[name] = torch.Tensor(scaled[offset : offset + rows, :])
      offset += rows

    # Cut the shuffled samples into batches; the last one may be smaller
    random.shuffle(samples)
    self.batched_data = []
    for start in range(0, len(samples), self.batch_size):
      chunk = samples[start : start + self.batch_size]
      lengths = [len(prefix) for (prefix, _) in chunk]
      padded = torch.zeros((max(lengths), len(chunk), self.embedding_dim))

      labels = []
      for column, (prefix, label) in enumerate(chunk):
        padded[: prefix.shape[0], column, : prefix.shape[1]] = prefix
        labels.append(label)

      self.batched_data.append((padded, torch.FloatTensor(lengths), torch.FloatTensor(labels)))

  def __len__(self):
    """Number of batches."""
    return len(self.batched_data)

  def __getitem__(self, index):
    """The batch at position index as a (features, lengths, points) triple."""
    return self.batched_data[index]

  def player_data(self, name):
    """The full scaled feature history of the player called name."""
    return self.data[name]