In [0]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from google.colab import drive
import os
import random

In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
drive.mount('/content/drive')
repo_path = "/content/fpl_prediction/"
if not os.path.isdir(repo_path):
  !git clone https://github.com/SolomonAduolMaina/fpl_prediction

In [0]:
def get_data(SEASON, ROUND, repo_path='/content/fpl_prediction/'):
  """Load per-player gameweek history recorded before a given gameweek.

  Args:
    SEASON: Starting year of the target season; one of 2016, 2017, 2018, 2019.
    ROUND: Gameweek number within that season. Only rounds strictly before
      this one are loaded (round 1 of a later season means "everything up to
      round 38 of the previous season").
    repo_path: Root of the cloned data repository. It must contain
      ``data/20YY-ZZ/players/<player dir>/gw.csv``,
      ``data/20YY-ZZ/players_raw.csv`` and ``name_conversions.csv``.

  Returns:
    A tuple ``(players_data, gameweek_data)``:
      * ``players_data`` maps a player name to a float DataFrame with one row
        per gameweek; ``round`` is offset by 38 per season and ``player`` holds
        an integer id assigned in sorted directory order.
      * ``gameweek_data[season][week]`` is a DataFrame with the columns
        ``name, team, position, value, total_points`` for that gameweek.

  Side effects:
    Writes every player's combined frame to ``<repo_path>/players/<name>.csv``.

  Raises:
    KeyError: if SEASON is not a supported year, or a player with gameweek
      rows has no entry in any loaded ``players_raw.csv``.
  """
  season_indices = { 2016 : 0, 2017 : 1, 2018 : 2, 2019 : 3 }
  total_seasons = len(season_indices)
  fields = ['assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes', 'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved', 'player', 'red_cards', 'round', 'saves', 'selected', 'team_a_score', 'team_h_score', 'threat',  'total_points', 'transfers_balance', 'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards']

  # Translate (SEASON, ROUND) into the number of seasons to read and the last
  # round of the final season that may be used.
  completed = 38 * season_indices[SEASON] + ROUND - 1
  num_seasons = completed // 38 + 1
  last_round = completed % 38
  if last_round == 0 and num_seasons > 1:
    # Round 1 of a later season: use the whole previous season instead.
    last_round = 38
    num_seasons -= 1

  def season_directory(season):
    # Season 0 lives in "2016-17", season 1 in "2017-18", and so on.
    return os.path.join(repo_path, 'data', '20{0}-{1}'.format(season + 16, season + 16 + 1))

  players = {}        # player name -> integer id
  season_frames = {}  # player name -> list of non-empty per-season frames

  for season in range(num_seasons):
    players_dir = os.path.join(season_directory(season), 'players')

    # Sorted so that player ids do not depend on filesystem listing order.
    for entry in sorted(os.listdir(players_dir)):
      gw_path = os.path.join(players_dir, entry, 'gw.csv')
      if not os.path.isfile(gw_path):
        continue  # stray file or directory without gameweek data

      name = " ".join(entry.split('_')[:2])
      player_id = players.setdefault(name, len(players))

      gw = pd.read_csv(gw_path, encoding = "UTF-8")
      if season == num_seasons - 1:
        # Only the final season is cut off; earlier seasons are complete history.
        gw = gw[gw['round'] <= last_round]
      gw = gw.copy()  # never assign columns on a filtered view
      gw['round'] = 38 * season + gw['round']
      gw['player'] = player_id  # scalar broadcast avoids index misalignment
      gw = gw[fields].astype('float')

      if len(gw) > 0:
        season_frames.setdefault(name, []).append(gw)

  players_data = {
    name : pd.concat(frames).drop_duplicates(subset=['round'], keep='last')
    for (name, frames) in season_frames.items()
  }

  output_dir = os.path.join(repo_path, 'players')
  os.makedirs(output_dir, exist_ok=True)
  for (name, frame) in players_data.items():
    frame.to_csv(os.path.join(output_dir, '{0}.csv'.format(name)), index=False)

  # Names in players_raw.csv are normalised through name_conversions.csv.
  player_mapping = pd.read_csv(os.path.join(repo_path, 'name_conversions.csv'), encoding = "UTF-8")
  name_mapping = dict(zip(player_mapping['bad_name'], player_mapping['good_name']))

  # name -> (position, [team code for each season, None where not listed])
  positions_and_teams = {}
  for season in range(num_seasons):
    raw = pd.read_csv(os.path.join(season_directory(season), 'players_raw.csv'), encoding = "UTF-8")
    for row in raw.itertuples():
      name = row.first_name + ' ' + row.second_name
      name = name_mapping.get(name, name)

      if name not in positions_and_teams:
        positions_and_teams[name] = (row.element_type, [None] * total_seasons)
      positions_and_teams[name][1][season] = row.team_code

  # Collect plain records first and build each DataFrame once; appending to a
  # DataFrame row by row is quadratic and DataFrame.append no longer exists in
  # pandas 2.x.
  columns = ['name', 'team', 'position', 'value', 'total_points']
  records = [[[] for _ in range(38)] for _ in range(total_seasons)]

  for (name, frame) in players_data.items():
    position, teams = positions_and_teams[name]
    for row in frame.itertuples():
      season, week = divmod(int(row.round) - 1, 38)
      records[season][week].append({'name' : name, 'team' : teams[season], 'position' : position, 'value': row.value, 'total_points' : row.total_points})

  gameweek_data = [[pd.DataFrame(week_records, columns=columns) for week_records in season_records] for season_records in records]

  return (players_data, gameweek_data)

In [0]:
class PlayerDataset(torch.utils.data.Dataset):
  """Pre-batched, zero-padded training sequences built from player histories.

  For a player with ``n`` rows, ``n - 1`` samples are produced: the scaled
  feature rows ``0..k`` are the input and ``total_points`` of row ``k + 1`` is
  the target. Samples are shuffled once and grouped into batches of at most
  ``batch_size``; each dataset item is one whole batch:

    ``(features, lengths, targets)``
      * ``features``: float tensor ``(max_len, batch, embedding_dim)``,
        zero-padded in time (and in width if a row has fewer columns).
      * ``lengths``: int64 tensor ``(batch,)`` with the true sequence lengths,
        the dtype ``pack_padded_sequence`` expects.
      * ``targets``: float tensor ``(batch, 1)``.

  Args:
    players_data: mapping of player name to a DataFrame that contains a
      ``total_points`` column; every other column is used as a feature.
    batch_size: maximum number of sequences per batch.
    embedding_dim: width of the padded feature tensor; must be at least the
      number of feature columns.
    seed: optional seed for the sample shuffle. ``None`` (the default) uses
      the global ``random`` state, as before.
  """

  def __init__(self, players_data, batch_size, embedding_dim, seed=None):
    self.sorted_names = sorted(players_data)
    self.batch_size = batch_size
    self.embedding_dim = embedding_dim

    # Stack every player's rows (in name order) so one scaler is fitted on all of them.
    all_data = pd.concat([players_data[name] for name in self.sorted_names])
    all_features = all_data.drop(['total_points'], axis=1).to_numpy()
    all_total_points = all_data[['total_points']].to_numpy()
    # Kept on the instance so callers can reuse the fitted scaler.
    self.scaler = preprocessing.RobustScaler()
    scaled_features = self.scaler.fit_transform(all_features)

    self.players_data = {}
    samples = []
    offset = 0  # first row of the current player inside the stacked arrays

    for name in self.sorted_names:
      count = len(players_data[name])
      player_features = torch.as_tensor(scaled_features[offset : offset + count, :], dtype=torch.float32)
      player_points = torch.as_tensor(all_total_points[offset : offset + count, :], dtype=torch.float32)
      self.players_data[name] = (player_features, player_points)

      # The last row has no following row to predict, hence count - 1 samples.
      for step in range(count - 1):
        samples.append((player_features[: step + 1], player_points[step + 1]))

      offset += count

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(samples)

    self.training_data = []
    for start in range(0, len(samples), self.batch_size):
      batch = samples[start : start + self.batch_size]
      lengths = [len(features) for (features, _) in batch]
      padded = torch.zeros((max(lengths), len(batch), self.embedding_dim))

      for column, (features, _) in enumerate(batch):
        padded[: features.shape[0], column, : features.shape[1]] = features

      targets = torch.stack([points for (_, points) in batch])
      self.training_data.append((padded, torch.tensor(lengths, dtype=torch.int64), targets))

  def __len__(self):
    """Number of pre-built batches."""
    return len(self.training_data)

  def __getitem__(self, index):
    """Return batch ``index`` as ``(features, lengths, targets)``."""
    return self.training_data[index]

  def player_data(self, name):
    """Return ``(scaled_features, total_points)`` for every row of one player."""
    return self.players_data[name]


In [9]:
# Column list matching the one used inside get_data. 'total_points' is the
# prediction target, so the model sees len(fields) - 1 feature columns.
fields = ['assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes', 'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved', 'player', 'red_cards', 'round', 'saves', 'selected', 'team_a_score', 'team_h_score', 'threat',  'total_points', 'transfers_balance', 'transfers_in', 'transfers_out', 'value', 'was_home', 'yellow_cards']

# Seed the generators used downstream so a Restart & Run All reproduces the same
# run: `random` drives the sample shuffle inside PlayerDataset, and torch drives
# the DataLoader's shuffle and the model's weight initialisation.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

players_data, gameweek_data = get_data(2016, 38)
train_dataset = PlayerDataset(players_data, batch_size=128, embedding_dim=len(fields) - 1)
# Every dataset item is already a full batch, so the DataLoader keeps its
# default batch size of 1 and only shuffles the order of the batches.
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True)



In [0]:
class GRUPredictor(torch.nn.Module):
  """GRU encoder followed by a linear layer producing one value per sequence.

  Args:
    embedding_dim: number of input features per time step.
    hidden_dim: size of the GRU hidden state.
  """

  def __init__(self, embedding_dim, hidden_dim):
    super().__init__()
    self.hidden_dim = hidden_dim
    self.gru = nn.GRU(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, 1)

  def forward(self, features, lengths=None):
    """Predict one value per sequence from its final hidden state.

    Args:
      features: tensor of shape ``(seq_len, batch, embedding_dim)``.
      lengths: true length of each sequence in the batch (tensor or sequence
        of numbers). When omitted, every sequence is taken to span the full
        ``seq_len`` and no packing is done.

    Returns:
      Tensor of shape ``(1, batch, 1)``.
    """
    if lengths is None:
      _, hidden = self.gru(features)
    else:
      # pack_padded_sequence needs the lengths as a CPU int64 tensor, even when
      # the features live on the GPU.
      lengths = torch.as_tensor(lengths).to(device='cpu', dtype=torch.int64)
      packed_features = torch.nn.utils.rnn.pack_padded_sequence(features, lengths, enforce_sorted=False)
      _, hidden = self.gru(packed_features) # maybe use output for attention later
    return self.fc(hidden)

In [0]:
# Hyper-parameters for the recurrent model and its optimiser.
hidden_dim = 256
learning_rate = 1e-4

# The network takes one input per column of `fields` except the target column.
model = GRUPredictor(embedding_dim=len(fields) - 1, hidden_dim=hidden_dim)
model = model.to(device)

# Mean squared error between predicted and actual points, minimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [12]:
num_epochs = 1
graph = []  # average training loss of each epoch, plotted in a later cell

model.train()
for epoch in range(num_epochs):
  running_loss = 0.0
  num_batches = 0

  for features, lengths, points in train_dataloader:
    # The DataLoader wraps each pre-built batch in an extra leading dimension of size 1.
    features = features.squeeze(0).to(device)
    # pack_padded_sequence requires the lengths on the CPU as int64, so they are
    # deliberately not moved to `device`.
    lengths = lengths.view(-1).to(device='cpu', dtype=torch.int64)
    points = points.view(-1).to(device)

    optimizer.zero_grad()
    pred = model(features, lengths).view(-1)
    loss = criterion(pred, points)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    num_batches += 1

  # An empty loader yields NaN instead of a ZeroDivisionError.
  avg_loss = running_loss / num_batches if num_batches else float('nan')
  print('{0} done, average loss is {1}'.format(epoch + 1, avg_loss))
  graph.append(avg_loss)


1 done, average loss is 5.119764337820166


In [0]:
# NOTE(review): the player must be present in the seasons loaded by get_data,
# otherwise player_data raises KeyError — confirm the name against the data.
player_name = 'Mohamed Salah'
player_features, player_points = train_dataset.player_data(player_name)

# Row t of the history is used to predict total_points of row t + 1.
history = player_features[:-1]
actual = player_points[1:].view(-1)

model.eval()
with torch.no_grad():
  # Shape (seq_len, 1, features): a batch holding this single player.
  sequence = history.unsqueeze(1).to(device)
  # The GRU output at step t is the hidden state after reading rows 0..t, so
  # applying the linear head to every step gives one prediction per gameweek
  # (the model's forward() only returns the prediction after the final step).
  step_outputs, _ = model.gru(sequence)
  predicted = model.fc(step_outputs).view(-1).cpu()

print(list(zip(actual.tolist(), predicted.tolist())))

In [0]:
# Training curve: one point per epoch. Markers keep a single-epoch run visible,
# where a bare line through one point would draw nothing.
fig, ax = plt.subplots()
ax.plot(range(1, len(graph) + 1), graph, marker='o')
ax.set_title('Training loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Average MSE loss')
plt.show()