In [87]:
import requests
import pandas as pd
import plotly.express as px
import json
import pathlib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler as Scaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
import numpy as np

In [3]:
GAMEWEEK = 28  # gameweek whose fixtures are listed below

# Have to run this before they update the data each week
url = "https://fantasy.premierleague.com/api/bootstrap-static/"
# A timeout stops the cell from hanging forever; raise_for_status surfaces an
# HTTP error directly instead of a confusing JSON/KeyError further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
general_data = response.json()

# Ids of the players flagged with a 100% chance of playing the next round.
# NOTE(review): entries whose flag is None are excluded by this comparison —
# confirm that is intended.
players = [x['id'] for x in general_data['elements'] if x['chance_of_playing_next_round'] == 100]
print(f"{len(players)} players have a 100% chance to play in week {GAMEWEEK}")
print("")

url = "https://fantasy.premierleague.com/api/fixtures/"
response = requests.get(url, timeout=30)
response.raise_for_status()
fixture_data = [x for x in response.json() if x['event'] == GAMEWEEK]

# Resolve team ids through an explicit id -> name map instead of relying on
# the position of each team inside the 'teams' list.
team_names = {team['id']: team['name'] for team in general_data['teams']}

print("1 is easy fixture, 5 is difficult fixture")
for fixture in fixture_data:
    # The numeric team ids are replaced in place by the team names.
    fixture['team_a'] = team_names[fixture['team_a']]
    fixture['team_h'] = team_names[fixture['team_h']]
    print(f"{fixture['team_h']}, {fixture['team_h_difficulty']} vs {fixture['team_a']}, {fixture['team_a_difficulty']}, {fixture['kickoff_time']}")

294 players have a 100% chance to play in week 28

1 is easy fixture, 5 is difficult fixture
Man Utd, 2 vs Everton, 4, 2024-03-09T12:30:00Z
Bournemouth, 2 vs Sheffield Utd, 2, 2024-03-09T15:00:00Z
Crystal Palace, 2 vs Luton, 2, 2024-03-09T15:00:00Z
Wolves, 2 vs Fulham, 2, 2024-03-09T15:00:00Z
Arsenal, 2 vs Brentford, 5, 2024-03-09T17:30:00Z
Aston Villa, 3 vs Spurs, 4, 2024-03-10T13:00:00Z
Brighton, 2 vs Nott'm Forest, 3, 2024-03-10T14:00:00Z
West Ham, 2 vs Burnley, 3, 2024-03-10T14:00:00Z
Liverpool, 5 vs Man City, 5, 2024-03-10T15:45:00Z
Chelsea, 3 vs Newcastle, 3, 2024-03-11T20:00:00Z
Bournemouth, 2 vs Luton, 2, 2024-03-13T19:30:00Z


In [11]:
# https://fantasy.premierleague.com/api/element-summary/{player-id}/
# https://fantasy.premierleague.com/api/event/{GW}/live/

# Collect one player's live stats for every gameweek before GAMEWEEK.
PLAYER_ID = 355

gameweek_frames = []
for gw in range(1, GAMEWEEK):
    url = f"https://fantasy.premierleague.com/api/event/{gw}/live/"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    elements = response.json()['elements']

    # Flatten the nested 'stats' dicts and keep each element's own id next to
    # them: selecting by row position would be off by one (positions start at
    # 0) and breaks as soon as an id is missing from the list.
    live_df = pd.json_normalize([element['stats'] for element in elements])
    live_df.insert(0, 'id', [element['id'] for element in elements])
    live_df.insert(1, 'gameweek', gw)
    gameweek_frames.append(live_df[live_df['id'] == PLAYER_ID])

# Concatenate once at the end rather than growing the frame inside the loop.
df = pd.concat(gameweek_frames, ignore_index=True) if gameweek_frames else pd.DataFrame()

In [20]:
# Request the element-summary payload for player 355 and keep only the list
# stored under its 'history' key; the bare name on the last line displays it.
url = f"https://fantasy.premierleague.com/api/element-summary/{355}/"
response = requests.get(url)
summary_data = response.json()['history']
summary_data

[{'element': 355,
  'fixture': 1,
  'opponent_team': 6,
  'total_points': 13,
  'was_home': False,
  'kickoff_time': '2023-08-11T19:00:00Z',
  'team_h_score': 0,
  'team_a_score': 3,
  'round': 1,
  'minutes': 79,
  'goals_scored': 2,
  'assists': 0,
  'clean_sheets': 1,
  'goals_conceded': 0,
  'own_goals': 0,
  'penalties_saved': 0,
  'penalties_missed': 0,
  'yellow_cards': 0,
  'red_cards': 0,
  'saves': 0,
  'bonus': 3,
  'bps': 56,
  'influence': '67.6',
  'creativity': '23.7',
  'threat': '48.0',
  'ict_index': '13.9',
  'starts': 1,
  'expected_goals': '0.60',
  'expected_assists': '0.05',
  'expected_goal_involvements': '0.65',
  'expected_goals_conceded': '0.23',
  'value': 140,
  'transfers_balance': 0,
  'selected': 7200159,
  'transfers_in': 0,
  'transfers_out': 0},
 {'element': 355,
  'fixture': 16,
  'opponent_team': 15,
  'total_points': 2,
  'was_home': True,
  'kickoff_time': '2023-08-19T19:00:00Z',
  'team_h_score': 1,
  'team_a_score': 0,
  'round': 2,
  'minutes':

In [85]:
# Load the merged per-gameweek table stored under ./data/2023-24/gws/.
gw_data = pathlib.Path.cwd().joinpath('data', '2023-24', 'gws', 'merged_gw.csv')
df = pd.read_csv(gw_data)

# Keep only the rows with more than zero minutes played.
played = df['minutes'] > 0
df = df.loc[played].reset_index(drop=True)

# Columns kept for the analysis, assembled group by group. The order of the
# final list is the column order of the frame once it is reduced to `features`.
base_columns = ['position_encoded', 'total_points']
home_away_columns = ['was_home', 'diff_ratio']
expected_points_columns = ['xP', 'avg_xP', 'sd_xP']
points_columns = ['avg_ppm', 'sd_ppm', 'avg_ppg', 'sd_ppg']
goal_columns = ['avg_goals', 'sd_goals']
assist_columns = ['avg_assists', 'sd_assists']
clean_sheet_columns = ['avg_cs', 'sd_cs']
goals_conceded_columns = ['avg_gc', 'sd_gc']
penalty_save_columns = ['avg_ps', 'sd_ps']
penalty_miss_columns = ['avg_pm', 'sd_pm']
save_columns = ['avg_saves', 'sd_saves']
ict_columns = [
    'avg_influence', 'avg_creativity', 'avg_threat', 'avg_ict_index',
    'sd_influence', 'sd_creativity', 'sd_threat', 'sd_ict_index',
]
forecast_columns = [
    'avg_expected_assists', 'avg_expected_goal_involvements', 'avg_expected_goals', 'avg_expected_goals_conceded',
    'sd_expected_assists', 'sd_expected_goal_involvements', 'sd_expected_goals', 'sd_expected_goals_conceded',
]
opponent_columns = ['opp_m', 'opp_f', 'opp_g', 'opp_d']

features = [
    *base_columns,
    *home_away_columns,
    *expected_points_columns,
    *points_columns,
    *goal_columns,
    *assist_columns,
    *clean_sheet_columns,
    *goals_conceded_columns,
    *penalty_save_columns,
    *penalty_miss_columns,
    *save_columns,
    *ict_columns,
    *forecast_columns,
    *opponent_columns,
]

# create a difficulty ratio
# NOTE: this rebinds `fixture_data` to a file path; an earlier cell uses the
# same name for the list of fixtures returned by the API.
fixture_data = pathlib.Path.cwd() / 'data' / '2023-24' / 'fixtures.csv'
fix_df = pd.read_csv(fixture_data)
# Only finished fixtures are joined; rows whose fixture is not finished keep
# NaN in the joined columns because this is a left merge.
finished_fixtures = fix_df.loc[fix_df['finished'], ['id', 'team_h_difficulty', 'team_a_difficulty', 'team_a', 'team_h']]
df = pd.merge(df, finished_fixtures, how='left', left_on='fixture', right_on='id')

# Attach the away and home team names (columns `name_away` / `name_home`).
teams_data = pathlib.Path.cwd() / 'data' / '2023-24' / 'teams.csv'
teams_df = pd.read_csv(teams_data)
df = pd.merge(df, teams_df[['id', 'name']], how='left', left_on='team_a', right_on='id', suffixes=('', '_away'))
df = pd.merge(df, teams_df[['id', 'name']], how='left', left_on='team_h', right_on='id', suffixes=('', '_home'))

# Vectorized replacement for the row-wise apply calls: pick the home-side or
# away-side value per row depending on `was_home`.
is_home = df['was_home'].astype(bool)
home_over_away = df['team_h_difficulty'] / df['team_a_difficulty']
away_over_home = df['team_a_difficulty'] / df['team_h_difficulty']
# Own side's difficulty divided by the other side's difficulty.
df['diff_ratio'] = home_over_away.where(is_home, away_over_home)
df['opponent'] = df['name_away'].where(is_home, df['name_home'])
df['team_name'] = df['name_home'].where(is_home, df['name_away'])

# create points per minute
# Assign the filled result back: calling fillna(inplace=True) on a column
# selection is not guaranteed to write through to the frame.
df['ppm'] = (df['total_points'] / df['minutes']).fillna(0)


def add_history_stats(frame, source, alias, window, group_col='name'):
    """Add four per-group history columns for one statistic.

    Every value is computed from the rows that precede the current row within
    its group (the series is shifted by one first), so a row never includes
    its own value of ``source``.

    Columns written to ``frame``:
        roll_<alias>     mean of the previous ``window`` rows
        roll_sd_<alias>  standard deviation of the previous ``window`` rows
        avg_<alias>      mean of all previous rows
        sd_<alias>       standard deviation of all previous rows

    Args:
        frame: DataFrame already sorted in the order the history should follow.
        source: Name of the column holding the raw statistic.
        alias: Suffix used in the names of the new columns.
        window: Number of previous rows used by the rolling statistics.
        group_col: Column identifying the group (default ``'name'``).

    Returns:
        ``frame`` itself, with the four columns added in place.
    """
    previous = frame.groupby(group_col)[source].shift()
    by_group = previous.groupby(frame[group_col])
    # transform() returns results on the frame's own index, so the assignment
    # lines up row by row without any index manipulation.
    frame[f'roll_{alias}'] = by_group.transform(lambda s: s.rolling(window=window).mean())
    frame[f'roll_sd_{alias}'] = by_group.transform(lambda s: s.rolling(window=window).std())
    frame[f'avg_{alias}'] = by_group.transform(lambda s: s.expanding().mean())
    frame[f'sd_{alias}'] = by_group.transform(lambda s: s.expanding().std())
    return frame


# start calculating mean and std dev of previous games
window_number = 5
# NOTE(review): rows are grouped by `name`; two rows sharing both name and GW
# have no explicit tie-breaker in this sort — confirm that is acceptable.
df = df.sort_values(by=['name', 'GW'])

# raw column -> suffix used in the generated column names
history_columns = {
    'xP': 'xP',
    'ppm': 'ppm',
    'total_points': 'ppg',
    'goals_scored': 'goals',
    'assists': 'assists',
    'clean_sheets': 'cs',
    'goals_conceded': 'gc',
    'penalties_saved': 'ps',
    'penalties_missed': 'pm',
    'saves': 'saves',
    'influence': 'influence',
    'creativity': 'creativity',
    'threat': 'threat',
    'ict_index': 'ict_index',
    'expected_assists': 'expected_assists',
    'expected_goal_involvements': 'expected_goal_involvements',
    'expected_goals': 'expected_goals',
    'expected_goals_conceded': 'expected_goals_conceded',
}
# Every statistic, xP included, goes through the same shifted computation.
for source_column, alias in history_columns.items():
    add_history_stats(df, source_column, alias, window_number)

# label encoder
le = LabelEncoder()
df['position_encoded'] = le.fit_transform(df['position'])
# Only rounds after the 10th are kept.
# NOTE(review): presumably so the expanding statistics have some history
# behind them — confirm the cut-off.
df = df[df['round'] > 10]

# Opponent strength: for each (round, team), the mean `avg_ppg` of its rows
# per position (pivot_table's default aggregation), joined onto each row via
# that row's opponent.
# NOTE(review): a discarded earlier statement averaged only the two largest
# `avg_ppg` values per position/round/team; that result was overwritten before
# use, so the plain mean below is what has always been applied. Confirm which
# definition is wanted.
weekly_data = df.pivot_table(index=['round', 'team_name'], columns='position', values='avg_ppg').reset_index()
df = pd.merge(df, weekly_data, how='left', left_on=['round', 'opponent'], right_on=['round', 'team_name'])
df = df.rename(columns={'FWD': 'opp_f', 'MID': 'opp_m', 'DEF': 'opp_d', 'GK': 'opp_g'})

df = df[features]
print(df.describe())
print('')

# For each position, rank the columns by correlation with total_points and
# keep the names at ranks 1-4 (rank 0 is expected to be total_points itself).
pos = df['position_encoded'].unique()
corrs = {}
for p in pos:
    print(f'Position: {le.inverse_transform([p])[0]}')
    # Compute the correlation table once and reuse it for printing and selection.
    target_corr = df[df['position_encoded'] == p].corr()['total_points'].sort_values(ascending=False)
    print(target_corr)
    corrs[p] = target_corr.index[1:5]

print(corrs)


       position_encoded  total_points   diff_ratio           xP       avg_xP  \
count       4753.000000   4753.000000  4753.000000  4753.000000  4668.000000   
mean           1.662739      2.777614     1.140504     2.495329     2.351624   
std            1.354035      3.023405     0.586577     2.159447     1.427646   
min            0.000000     -4.000000     0.400000    -1.800000    -1.300000   
25%            0.000000      1.000000     0.666667     1.000000     1.300000   
50%            2.000000      2.000000     1.000000     2.000000     2.190097   
75%            3.000000      3.000000     1.500000     3.500000     3.254972   
max            3.000000     22.000000     2.500000    16.700000    10.300000   

             sd_xP      avg_ppm       sd_ppm      avg_ppg       sd_ppg  ...  \
count  4579.000000  4668.000000  4579.000000  4668.000000  4579.000000  ...   
mean      1.332232     0.092609     0.120414     2.714886     2.460759  ...   
std       0.665353     0.131403     0.1763

In [88]:

# Fit one random-forest regressor per position on the four columns selected
# in `corrs`, and report hold-out metrics.

# `models` is not created by any earlier visible cell; create it when missing
# so this cell runs on a fresh kernel, without discarding an existing dict.
if 'models' not in globals():
    models = {}

print('Random Forest Regressor')
for p in pos:
    position_name = le.inverse_transform([p])[0]
    print(f'Position: {position_name}')

    position_rows = df[df['position_encoded'] == p]
    # Dropping the target first means a KeyError is raised if total_points
    # ever ends up among the selected columns.
    X = position_rows.drop(columns=['total_points'])[corrs[p]]
    y = position_rows['total_points']
    print(f'Average Total Points: {y.mean()}')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # The scaler is fitted on the training split only and reused on the test split.
    scaler = Scaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Seed the forest as well as the split so the printed metrics are reproducible.
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    models[position_name] = model

    mse = mean_squared_error(y_test, y_pred)
    print(f'MSE: {mse}')
    print(f'RMSE: {np.sqrt(mse)}')
    print(f'R2: {model.score(X_test, y_test)}')
    print(f'Feature Names: {X.columns}')
    importance_str = ', '.join(f'{importance:.2f}' for importance in model.feature_importances_)
    print(f'Feature Importance: {importance_str}')
    print('')

Random Forest Regressor
Position: DEF
Average Total Points: 2.3993788819875776
MSE: 6.174858116514644
RMSE: 2.484926179288762
R2: 0.26077233879734163
Feature Names: Index(['xP', 'avg_xP', 'avg_ict_index', 'avg_ppg'], dtype='object')
Feature Importance: 0.45, 0.16, 0.15, 0.23

Position: GK
Average Total Points: 3.1545454545454548
MSE: 5.8331574077861985
RMSE: 2.4151930373753148
R2: -0.07243631740668888
Feature Names: Index(['xP', 'sd_ps', 'avg_ps', 'avg_threat'], dtype='object')
Feature Importance: 0.77, 0.12, 0.10, 0.01

Position: MID
Average Total Points: 2.88803611738149
MSE: 6.501250287501343
RMSE: 2.5497549465588536
R2: 0.28760895404037434
Feature Names: Index(['xP', 'avg_ict_index', 'avg_expected_goal_involvements', 'avg_xP'], dtype='object')
Feature Importance: 0.48, 0.17, 0.16, 0.19

Position: FWD
Average Total Points: 3.178929765886288
MSE: 6.315335110648148
RMSE: 2.5130330500509035
R2: 0.19429414470206396
Feature Names: Index(['xP', 'avg_ict_index', 'avg_threat', 'avg_creativi