# Current Data

In [64]:
import requests
import pandas as pd
import json
import plotly.express as px

## Todo list
- Only data from current season for now, model is limited at the start of a new season -> a new model has to be developed that uses weighted historical data
- Just use points per minute against cost for now
- Ignores current match fixture difficulty for now
- Evaluates players that have more than 0% chance to play for now -> there should be a classification model to determine who coaches will pick in the starting line up
- Investigate data anomalies that can skew calculations
- Come up with accuracy prediction evaluations each week
- Consider set piece takers

In [249]:
GAMEWEEK = 26

# Have to run this before they update the data each week
url = "https://fantasy.premierleague.com/api/bootstrap-static/"
response = requests.get(url)
general_data = response.json()
players = general_data['elements']
# players = [x for x in players if x['chance_of_playing_next_round'] != 0 and x['chance_of_playing_next_round'] != None]
players = [x for x in players if x['chance_of_playing_this_round'] != 0 and x['chance_of_playing_this_round'] != None]
players = [x['id'] for x in players]
print(f"{len(players)} players have a chance to play in week {GAMEWEEK}")
print("")

url = "https://fantasy.premierleague.com/api/fixtures/"
response = requests.get(url)
fixture_data = response.json()
fixture_data = [x for x in fixture_data if x['event'] == GAMEWEEK]
print("1 is easy fixture, 5 is difficult fixture")
for i in range(len(fixture_data)):
    fixture_data[i]['team_a'] = general_data['teams'][fixture_data[i]['team_a']-1]['name']
    fixture_data[i]['team_h'] = general_data['teams'][fixture_data[i]['team_h']-1]['name']
    print(f"{fixture_data[i]['team_h']}, {fixture_data[i]['team_h_difficulty']} vs {fixture_data[i]['team_a']}, {fixture_data[i]['team_a_difficulty']}, {fixture_data[i]['kickoff_time']}")

319 players have a chance to play in week 26

1 is easy fixture, 5 is difficult fixture
Aston Villa, 2 vs Nott'm Forest, 4, 2024-02-24T15:00:00Z
Brighton, 2 vs Everton, 3, 2024-02-24T15:00:00Z
Crystal Palace, 2 vs Burnley, 2, 2024-02-24T15:00:00Z
Man Utd, 2 vs Fulham, 4, 2024-02-24T15:00:00Z
Bournemouth, 5 vs Man City, 2, 2024-02-24T17:30:00Z
Arsenal, 3 vs Newcastle, 4, 2024-02-24T20:00:00Z
Wolves, 2 vs Sheffield Utd, 2, 2024-02-25T13:30:00Z
West Ham, 3 vs Brentford, 3, 2024-02-26T20:00:00Z


In [250]:
# https://fantasy.premierleague.com/api/element-summary/{player-id}/
# https://fantasy.premierleague.com/api/event/{GW}/live/

data = pd.DataFrame()
for i in range(GAMEWEEK-1):
    url = f"https://fantasy.premierleague.com/api/event/{i+1}/live/"
    response = requests.get(url)
    live_data = response.json()
    live_data = live_data['elements']
    live_df = pd.DataFrame(live_data)
    live_df = live_df[['stats']]
    live_df = pd.json_normalize(live_df['stats']).reset_index()
    live_df = live_df[live_df['index'].isin(players)]
    data = pd.concat([data, live_df], ignore_index=True)

print(data.head())

   index  minutes  goals_scored  assists  clean_sheets  goals_conceded  \
0      3        0             0        0             0               0   
1      4        4             0        0             0               0   
2      5       90             0        0             0               1   
3      6        0             0        0             0               0   
4      8        0             0        0             0               0   

   own_goals  penalties_saved  penalties_missed  yellow_cards  ...  \
0          0                0                 0             0  ...   
1          0                0                 0             0  ...   
2          0                0                 0             0  ...   
3          0                0                 0             0  ...   
4          0                0                 0             0  ...   

   creativity  threat  ict_index  starts expected_goals expected_assists  \
0         0.0     0.0        0.0       0           0.00   

In [251]:
eval = data.groupby(['index'])[['minutes', 'total_points']].sum()
# select players who at least have a historical record
eval = eval[eval['minutes'] > 0]
# select players who at least are positive
eval = eval[eval['total_points'] > 0]
print(f"We will pick from a pool of {len(eval)}")

# for each index, get the player name
for i in eval.index:
    # search the id in general data and pull the name
    for j in range(len(general_data['elements'])):
        if general_data['elements'][j]['id'] == i:
            eval.loc[i, 'ppg'] = general_data['elements'][j]['points_per_game']
            eval.loc[i, 'cost'] = general_data['elements'][j]['now_cost']
            eval.loc[i, 'first_name'] = general_data['elements'][j]['first_name']
            eval.loc[i, 'second_name'] = general_data['elements'][j]['second_name']
            eval.loc[i, 'team'] = general_data['teams'][general_data['elements'][j]['team']-1]['name']
            eval.loc[i, 'position'] = general_data['elements'][j]['element_type']
            eval.loc[i, 'chance'] = general_data['elements'][j]['chance_of_playing_this_round']
            eval.loc[i, 'news'] = general_data['elements'][j]['news']
            eval.loc[i, 'news_added'] = general_data['elements'][j]['news_added']
            break

eval['name'] = eval['first_name'] + " " + eval['second_name']
eval['ppm'] = eval['total_points'] / eval['minutes'] * eval['chance']
eval['ppmcost'] = eval['ppm'] / eval['cost']
eval['ppg'] = eval['ppg'].astype(float)
eval['ppg'] = eval['ppg'] * eval['chance']
eval = eval.sort_values(by='ppmcost', ascending=False)
eval['position'] = eval['position'].map({1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'})

We will pick from a pool of 216


In [252]:
# select players who play alot for more consistency, try to aim for 50 players to select from
# create a deep copy of eval
ppmcost = eval.copy(deep=True)
ppmcost = ppmcost[ppmcost['minutes'] > 1450]
ppmcost = ppmcost[ppmcost['ppmcost'] > 0.05]
print(f"{len(ppmcost)} players have played more than 1250 minutes and selected for ppmcost algo")

fig = px.scatter(ppmcost, x='minutes', y='ppmcost', hover_data=[
                 'name', 'team', 'position'], title='Minutes vs PPMCost', color='position')
fig.show()

# We can only max 3 players from each team
team_count = {'Spurs': 0,
              "Nott'm Forest": 0,
              'Liverpool': 0,
              'Bournemouth': 0,
              'Aston Villa': 0,
              'Man City': 0,
              'Arsenal': 0,
              'Crystal Palace': 0,
              'Man Utd': 0,
              'Fulham': 0,
              'Luton': 0,
              'Everton': 0,
              'Brentford': 0,
              'Newcastle': 0,
              'Wolves': 0,
              'West Ham': 0,
              'Chelsea': 0,
              'Sheffield Utd': 0,
              'Brighton': 0,
              'Burnley': 0, }

team_count = dict(sorted(team_count.items()))

# ppmcost algo
# select 11 players from eval
# first select the top GK
# then select the top FWD
# then select the top 3 DEF
# then select 6 more players up to 2 more DEF, 5 MID and 2 more FWD

# select the top GK
gk = ppmcost[ppmcost['position'] == 'GK'].sort_values(
    by='ppmcost', ascending=False).head(1)
ppmcost = ppmcost[~ppmcost.index.isin(gk.index)]
gk_count = 1
team_count[gk['team'].values[0]] += 1

# select the top FWD
fwd = ppmcost[ppmcost['position'] == 'FWD'].sort_values(
    by='ppmcost', ascending=False).head(1)
fwd_count = 1
team_count[fwd['team'].values[0]] += 1
ppmcost = ppmcost[~ppmcost.index.isin(fwd.index)]

selected_11 = pd.concat([gk, fwd])

# select the top 3 DEF
defe = ppmcost[ppmcost['position'] == 'DEF'].sort_values(
    by='ppmcost', ascending=False)
def_count = 0
for i in defe.index:
    # check if any team has more than 3 players
    if team_count[defe.loc[i, 'team']] > 3:
        continue
    else:
        selected_11 = pd.concat(
            [selected_11, defe.loc[i, :].to_frame().T])
        def_count += 1
        team_count[defe.loc[i, 'team']] += 1
        ppmcost = ppmcost[~ppmcost.index.isin([i])]
        if def_count == 3:
            break
        else:
            continue

mid_count = 0

def select_6():
    global ppmcost, gk_count, def_count, fwd_count, mid_count, team_count
    ppmcost = ppmcost.sort_values(by='ppmcost', ascending=False)
    selected_6 = pd.DataFrame()
    for i in ppmcost.index:
        if ppmcost.loc[i, 'position'] == 'DEF' and def_count < 5:
            # concat the player row to the selected_6 dataframe
            selected_6 = pd.concat(
                [selected_6, ppmcost.loc[i, :].to_frame().T])
            def_count += 1
            team_count[ppmcost.loc[i, 'team']] += 1
        elif ppmcost.loc[i, 'position'] == 'MID' and mid_count < 5:
            selected_6 = pd.concat(
                [selected_6, ppmcost.loc[i, :].to_frame().T])
            mid_count += 1
            team_count[ppmcost.loc[i, 'team']] += 1
        elif ppmcost.loc[i, 'position'] == 'FWD' and fwd_count < 3:
            selected_6 = pd.concat(
                [selected_6, ppmcost.loc[i, :].to_frame().T])
            fwd_count += 1
            team_count[ppmcost.loc[i, 'team']] += 1
        elif ppmcost.loc[i, 'position'] == 'GK' and gk_count < 1:
            selected_6 = pd.concat(
                [selected_6, ppmcost.loc[i, :].to_frame().T])
            gk_count += 1
            team_count[ppmcost.loc[i, 'team']] += 1
        else:
            continue
        if gk_count + def_count + fwd_count + mid_count == 11:
            print(f"Final Selection: {gk_count} GK, {
                  def_count} DEF, {fwd_count} FWD, {mid_count} MID")
            print("")
            return selected_6


selected_6 = select_6()
selected_11 = pd.concat([selected_6, selected_11])

# Assume 4m from bench of 4 players
BUDGET = 840
print(team_count)
print("")
print(f"Current budget spent: {selected_11['cost'].sum()}, remaining budget: {
      BUDGET - selected_11['cost'].sum()}")
print("")
print(selected_11)

52 players have played more than 1250 minutes and selected for ppmcost algo


Final Selection: 1 GK, 5 DEF, 2 FWD, 3 MID

{'Arsenal': 1, 'Aston Villa': 1, 'Bournemouth': 2, 'Brentford': 0, 'Brighton': 0, 'Burnley': 0, 'Chelsea': 0, 'Crystal Palace': 1, 'Everton': 0, 'Fulham': 1, 'Liverpool': 2, 'Luton': 0, 'Man City': 2, 'Man Utd': 0, 'Newcastle': 0, "Nott'm Forest": 0, 'Sheffield Utd': 0, 'Spurs': 1, 'West Ham': 0, 'Wolves': 0}

Current budget spent: 547.0, remaining budget: 293.0

    minutes total_points    ppg  cost first_name         second_name  \
515    1808          142  110.0  45.0     Oliver               Skipp   
302    1652           97  240.0  49.0   Ibrahima              Konaté   
342    2082          123  370.0  51.0     Nathan                 Aké   
615    1463           78  170.0  48.0    Matheus  França de Oliveira   
662    2132          107  130.0  49.0      Adama              Traoré   
82     1565           67  290.0  45.0    Antoine             Semenyo   
352    2040          132  340.0  55.0    Ederson   Santana de Moraes   
13     1959   

In [253]:
# If there is still budget, excecute the maximise pointsperminute algo

# Maximise points per minute
# Sort Eval by ppm
# Substitute the lowest ppmcost player with the highest ppm player if it already does not exist
# If the player already exists, then substitute with the next highest ppm player
ppm = eval.copy(deep=True)
ppm = ppm[ppm['minutes'] > 1450]
ppm = ppm.sort_values(by=['ppm','ppmcost'], ascending=False)
ppm = ppm[~ppm.index.isin(selected_11.index)]
print(f"{len(ppm)} players have played more than 1450 minutes and selected for ppm algo")

def substitute_for_points():
    global ppm, selected_11, team_count, gk_count, def_count, fwd_count, mid_count
    selected_11 = selected_11.sort_values(by=['ppm', 'ppmcost'], ascending=True)
    # print(selected_11)

    # Substitute the lowest ppm player
    for i in selected_11.index:
        if selected_11.loc[i, 'position'] == 'GK':
            gk_count -= 1
        elif selected_11.loc[i, 'position'] == 'DEF':
            def_count -= 1
        elif selected_11.loc[i, 'position'] == 'MID':
            mid_count -= 1
        elif selected_11.loc[i, 'position'] == 'FWD':
            fwd_count -= 1
        # remove the count from team_count
        team_count[selected_11.loc[i, 'team']] -= 1
        # print(f"Substituting: {selected_11.loc[i, ['name','ppm']]}")

        # Find the highest possible ppm player
        for j in ppm.index:
            sub = False
            if selected_11.loc[i, 'ppm'] > ppm.loc[j, 'ppm']:
                pass
            elif team_count[ppm.loc[j, 'team']] > 2:
                pass
            elif ppm.loc[j, 'cost'] + selected_11['cost'].sum() > BUDGET+20:
                pass
            elif ppm.loc[j, 'position'] == 'GK' and gk_count > 0:
                pass
            elif ppm.loc[j, 'position'] == 'DEF' and def_count > 4:
                pass
            elif ppm.loc[j, 'position'] == 'MID' and mid_count > 4:
                pass
            elif ppm.loc[j, 'position'] == 'FWD' and fwd_count > 2:
                pass
            else:
                # remove the player from selected_11
                sub = True
                print(f"Substituted: {selected_11.loc[i,['name','ppm']].values} for  {ppm.loc[j, ['name','ppm']].values}")
                selected_11 = selected_11[~selected_11.index.isin([i])]
                selected_11 = pd.concat([selected_11, ppm.loc[j, :].to_frame().T])
                selected_11 = selected_11.sort_values(by=['ppm', 'ppmcost'], ascending=True)
                team_count[ppm.loc[j, 'team']] += 1
                if ppm.loc[j, 'position'] == 'GK':
                    gk_count += 1
                elif ppm.loc[j, 'position'] == 'DEF':
                    def_count += 1
                elif ppm.loc[j, 'position'] == 'MID':
                    mid_count += 1
                elif ppm.loc[j, 'position'] == 'FWD':
                    fwd_count += 1
                ppm = ppm[~ppm.index.isin([j])]
                break
        if sub == False:
            if selected_11.loc[i, 'position'] == 'GK':
                gk_count += 1
            elif selected_11.loc[i, 'position'] == 'DEF':
                def_count += 1
            elif selected_11.loc[i, 'position'] == 'MID':
                mid_count += 1
            elif selected_11.loc[i, 'position'] == 'FWD':
                fwd_count += 1
            team_count[selected_11.loc[i, 'team']] += 1
        if selected_11['cost'].sum() > BUDGET:
            print("budget reached")
            break
     
    return selected_11

selected_11 = substitute_for_points()
print(f"Final Selection: {gk_count} GK, {def_count} DEF, {fwd_count} FWD, {mid_count} MID")
print(f"Current budget spent: {selected_11['cost'].sum()}, remaining budget: {BUDGET - selected_11['cost'].sum()}")
print(team_count)
print("")
print(selected_11)
                

51 players have played more than 1450 minutes and selected for ppm algo
Substituted: ['Antoine Semenyo' 4.281150159744409] for  ['Mohamed Salah' 4.942736588306208]
Substituted: ['Lucas Digne' 5.587949465500486] for  ['Diogo Dalot Teixeira' 5.654578979717271]
Final Selection: 1 GK, 5 DEF, 1 FWD, 4 MID
Current budget spent: 638.0, remaining budget: 202.0
{'Arsenal': 1, 'Aston Villa': 0, 'Bournemouth': 1, 'Brentford': 0, 'Brighton': 0, 'Burnley': 0, 'Chelsea': 0, 'Crystal Palace': 1, 'Everton': 0, 'Fulham': 1, 'Liverpool': 3, 'Luton': 0, 'Man City': 2, 'Man Utd': 1, 'Newcastle': 0, "Nott'm Forest": 0, 'Sheffield Utd': 0, 'Spurs': 1, 'West Ham': 0, 'Wolves': 0}

    minutes total_points    ppg   cost first_name         second_name  \
308    1659           82  800.0  130.0    Mohamed               Salah   
662    2132          107  130.0   49.0      Adama              Traoré   
615    1463           78  170.0   48.0    Matheus  França de Oliveira   
377    1627           92  320.0   52.0   

In [254]:
# If there is still budget, excecute the maximise pointspergame algo

# Maximise points per game
# Sort Eval by ppg
# Substitute the lowest ppg player with the highest ppg player if it already does not exist
ppg = eval.copy(deep=True)
minutes_threshold = 150
ppg = ppg[ppg['minutes'] > minutes_threshold]
ppg = ppg.sort_values(by='ppg', ascending=False)
ppg = ppg[~ppg.index.isin(selected_11.index)]
print(f"{len(ppg)} players have played more than {minutes_threshold} minutes and selected for ppg algo")

def substitute_for_points():
    global ppg, selected_11, team_count, gk_count, def_count, fwd_count, mid_count
    selected_11 = selected_11.sort_values(by=['ppg', 'ppm', 'ppmcost'], ascending=True)
    # print(selected_11)

    # Substitute the lowest ppg player
    for i in selected_11.index:
        if selected_11.loc[i, 'position'] == 'GK':
            gk_count -= 1
        elif selected_11.loc[i, 'position'] == 'DEF':
            def_count -= 1
        elif selected_11.loc[i, 'position'] == 'MID':
            mid_count -= 1
        elif selected_11.loc[i, 'position'] == 'FWD':
            fwd_count -= 1
        team_count[selected_11.loc[i, 'team']] -= 1

        # Find the highest possible ppg player
        for j in ppg.index:
            sub = False
            if selected_11.loc[i, 'ppg'] > ppg.loc[j, 'ppg']:
                pass
            elif team_count[ppg.loc[j, 'team']] > 2:
                pass
            elif ppg.loc[j, 'cost'] + selected_11['cost'].sum() > BUDGET+20:
                pass
            elif ppg.loc[j, 'position'] == 'GK' and gk_count > 0:
                pass
            elif ppg.loc[j, 'position'] == 'DEF' and def_count > 4:
                pass
            elif ppg.loc[j, 'position'] == 'MID' and mid_count > 4:
                pass
            elif ppg.loc[j, 'position'] == 'FWD' and fwd_count > 2:
                pass
            else:
                # remove the player from selected_11
                sub = True
                print(f"Substituted: {selected_11.loc[i,['name','ppg']].values} for  {ppg.loc[j, ['name','ppg']].values}")
                selected_11 = selected_11[~selected_11.index.isin([i])]
                selected_11 = pd.concat([selected_11, ppg.loc[j, :].to_frame().T])
                selected_11 = selected_11.sort_values(by=['ppg', 'ppm', 'ppmcost'], ascending=True)
                team_count[ppg.loc[j, 'team']] += 1
                if ppg.loc[j, 'position'] == 'GK':
                    gk_count += 1
                elif ppg.loc[j, 'position'] == 'DEF':
                    def_count += 1
                elif ppg.loc[j, 'position'] == 'MID':
                    mid_count += 1
                elif ppg.loc[j, 'position'] == 'FWD':
                    fwd_count += 1
                ppg = ppg[~ppg.index.isin([j])]
                break
        if sub == False:
            if selected_11.loc[i, 'position'] == 'GK':
                gk_count += 1
            elif selected_11.loc[i, 'position'] == 'DEF':
                def_count += 1
            elif selected_11.loc[i, 'position'] == 'MID':
                mid_count += 1
            elif selected_11.loc[i, 'position'] == 'FWD':
                fwd_count += 1
            team_count[selected_11.loc[i, 'team']] += 1
        if selected_11['cost'].sum() > BUDGET:
            print("budget reached")
            break
     
    return selected_11

selected_11 = substitute_for_points()
print(f"Final Selection: {gk_count} GK, {def_count} DEF, {fwd_count} FWD, {mid_count} MID")
print(f"Current budget spent: {selected_11['cost'].sum()}, remaining budget: {BUDGET - selected_11['cost'].sum()}")
print(team_count)
print("")
print(selected_11)

178 players have played more than 150 minutes and selected for ppg algo
Substituted: ['Oliver Skipp' 110.00000000000001] for  ['Erling Haaland' 700.0]
Substituted: ['Adama Traoré' 130.0] for  ['Bukayo Saka' 680.0]
Substituted: ['Matheus França de Oliveira' 170.0] for  ['Douglas Luiz Soares de Paulo' 520.0]
Substituted: ['Ibrahima Konaté' 240.0] for  ['Pascal Groß' 509.99999999999994]
Substituted: ['Adam Smith' 250.0] for  ['Hwang Hee-chan' 490.00000000000006]
Substituted: ['Andrew Robertson' 270.0] for  ['Harry Maguire' 350.0]
Substituted: ['Eddie Nketiah' 280.0] for  ['William Saliba' 400.0]
Substituted: ['Diogo Dalot Teixeira' 320.0] for  ['Gabriel dos Santos Magalhães' 400.0]
Substituted: ['Ederson Santana de Moraes' 340.0] for  ['Alphonse Areola' 380.0]
Substituted: ['Nathan Aké' 370.0] for  ['Dominic Solanke' 509.99999999999994]
Final Selection: 1 GK, 3 DEF, 2 FWD, 5 MID
Current budget spent: 805.0, remaining budget: 35.0
{'Arsenal': 3, 'Aston Villa': 1, 'Bournemouth': 1, 'Brentfo

Total Model Team Points: 56 
- Harry - 4
- Areola -
- Saliba - 2
- Gabriel - 2
- Saka - 10 x 2
- Hee Chan
- Solanke - 2 
- Pascal - 7
- Douglas Luiz - 15
- Haaland - 5
- Salah - 0


In [70]:
general_data['elements']

[{'chance_of_playing_next_round': 0,
  'chance_of_playing_this_round': 0,
  'code': 232223,
  'cost_change_event': 0,
  'cost_change_event_fall': 0,
  'cost_change_start': -1,
  'cost_change_start_fall': 1,
  'dreamteam_count': 0,
  'element_type': 4,
  'ep_next': '0.0',
  'ep_this': '0.0',
  'event_points': 0,
  'first_name': 'Folarin',
  'form': '0.0',
  'id': 1,
  'in_dreamteam': False,
  'news': 'Transferred to Monaco',
  'news_added': '2023-08-31T08:55:15.272751Z',
  'now_cost': 44,
  'photo': '232223.jpg',
  'points_per_game': '0.0',
  'second_name': 'Balogun',
  'selected_by_percent': '0.2',
  'special': False,
  'squad_number': None,
  'status': 'u',
  'team': 1,
  'team_code': 3,
  'total_points': 0,
  'transfers_in': 10024,
  'transfers_in_event': 0,
  'transfers_out': 63059,
  'transfers_out_event': 13,
  'value_form': '0.0',
  'value_season': '0.0',
  'web_name': 'Balogun',
  'minutes': 0,
  'goals_scored': 0,
  'assists': 0,
  'clean_sheets': 0,
  'goals_conceded': 0,
  'o

# historical data

In [61]:

# import from data folder which is one directory up
df = pd.read_csv('../../data/2023-24/gws/merged_gw.csv')
# let the output width be fully shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print(df.columns)
print(df.head())
print(df.describe())

Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'round', 'saves', 'selected', 'starts', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'was_home', 'yellow_cards', 'GW'],
      dtype='object')
                     name position           team   xP  assists  bonus  bps  \
0             Femi Seriki      DEF  Sheffield Utd  0.5        0      0    0   
1        Jack Hinshelwood      MID       Brighton  1.5        0      0    0   
2            Jadon Sancho      MID        Man Utd  3.0        0      0    4   
3  Rhys Norrington-Davies 