# Step 1 – Backfill

### Setting up Hopsworks

In [165]:
import hopsworks
import os

In [166]:
# Load the Hopsworks API key from a local file when one is available and
# expose it through the HOPSWORKS_API_KEY environment variable. If the file
# cannot be opened, the variable is left untouched and login proceeds anyway.
try:
    with open('./hopsworks/hopsworks-api-key.txt', 'r') as file:
        os.environ["HOPSWORKS_API_KEY"] = file.read().rstrip()
except OSError:
    # Only a missing/unreadable key file means "production mode"; any other
    # failure (or a KeyboardInterrupt) should surface instead of being hidden.
    print("In production mode")

project = hopsworks.login()
print(project.description)

2025-01-05 11:51:23,117 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-05 11:51:23,121 INFO: Initializing external client
2025-01-05 11:51:23,121 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-05 11:51:24,371 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1159321
Default project


### Getting historical data

In [167]:
import requests
import pandas as pd
import numpy as np

In [168]:
# Endpoints of the Fantasy Premier League API used in this notebook.
# Entries containing {placeholders} are templates and must be filled in with
# str.format() before being requested.
# NOTE(review): the fixtures query parameter is spelled `futures` here;
# confirm the spelling against the API before relying on the filter.
FPL_API_ROOT = "https://fantasy.premierleague.com/api/"

bootstrap_url = FPL_API_ROOT + "bootstrap-static/"
fixtures_url = FPL_API_ROOT + "fixtures?futures={future}"
player_details_url = FPL_API_ROOT + "element-summary/{element_id}"
player_stats_gw_url = FPL_API_ROOT + "event/{gw}/live/"

## Fetching general data from https://fantasy.premierleague.com/api/bootstrap-static/

### From the general data, we take general information about the players, teams, and events (gameweeks).

In [169]:
# Download the bootstrap payload and decode it from JSON.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an explicit exception instead of
# letting an error body be parsed as if it were data.
bootstrap_response = requests.get(bootstrap_url, timeout=30)
bootstrap_response.raise_for_status()
general_data = bootstrap_response.json()

In [170]:
# fixtures_url is a template: its {future} placeholder has to be filled in,
# otherwise the literal text "{future}" is sent as the query value.
# NOTE(review): 1 is assumed to select upcoming fixtures — confirm against the API.
upcoming_fixtures_response = requests.get(fixtures_url.format(future=1), timeout=30)
# Fail loudly on an HTTP error rather than decoding an error body as data.
upcoming_fixtures_response.raise_for_status()
upcoming_fixtures_data = upcoming_fixtures_response.json()

In [171]:
#played_fixtures_data = requests.get(fixtures_url.format(future=False)).json()

In [172]:
#element_id = 1  # Set the desired player element ID
#player_details_url_formatted = player_details_url.format(element_id=element_id)
#player_details = requests.get(player_details_url_formatted).json()

In [173]:
#events_df = pd.DataFrame(general_data["events"])
#events_df.columns

In [174]:
#players_df = pd.DataFrame(general_data["elements"])
#players_df.columns

In [175]:
#teams_df = pd.DataFrame(general_data["teams"])
#teams_df.columns

In [176]:
#upcoming_fixtures_df = pd.DataFrame(upcoming_fixtures_data)
#upcoming_fixtures_data

In [177]:
#player_details_fixtures_df = pd.DataFrame(player_details["fixtures"])
#player_details_history_df = pd.DataFrame(player_details["history"])
#player_details_history_past_df = pd.DataFrame(player_details["history_past"])

In [178]:
#player_details_fixtures_df.columns

In [179]:
#player_details_history_df.columns

In [180]:
#player_details_history_past_df.columns

# Backfill player stats for each game week

### Init vars

In [181]:
# State for the backfill loop: one DataFrame per fetched gameweek is collected
# in the list, and the counter holds the next gameweek number to request.
all_gameweek_data = list()
gameweek = 1

### Loop through each gameweek and fetch player stats

### Finally concat into single dataframe

In [182]:
# Highest gameweek number that the backfill will fetch.
MAX_BACKFILL_GAMEWEEK = 19
# Seconds to wait for the API before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch the live stats of every gameweek, starting at `gameweek`, until the cap
# is passed or the API returns a gameweek without any player entries.
while True:
    # Check the cap before issuing the request so that no call is spent on a
    # gameweek whose data would be thrown away.
    if gameweek > MAX_BACKFILL_GAMEWEEK:
        print(f"No more data available after gameweek {gameweek - 1}")
        break

    player_stats_gw_url_formatted = player_stats_gw_url.format(gw=gameweek)
    response = requests.get(player_stats_gw_url_formatted, timeout=REQUEST_TIMEOUT_SECONDS)
    # Surface HTTP errors explicitly instead of failing later on a missing key.
    response.raise_for_status()
    player_stats = response.json()

    if len(player_stats["elements"]) == 0:
        print(f"No more data available after gameweek {gameweek - 1}")
        break

    player_stats_gw_df = pd.DataFrame(player_stats["elements"]).drop(columns=["explain", "modified"])
    # Expand the nested "stats" dict of each player into one column per stat.
    player_stats_gw_df = pd.concat(
        [player_stats_gw_df.drop(columns=["stats"]), player_stats_gw_df["stats"].apply(pd.Series)],
        axis=1
    )

    player_stats_gw_df["gameweek"] = gameweek

    all_gameweek_data.append(player_stats_gw_df)

    print(f"Fetched data for gameweek {gameweek}")

    gameweek += 1

# pd.concat raises a generic ValueError on an empty list; say what went wrong.
if not all_gameweek_data:
    raise ValueError("No gameweek data was fetched; nothing to combine.")

all_gameweeks_df = pd.concat(all_gameweek_data, ignore_index=True)

print("All gameweek data fetched and combined.")

Fetched data for gameweek 1
Fetched data for gameweek 2
Fetched data for gameweek 3
Fetched data for gameweek 4
Fetched data for gameweek 5
Fetched data for gameweek 6
Fetched data for gameweek 7
Fetched data for gameweek 8
Fetched data for gameweek 9
Fetched data for gameweek 10
Fetched data for gameweek 11
Fetched data for gameweek 12
Fetched data for gameweek 13
Fetched data for gameweek 14
Fetched data for gameweek 15
Fetched data for gameweek 16
Fetched data for gameweek 17
Fetched data for gameweek 18
Fetched data for gameweek 19
No more data available after gameweek 19
All gameweek data fetched and combined.


## Saving the last backfilled gameweek

In [183]:
# Persist the number of the last gameweek that was fetched. The backfill loop
# stops with `gameweek` one past that gameweek, hence the `- 1`.
state_file = './state/prev_gameweek.txt'
# Create the state directory when it is missing (e.g. on a fresh checkout) so
# that open() does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(state_file), exist_ok=True)
with open(state_file, 'w') as file:
    file.write(str(gameweek-1))

In [184]:
# Preview the first five rows of the combined per-gameweek frame.
all_gameweeks_df.head(n=5)

Unnamed: 0,id,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,...,threat,ict_index,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,total_points,in_dreamteam,gameweek
0,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0,False,1
1,2,5,0,0,0,0,0,0,0,1,...,0.0,0.1,0,0.0,0.0,0.0,0.15,0,False,1
2,3,90,0,0,1,0,0,0,0,0,...,8.0,2.3,1,0.12,0.02,0.14,0.47,6,False,1
3,4,90,1,1,1,0,0,0,0,0,...,46.0,12.5,1,0.45,0.04,0.49,0.47,12,True,1
4,5,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0,False,1


In [185]:
# Stats whose value from the player's previous row is used as a model feature.
lagged_features = ['minutes', 'goals_scored', 'assists', 'clean_sheets',
                   'goals_conceded', 'own_goals', 'penalties_saved',
                   'penalties_missed', 'yellow_cards', 'red_cards', 'saves',
                   'bonus', 'bps', 'influence', 'creativity', 'threat',
                   'ict_index', 'starts', 'expected_goals', 'expected_assists',
                   'expected_goal_involvements', 'expected_goals_conceded',
                   'in_dreamteam']
prev_feature_columns = [f'prev_{col}' for col in lagged_features]

# Shift all stats by one row within each player ('id') in a single grouped
# operation. Rows are appended in gameweek order by the backfill loop, so the
# previous row of a player is that player's preceding fetched gameweek.
lagged_values = all_gameweeks_df.groupby('id')[lagged_features].shift(1)
lagged_values.columns = prev_feature_columns

# Build the result in a new frame instead of editing all_gameweeks_df in
# place: the cell can then be re-run without a KeyError on already-dropped
# columns, and earlier previews of all_gameweeks_df stay accurate.
# Rows without a previous observation (each player's first row) or without a
# label are removed.
final_data = pd.concat(
    [all_gameweeks_df.drop(columns=lagged_features), lagged_values],
    axis=1,
).dropna(subset=prev_feature_columns + ['total_points'])

In [186]:
# Preview the first five rows of the lagged feature frame.
final_data.head(n=5)

Unnamed: 0,id,total_points,gameweek,prev_minutes,prev_goals_scored,prev_assists,prev_clean_sheets,prev_goals_conceded,prev_own_goals,prev_penalties_saved,...,prev_influence,prev_creativity,prev_threat,prev_ict_index,prev_starts,prev_expected_goals,prev_expected_assists,prev_expected_goal_involvements,prev_expected_goals_conceded,prev_in_dreamteam
616,1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
617,2,0,2,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.8,0.0,0.1,0.0,0.0,0.0,0.0,0.15,False
618,3,6,2,90.0,0.0,0.0,1.0,0.0,0.0,0.0,...,13.8,1.4,8.0,2.3,1.0,0.12,0.02,0.14,0.47,False
619,4,2,2,90.0,1.0,1.0,1.0,0.0,0.0,0.0,...,54.8,24.1,46.0,12.5,1.0,0.45,0.04,0.49,0.47,True
620,5,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [187]:
# Cast every object-typed column to float64, leaving the other columns as they are.
object_columns = final_data.select_dtypes(include='object').columns
final_data = final_data.astype(dict.fromkeys(object_columns, 'float64'))

In [188]:
# Print the column dtypes and non-null counts to verify the cast above.
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12097 entries, 616 to 12801
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               12097 non-null  int64  
 1   total_points                     12097 non-null  int64  
 2   gameweek                         12097 non-null  int64  
 3   prev_minutes                     12097 non-null  float64
 4   prev_goals_scored                12097 non-null  float64
 5   prev_assists                     12097 non-null  float64
 6   prev_clean_sheets                12097 non-null  float64
 7   prev_goals_conceded              12097 non-null  float64
 8   prev_own_goals                   12097 non-null  float64
 9   prev_penalties_saved             12097 non-null  float64
 10  prev_penalties_missed            12097 non-null  float64
 11  prev_yellow_cards                12097 non-null  float64
 12  prev_red_cards       

In [189]:
# Drop any row that still contains a missing value. Rebinding the name instead
# of using inplace=True keeps the step chainable and free of hidden mutation.
final_data = final_data.dropna()

In [190]:
# Get a handle to the feature store of the logged-in Hopsworks project.
fs = project.get_feature_store() 

In [191]:
# Definition of the feature group that stores one row per player and gameweek;
# the pair (id, gameweek) is the primary key.
player_fg_spec = {
    "name": "player_features",
    "description": "Player data for the Fantasy Premier League",
    "primary_key": ["id", "gameweek"],
    "version": 1,
}

# Fetch the feature group, or create its definition when it does not exist yet.
player_fg = fs.get_or_create_feature_group(**player_fg_spec)

In [192]:
# Upload the backfilled rows to the feature group.
player_fg.insert(final_data)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1159321/fs/1150024/fg/1393552


Uploading Dataframe: 100.00% |██████████| Rows 12097/12097 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: player_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1159321/jobs/named/player_features_1_offline_fg_materialization/executions


(Job('player_features_1_offline_fg_materialization', 'SPARK'), None)

In [193]:
# Human-readable description for every column of the feature group, keyed by
# feature name. Keeping them in one mapping replaces 26 copy-pasted calls and
# makes additions a one-line change.
feature_descriptions = {
    "id": "Player ID",
    "gameweek": "Gameweek",
    "total_points": "Total points of the player in the gameweek (label)",
    "prev_minutes": "Played minutes in the previous gameweek",
    "prev_goals_scored": "Goals scored in the previous gameweek",
    "prev_assists": "Assists in the previous gameweek",
    "prev_clean_sheets": "Clean sheets in the previous gameweek",
    "prev_goals_conceded": "Goals conceded in the previous gameweek",
    "prev_own_goals": "Own goals in the previous gameweek",
    "prev_penalties_saved": "Penalties saved in the previous gameweek",
    "prev_penalties_missed": "Penalties missed in the previous gameweek",
    "prev_yellow_cards": "Yellow cards in the previous gameweek",
    "prev_red_cards": "Red cards in the previous gameweek",
    "prev_saves": "Saves in the previous gameweek",
    "prev_bonus": "Bonus points in the previous gameweek",
    "prev_bps": "Bonus points system in the previous gameweek",
    "prev_influence": "Influence in the previous gameweek",
    "prev_creativity": "Creativity in the previous gameweek",
    "prev_threat": "Threat in the previous gameweek",
    "prev_ict_index": "ICT index in the previous gameweek",
    "prev_starts": "Player started in the previous gameweek",
    "prev_expected_goals": "Expected goals in the previous gameweek",
    "prev_expected_assists": "Expected assists in the previous gameweek",
    "prev_expected_goal_involvements": "Expected goal involvements in the previous gameweek",
    "prev_expected_goals_conceded": "Expected goals conceded in the previous gameweek",
    "prev_in_dreamteam": "Player was in the dreamteam in the previous gameweek",
}

# Apply the descriptions in the order listed above (dicts keep insertion order).
for feature_name, description in feature_descriptions.items():
    player_fg.update_feature_description(feature_name, description)

<hsfs.feature_group.FeatureGroup at 0x16b3d96d0>