# Step 1 – Backfill

### Setting up Hopsworks

In [185]:
import hopsworks
import os
import datetime

In [None]:
# Load the Hopsworks API key from a local file when one is present and export
# it through the HOPSWORKS_API_KEY environment variable.
try:
    with open('./hopsworks/hopsworks-api-key.txt', 'r') as file:
        os.environ["HOPSWORKS_API_KEY"] = file.read().rstrip()
except OSError:
    # Only a missing or unreadable key file is expected here. Anything else
    # (KeyboardInterrupt, programming errors) must propagate instead of being
    # silently reported as "production mode".
    # NOTE(review): presumably hopsworks.login() then picks up credentials
    # that the environment already provides — confirm.
    print("In production mode")

# Log in to Hopsworks and print the description of the connected project.
project = hopsworks.login()
print(project.description)

### Getting historical data

In [187]:
import requests
import pandas as pd
import numpy as np
from functions import util
from importlib import reload

In [188]:
# Fantasy Premier League API endpoints used by the cells below.

# Templated endpoints: the {placeholders} are filled in with str.format().
player_stats_gw_url = "https://fantasy.premierleague.com/api/event/{gw}/live/"
player_details_url = "https://fantasy.premierleague.com/api/element-summary/{element_id}"

# Fixed endpoints.
# NOTE(review): the query parameter below is spelled "futures"; community
# write-ups of this API usually show "future=1" — verify that this spelling
# really restricts the response to upcoming fixtures.
fixtures_url = "https://fantasy.premierleague.com/api/fixtures?futures=1"
bootstrap_url = "https://fantasy.premierleague.com/api/bootstrap-static/"

## Fetching general data from https://fantasy.premierleague.com/api/bootstrap-static/

### From the general data, we take general information about the players, teams, and events (gameweeks).

In [189]:
# Download the bootstrap payload and decode it from JSON.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it surface
# later as a JSON decoding error or a missing key.
bootstrap_response = requests.get(bootstrap_url, timeout=30)
bootstrap_response.raise_for_status()
general_data = bootstrap_response.json()

In [190]:
# Download the fixtures payload and decode it from JSON.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it surface
# later as a JSON decoding error.
fixtures_response = requests.get(fixtures_url, timeout=30)
fixtures_response.raise_for_status()
upcoming_fixtures_data = fixtures_response.json()

# Backfill player stats for each game week

### Init vars

In [191]:
# Backfill state: one DataFrame per fetched gameweek is collected in the list,
# and the counter holds the gameweek to fetch next (starting with the first).
all_gameweek_data = list()
gameweek = 1

In [None]:
# Build the fixtures table and discard fixtures without an "event" value.
upcoming_fixtures = pd.DataFrame(upcoming_fixtures_data).dropna(subset=["event"])
upcoming_fixtures

In [None]:
# Tabulate the "elements" section of the bootstrap data (one row per entry)
# and display it.
general_stats = pd.DataFrame(data=general_data["elements"])
general_stats

### Loop through each gameweek and fetch player stats

### Finally, concatenate everything into a single DataFrame

In [None]:
# Display the "element_type" column of the general stats table.
general_stats["element_type"]

In [None]:
# Backfill loop: starting from the current value of `gameweek`, fetch the live
# stats of each gameweek until the cap is reached or the API returns no
# elements, then combine everything into `all_gameweeks_df`.

# Last gameweek that is backfilled.
# NOTE(review): the original comment says fetching should stop once the
# gameweek is "in the future", which suggests this cap was meant to be
# `current_gameweek` — confirm before replacing the constant.
LAST_BACKFILL_GAMEWEEK = 18

# Get the current gameweek from the helper function. The value does not depend
# on the loop variable, so it is looked up once rather than on every iteration.
current_gameweek = util.get_gameweek_from_date(datetime.datetime.now())

while True:
    # Check the cap before touching the network so no request is spent on a
    # gameweek whose data would be discarded anyway.
    if gameweek > LAST_BACKFILL_GAMEWEEK:
        print(f"No more data available after gameweek {gameweek - 1}")
        break

    # The timeout prevents a stalled connection from hanging the cell, and
    # raise_for_status() turns an HTTP error into an immediate, readable
    # failure instead of a missing "elements" key below.
    player_stats_response = requests.get(
        player_stats_gw_url.format(gw=gameweek), timeout=30
    )
    player_stats_response.raise_for_status()
    player_stats = player_stats_response.json()

    # An empty "elements" list is treated as the end of the available data.
    if not player_stats["elements"]:
        print(f"No more data available after gameweek {gameweek - 1}")
        break

    # Create a DataFrame from the player stats data and expand the nested
    # "stats" mapping of every row into one column per stat.
    player_stats_gw_df = pd.DataFrame(player_stats["elements"]).drop(columns=["explain", "modified"])
    player_stats_gw_df = pd.concat(
        [player_stats_gw_df.drop(columns=["stats"]), player_stats_gw_df["stats"].apply(pd.Series)],
        axis=1
    )

    # Tag every row with the gameweek it belongs to.
    player_stats_gw_df["gameweek"] = gameweek

    all_gameweek_data.append(player_stats_gw_df)

    print(f"Fetched data for gameweek {gameweek}")

    gameweek += 1


# Stack the per-gameweek frames. The per-gameweek score is renamed to "points"
# so that the "total_points" name is free for other use.
all_gameweeks_df = pd.concat(all_gameweek_data, ignore_index=True)
all_gameweeks_df.rename(columns={'total_points': 'points'}, inplace=True)

# IDs of the entries flagged as not selectable in the general stats table.
to_drop = general_stats[general_stats["can_select"] == False]["id"].tolist()

# Keep only the rows whose ID is not in that list.
all_gameweeks_df = all_gameweeks_df[~all_gameweeks_df["id"].isin(to_drop)]

print("All gameweek data fetched and combined.")

## Enrich gameweek rows with player history

In [None]:
# Show the first rows of the combined gameweek table as a sanity check.
all_gameweeks_df.head()

In [None]:
# Collect player total points and per-gameweek history values.
#
# Iterate over the IDs that are actually present in all_gameweeks_df. Counting
# from 1 to the number of unique IDs only works when the IDs are exactly 1..N;
# with gaps in the ID sequence (rows of some IDs are filtered out of this
# table earlier) the highest IDs would never be visited and absent IDs would
# still be requested.
for player_id in all_gameweeks_df['id'].unique():

    # Get player details. The timeout prevents a stalled connection from
    # hanging the cell, and raise_for_status() reports HTTP errors right here
    # instead of as a missing "history" key.
    player_details_response = requests.get(
        player_details_url.format(element_id=player_id), timeout=30
    )
    player_details_response.raise_for_status()
    player_details = player_details_response.json()

    player_details_history_df = pd.DataFrame(player_details["history"])

    # Running sum of the points over the history rows processed so far.
    total_points = 0

    for _, row in player_details_history_df.iterrows():
        # Find matching rows in all_gameweeks_df: same ID, and gameweek equal
        # to the "round" of this history row.
        condition = (all_gameweeks_df['id'] == player_id) & \
                    (all_gameweeks_df['gameweek'] == row['round'])

        total_points += row['total_points']

        # Store the cumulative total and copy the remaining history values.
        all_gameweeks_df.loc[condition, 'total_points'] = total_points
        all_gameweeks_df.loc[condition, 'selected'] = row['selected']
        all_gameweeks_df.loc[condition, 'transfers_balance'] = row['transfers_balance']
        all_gameweeks_df.loc[condition, 'value'] = row['value']
        all_gameweeks_df.loc[condition, 'was_home'] = row['was_home']
        all_gameweeks_df.loc[condition, 'opponent_team'] = row['opponent_team']

In [198]:
# Columns that are replaced by their value from the previous row of the same
# ID, so that only past information remains next to the 'points' column.
lagged_features = [
    'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
    'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
    'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat',
    'ict_index', 'starts', 'expected_goals', 'expected_assists',
    'expected_goal_involvements', 'expected_goals_conceded', 'in_dreamteam',
    'total_points', 'selected', 'transfers_balance', 'value', 'was_home',
    'opponent_team',
]

# Group the rows by ID once, then add one 'prev_<column>' column per feature
# holding that feature shifted down by one row within its group.
rows_by_player = all_gameweeks_df.groupby('id')
all_gameweeks_df = all_gameweeks_df.assign(
    **{f'prev_{name}': rows_by_player[name].shift(1) for name in lagged_features}
)

# Attach 'element_type' from the general stats table; a left join keeps every
# gameweek row.
position_lookup = general_stats[["id", "element_type"]]
all_gameweeks_df = all_gameweeks_df.merge(position_lookup, how="left", on="id")

# Only the lagged copies are kept; the un-shifted originals are removed.
all_gameweeks_df = all_gameweeks_df.drop(columns=lagged_features)

# Discard rows with a missing value in any lagged column or in 'points'.
required_columns = [f'prev_{name}' for name in lagged_features]
required_columns.append('points')
final_data = all_gameweeks_df.dropna(subset=required_columns)

In [None]:
# Show the first rows of the lagged feature table.
final_data.head()

In [200]:
# Convert every column that is still of object dtype to float64.
object_columns = final_data.select_dtypes(include='object').columns
final_data = final_data.astype(dict.fromkeys(object_columns, 'float64'))

In [None]:
# Print the column dtypes and non-null counts to check the conversion.
final_data.info()

In [202]:
# Remove every row that still has a missing value in any column.
final_data = final_data.dropna()

In [203]:
# Get the feature store handle of the logged-in project.
fs = project.get_feature_store()

In [204]:
# Settings of the feature group: rows are keyed by the (id, gameweek) pair.
player_fg_settings = {
    "name": "player_features",
    "description": "Player data for the Fantasy Premier League",
    "primary_key": ["id", "gameweek"],
    "version": 1,
}

# Fetch the feature group, or create it when it does not exist yet.
player_fg = fs.get_or_create_feature_group(**player_fg_settings)

In [None]:
# Insert the prepared rows into the feature group.
player_fg.insert(final_data)

In [None]:
# Description of every feature, in the order in which they are registered.
feature_descriptions = {
    "id": "Player ID",
    "gameweek": "Gameweek",
    "points": "Total points of the player in the gameweek (label)",
    "prev_total_points": "Total points accumulated by player up until the previous gameweek",
    "prev_minutes": "Played minutes in the previous gameweek",
    "prev_goals_scored": "Goals scored in the previous gameweek",
    "prev_assists": "Assists in the previous gameweek",
    "prev_clean_sheets": "Clean sheets in the previous gameweek",
    "prev_goals_conceded": "Goals conceded in the previous gameweek",
    "prev_own_goals": "Own goals in the previous gameweek",
    "prev_penalties_saved": "Penalties saved in the previous gameweek",
    "prev_penalties_missed": "Penalties missed in the previous gameweek",
    "prev_yellow_cards": "Yellow cards in the previous gameweek",
    "prev_red_cards": "Red cards in the previous gameweek",
    "prev_saves": "Saves in the previous gameweek",
    "prev_bonus": "Bonus points in the previous gameweek",
    "prev_bps": "Bonus points system in the previous gameweek",
    "prev_influence": "Influence in the previous gameweek",
    "prev_creativity": "Creativity in the previous gameweek",
    "prev_threat": "Threat in the previous gameweek",
    "prev_ict_index": "ICT index in the previous gameweek",
    "prev_starts": "Player started in the previous gameweek",
    "prev_expected_goals": "Expected goals in the previous gameweek",
    "prev_expected_assists": "Expected assists in the previous gameweek",
    "prev_expected_goal_involvements": "Expected goal involvements in the previous gameweek",
    "prev_expected_goals_conceded": "Expected goals conceded in the previous gameweek",
    "prev_selected": "Amount of players that selected the player in the previous gameweek",
    "prev_transfers_balance": "Sum of transfers in and out in the previous gameweek",
    "prev_value": "Value of the player up until the previous gameweek",
    "prev_was_home": "Whether the player played at home in the previous gameweek",
    "prev_opponent_team": "Opponent team in the previous gameweek",
    "element_type": "Player position",
}

# Register the descriptions one by one on the feature group.
for feature_name, feature_text in feature_descriptions.items():
    player_fg.update_feature_description(feature_name, feature_text)