In [None]:
import calendar
from datetime import datetime, timedelta

import black
import pandas as pd
import seaborn as sns
from dateutil import parser
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (GradientBoostingClassifier,
                              GradientBoostingRegressor,
                              RandomForestClassifier, RandomForestRegressor)
# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, make_scorer,  # regression metrics
                             mean_squared_error, r2_score)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC, SVR

import structure_data
from football_api import (filter_games, get_fixture_statistics, get_game_ids,
                          parse_scores)

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option("display.max_columns", None)

# 1. Future games (round 3)

In [None]:
# Games of the 2025 season; use_api=False presumably reads a locally stored
# copy instead of calling the API — TODO confirm in structure_data.
future_games = structure_data.get_all_games(year=2025, use_api=False)
# Team whose games are summarised, modelled and predicted in the cells below.
SELECTED_TEAM = "Arsenal"

In [None]:
def parse_date(input_date):
    """formats a fixture date as a weekday name or as a numeric date

    Parameters
    ----------
    input_date
        ISO 8601 date time string (parsed with dateutil's ``isoparse``)

    Returns
    -------
        weekday name (e.g. "Saturday") when the date lies between now and
        7 days from now (both ends included), otherwise month/day/year
        without zero padding
    """
    kickoff = parser.isoparse(input_date)
    time_until = kickoff - datetime.now().astimezone()
    # dates in the past or more than a week away get the numeric form
    if time_until < timedelta(0) or time_until > timedelta(days=7):
        return f"{kickoff.month}/{kickoff.day}/{kickoff.year}"
    # otherwise the day of week is enough
    return calendar.day_name[kickoff.weekday()]

In [None]:
def get_future_games(current_year_games, limit=10):
    """get future games

    Parameters
    ----------
    current_year_games
        all current year games
    limit
        maximum number of games to return (default 10); ``None`` returns
        every game that has not started yet

    Returns
    -------
        dataframe: home, away, round, date, id (the columns are present even
        when no game qualifies)
    """
    columns = ["home", "away", "round", "date", "id"]
    upcoming = []
    for game in current_year_games:
        # stop as soon as enough games are collected so that dates of games
        # that would be cut off anyway are never parsed
        if limit is not None and len(upcoming) >= limit:
            break
        if game["fixture"]["status"]["long"] != "Not Started":
            continue
        upcoming.append(
            {
                "home": game["teams"]["home"]["name"],
                "away": game["teams"]["away"]["name"],
                "round": game["league"]["round"],
                "date": parse_date(game["fixture"]["date"]),
                "id": game["fixture"]["id"],
            }
        )
    # explicit columns keep the schema stable for an empty result
    return pd.DataFrame(upcoming, columns=columns)

In [None]:
# Preview the upcoming fixtures (the helper returns at most 10 rows by default).
get_future_games(future_games)

# 2. Predictions (round 3)
## a. create df

In [None]:
def parse_additional_stats(fixture_statistics, team_name, ignore_cols=None):
    """parse additional stats of one team out of the fixture statistics

    Parameters
    ----------
    fixture_statistics
        fixture statistics: one entry per team, with the team name under
        ``["team"]["name"]`` and the stats under ``["statistics"]`` as
        ``{"type": ..., "value": ...}`` records
    team_name
        team name
    ignore_cols
        normalized statistic names to leave out; defaults to
        ``["goals_prevented"]``

    Returns
    -------
        dictionary: normalized statistic name -> value (missing values become
        0, strings such as "61%" become floats, other values are kept as is)
    """
    if ignore_cols is None:
        ignore_cols = ("goals_prevented",)
    ignored = set(ignore_cols)

    additional_stats = {}
    for team_stat in fixture_statistics:
        if team_stat["team"]["name"] != team_name:
            continue
        for stat in team_stat["statistics"]:
            # "Shots on Goal" -> "shots_on_goal", "Passes %" -> "passes"
            key_stat = "_".join(stat["type"].split()).lower().rstrip("_%")
            # skip ignored statistics before touching their value, so an
            # unparsable value in an ignored column cannot raise
            if key_stat in ignored:
                continue
            val_stat = stat["value"]
            if val_stat is None:
                val_stat = 0
            elif isinstance(val_stat, str):
                val_stat = float(val_stat.rstrip("%"))
            additional_stats[key_stat] = val_stat
    return additional_stats


def generate_game_summary(team_games, ids, team_name):
    """summarises every listed game from the point of view of one team

    Parameters
    ----------
    team_games
        all team games
    ids
        game ids
    team_name
        team name

    Returns
    -------
        dictionary keyed by str(game id); every value holds is_home, scored,
        conceded, scored_half, conceded_half, result, opponent_name and the
        statistics returned by parse_additional_stats
    """
    summaries = {}
    for fixture_id in ids:
        match = filter_games(team_games, game_id=fixture_id)[0]
        played_at_home = match["teams"]["home"]["name"] == team_name
        # keys of the selected team's side and of the opposing side
        if played_at_home:
            own_side, other_side = "home", "away"
        else:
            own_side, other_side = "away", "home"

        # full-time and half-time goals, then the result derived from them
        goals_for = match["goals"][own_side]
        goals_against = match["goals"][other_side]
        half_for = match["score"]["halftime"][own_side]
        half_against = match["score"]["halftime"][other_side]
        outcome = parse_scores(scored=goals_for, conceded=goals_against, num=True)
        rival = match["teams"][other_side]["name"]

        # per-fixture statistics of the selected team
        fixture_stats = get_fixture_statistics(fixture_id=fixture_id, use_api=False)
        extra_stats = parse_additional_stats(
            fixture_statistics=fixture_stats, team_name=team_name
        )

        core = {
            "is_home": played_at_home,
            "scored": goals_for,
            "conceded": goals_against,
            "scored_half": half_for,
            "conceded_half": half_against,
            "result": outcome,
            "opponent_name": rival,
        }
        # statistics win over the core fields on a key clash
        summaries[str(fixture_id)] = {**core, **extra_stats}

    return summaries

## b. get performance over the n prior games

In [None]:
def get_rolling_sum(df, n_prior, cols):
    """get rolling sum

    Parameters
    ----------
    df
        dataframe
    n_prior
        number of trailing rows to sum
    cols
        column names list

    Returns
    -------
        dictionary: "n_{col}" -> sum of that column over the last
        ``n_prior`` rows of ``df`` (0 when ``df`` has no rows)
    """
    recent_rows = df.tail(n_prior)
    return {f"n_{col}": recent_rows[col].sum() for col in cols}


def apply_rolling_sum_prior(row, df, cols, n_prior=5):
    """calls get_rolling_sum on the rows located before ``row`` in ``df``

    "Before" is decided by row position, not by comparing index values:
    the index holds game ids (possibly as strings), and comparing those
    compares text ("99" > "100"), which says nothing about the order in
    which the games are stored.

    Parameters
    ----------
    row
        dataframe row; its ``name`` must be a label of ``df.index``
    df
        dataframe whose rows are already in the desired (chronological) order
    cols
        column names list
    n_prior
        number of prior rows to sum

    Returns
    -------
        series: n_{col}

    Raises
    ------
    KeyError
        when ``row.name`` is not a label of ``df.index``
    """
    position = df.index.get_loc(row.name)
    if not isinstance(position, int):
        # anything but a plain int (e.g. a slice or mask for duplicated
        # labels): locate the first occurrence explicitly
        position = df.index.tolist().index(row.name)
    stats = get_rolling_sum(
        df.iloc[:position], n_prior=n_prior, cols=cols
    )  # only past matches
    return pd.Series(stats)

## c. concat multi-season df

In [None]:
def generate_team_report(season_games, team_name, cols_features=None, n_prior=5):
    """generate team report

    Parameters
    ----------
    season_games
        all games in season
    team_name
        team name
    cols_features
        statistic columns summed over the prior games; defaults to
        shots_on_goal, total_shots, shots_insidebox and yellow_cards
    n_prior
        number of prior games summed for every feature (default 5)

    Returns
    -------
        dataframe: one row per game of the team, indexed by game id, with an
        "n_{col}" column per feature plus the game's own "scored" column;
        an empty dataframe with those columns when the team has no games

    """
    if cols_features is None:
        # other statistic names left disabled in the original feature list:
        # scored, conceded, scored_half, conceded_half, result,
        # shots_off_goal, blocked_shots, shots_outsidebox, fouls,
        # corner_kicks, offsides, ball_possession, red_cards,
        # goalkeeper_saves, total_passes, passes_accurate, passes,
        # expected_goals
        cols_features = [
            "shots_on_goal",
            "total_shots",
            "shots_insidebox",
            "yellow_cards",
        ]
    cols_features = list(cols_features)
    # columns copied unchanged from the game summary (the prediction target)
    keep = ["scored"]

    # filter team games
    team_games = filter_games(games=season_games, team_name=team_name)
    # get game ids in ascending order of timestamp
    game_ids = get_game_ids(team_games)
    # generates game summaries:  home team, scores, game results, opponent name
    game_summaries = generate_game_summary(
        team_games=team_games, ids=game_ids, team_name=team_name
    )
    if not game_summaries:
        # no games for this team: return an empty frame with the usual columns
        # instead of failing on missing columns further down
        return pd.DataFrame(columns=[f"n_{col}" for col in cols_features] + keep)

    game_results = pd.DataFrame(
        list(game_summaries.values()), index=list(game_summaries.keys())
    )

    # for every game, sum each feature over the games that precede it
    prior_df = game_results.apply(
        lambda row: apply_rolling_sum_prior(
            row, game_results, cols_features, n_prior=n_prior
        ),
        axis=1,
    )
    prior_df[keep] = game_results[keep]

    return prior_df


# build one team report per season, then stack them into a single frame
seasons = []
for year in range(2016, 2025):  # seasons 2016 through 2024
    # all Premier League games of that season (from API or disk)
    games = structure_data.get_all_games(year=year, use_api=False)
    seasons.append(generate_team_report(games, SELECTED_TEAM))
# merge all seasons
pre_df = pd.concat(seasons)

## d. select features for ML pipeline

In [None]:
# Y: goals scored in each game
y = pre_df["scored"]
# X: every remaining column (the "n_" sums over the prior games)
X = pre_df.drop(columns=["scored"])
# pairwise correlation between the features
sns.heatmap(X.corr())

## e. prediction pipeline

In [None]:
# Hold out 33% of the games for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Scaling/No Scaling preprocessing applied to every feature column
col_name = X.columns.tolist()
scale_prior = ColumnTransformer(transformers=[("num", StandardScaler(), col_name)])
pass_only = ColumnTransformer(transformers=[("num", "passthrough", col_name)])

# Classification pipelines: the number of goals scored is treated as a class.
# The randomized estimators are seeded so that a fresh run reproduces the scores.
pipelines = {
    "random_forest": Pipeline(
        steps=[("prep", scale_prior), ("m", RandomForestClassifier(random_state=42))]
    ),
    "logistic_regression": Pipeline(
        steps=[("prep", scale_prior), ("m", LogisticRegression())]
    ),
    "support_vector": Pipeline(steps=[("prep", scale_prior), ("m", SVC())]),
    "k_nearest_neighbors": Pipeline(
        steps=[("prep", scale_prior), ("m", KNeighborsClassifier())]
    ),
    # gradient boosting is fed the unscaled features
    "gradient_boosting": Pipeline(
        steps=[
            ("prep", pass_only),
            ("m", GradientBoostingClassifier(random_state=42)),
        ]
    ),
}

# fit every pipeline and record its accuracy on the held-out games
results = []
fitted = {}
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    fitted[name] = pipe
    # classifiers already return class labels, so no rounding is needed
    y_pred = pipe.predict(X_test)
    score = accuracy_score(y_test, y_pred)

    results.append((name, score))
results

In [None]:
# Same hold-out split as for the classifiers (same seed, same proportions).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Scaling/No Scaling preprocessing applied to every feature column
col_name = X.columns.tolist()
scale_prior = ColumnTransformer(transformers=[("num", StandardScaler(), col_name)])
pass_only = ColumnTransformer(transformers=[("num", "passthrough", col_name)])

# Regression pipelines: the number of goals scored is predicted as a number.
# The randomized estimators are seeded so that a fresh run reproduces the scores.
pipelines = {
    "random_forest": Pipeline(
        steps=[("prep", scale_prior), ("m", RandomForestRegressor(random_state=42))]
    ),
    "linear_regression": Pipeline(
        steps=[("prep", scale_prior), ("m", LinearRegression())]
    ),
    # support vector regression (SVR, not the SVC classifier) to match the
    # other regressors and the mean-squared-error metric below
    "support_vector": Pipeline(steps=[("prep", scale_prior), ("m", SVR())]),
    "k_nearest_neighbors": Pipeline(
        steps=[("prep", scale_prior), ("m", KNeighborsRegressor())]
    ),
    # gradient boosting is fed the unscaled features
    "gradient_boosting": Pipeline(
        steps=[
            ("prep", pass_only),
            ("m", GradientBoostingRegressor(random_state=42)),
        ]
    ),
}

# fit every pipeline and record its mean squared error on the held-out games
results = []
fitted = {}
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    fitted[name] = pipe
    y_pred = pipe.predict(X_test)
    # goals are whole numbers, so predictions are rounded before scoring
    y_pred = y_pred.round()
    score = mean_squared_error(y_test, y_pred)

    results.append((name, score))
results

In [None]:
# Build the feature rows from the first 20 entries of the 2025 game list and
# print, for every fitted pipeline, its rounded prediction for the last row.
future_games = structure_data.get_all_games(year=2025, use_api=False)
prediction = generate_team_report(
    season_games=future_games[:20], team_name=SELECTED_TEAM
)
for name, pipe in pipelines.items():
    print(f"{name}, Prediction:{pipe.predict(prediction).round()[-1]}")

In [None]:
# NOTE(review): the only call in the visible cells with use_api=True —
# presumably a live API request for one hard-coded fixture id; confirm it is
# still needed before running the whole notebook.
get_fixture_statistics(fixture_id=1378979, use_api=True)