# English Football League Score Prediction

## Overview

---------------

In this notebook I will try and attempt to build a model to predict the outcome of english football league games

-----------------

## 1. Setup

### 1.1 Modules

In [8]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from data_processing.historical_data import HistoricaLeague
from data_processing.game_data import LeagueGame

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost
import joblib

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

### 2. Data

In [10]:
historical_league = HistoricaLeague()
historical_league_df = historical_league.get_data()

In [11]:
historical_league_df_reduced = historical_league_df[['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']].copy()

In [12]:
historical_league_df_reduced.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG
0,2024-08-16,Man United,Fulham,1,0
1,2024-08-17,Ipswich,Liverpool,0,2
2,2024-08-17,Arsenal,Wolves,2,0
3,2024-08-17,Everton,Brighton,0,3
4,2024-08-17,Newcastle,Southampton,1,0


In [13]:
def generate_game_data(row):
    league_game = LeagueGame(row['HomeTeam'], row['AwayTeam'], row['Date'], historical_league_df)
    league_game.generate_game_data()
    return [league_game.home_team_data.latest_scores, 
            league_game.home_team_data.home_scores, 
            league_game.home_team_data.away_scores, 
            league_game.away_team_data.latest_scores, 
            league_game.away_team_data.home_scores, 
            league_game.away_team_data.away_scores]



In [14]:
def remove_incomplete_histories_home(df):
    df = df.copy()
    df['home_home_len'] = df['home_home'].str.len()
    df['home_away_len'] = df['home_away'].str.len()
    df['home_latest_len'] = df['home_latest'].str.len()

    df = df[df['home_home_len'] == 5]
    df = df[df['home_away_len'] == 5]
    df = df[df['home_latest_len'] == 5]

    return df.iloc[:, 0:5]

In [15]:
def remove_incomplete_histories_away(df):
    df = df.copy()

    df['away_home_len'] = df['away_home'].str.len()
    df['away_away_len'] = df['away_away'].str.len()
    df['away_latest_len'] = df['away_latest'].str.len()

    df = df[df['away_home_len'] == 5]
    df = df[df['away_away_len'] == 5]
    df = df[df['away_latest_len'] == 5]

    return df.iloc[:, 0:5]

In [16]:
def split_list(list):
    return pd.Series(list)

In [17]:
historical_league_df_reduced[['home_latest', 
                              'home_home', 
                              'home_away', 
                              'away_latest', 
                              'away_home', 
                              'away_away'
                             ]] = historical_league_df_reduced.apply(generate_game_data, 
                                                                     axis=1, 
                                                                     result_type='expand')

In [18]:
home_team_df = historical_league_df_reduced[['HomeTeam', 'FTHG', 'home_latest', 'home_home', 'home_away']].copy()
away_team_df = historical_league_df_reduced[['AwayTeam', 'FTAG', 'away_latest', 'away_home', 'away_away']].copy()

In [19]:
home_team_df = remove_incomplete_histories_home(home_team_df)
away_team_df = remove_incomplete_histories_away(away_team_df)

In [20]:
home_team_df.head()

Unnamed: 0,HomeTeam,FTHG,home_latest,home_home,home_away
0,Man United,1,"[2, 3, 0, 0, 1]","[3, 0, 1, 4, 2]","[2, 0, 2, 3, 1]"
1,Ipswich,0,"[2, 2, 3, 1, 0]","[2, 1, 0, 3, 6]","[2, 3, 0, 1, 1]"
2,Arsenal,2,"[2, 1, 3, 3, 5]","[2, 3, 5, 0, 2]","[1, 3, 2, 3, 0]"
3,Everton,0,"[1, 1, 1, 1, 2]","[1, 1, 2, 2, 1]","[1, 1, 0, 1, 1]"
4,Newcastle,1,"[4, 2, 1, 4, 5]","[1, 5, 4, 1, 4]","[4, 2, 4, 0, 1]"


In [21]:
home_team_df[['latest_0', 'latest_1', 'latest_2', 'latest_3', 'latest_4']] = home_team_df['home_latest'].apply(split_list)
home_team_df[['home_0', 'home_1', 'home_2', 'home_3', 'home_4']] = home_team_df['home_home'].apply(split_list)
home_team_df[['away_0', 'away_1', 'away_2', 'away_3', 'away_4']] = home_team_df['home_away'].apply(split_list)
home_team_df = home_team_df.drop(['HomeTeam', 'home_latest', 'home_home', 'home_away'], axis=1)

In [22]:
home_team_df.head()

Unnamed: 0,FTHG,latest_0,latest_1,latest_2,latest_3,latest_4,home_0,home_1,home_2,home_3,home_4,away_0,away_1,away_2,away_3,away_4
0,1,2,3,0,0,1,3,0,1,4,2,2,0,2,3,1
1,0,2,2,3,1,0,2,1,0,3,6,2,3,0,1,1
2,2,2,1,3,3,5,2,3,5,0,2,1,3,2,3,0
3,0,1,1,1,1,2,1,1,2,2,1,1,1,0,1,1
4,1,4,2,1,4,5,1,5,4,1,4,4,2,4,0,1


In [23]:
away_team_df.head()

Unnamed: 0,AwayTeam,FTAG,away_latest,away_home,away_away
0,Fulham,0,"[4, 0, 0, 1, 1]","[0, 1, 1, 0, 3]","[4, 0, 2, 1, 3]"
1,Liverpool,2,"[2, 3, 4, 2, 0]","[2, 4, 0, 3, 2]","[3, 2, 0, 3, 2]"
2,Wolves,0,"[0, 1, 1, 2, 0]","[1, 2, 0, 0, 1]","[0, 1, 2, 1, 0]"
3,Brighton,3,"[0, 1, 1, 1, 0]","[0, 1, 1, 0, 0]","[1, 0, 1, 0, 1]"
4,Southampton,0,"[2, 0, 0, 1, 3]","[0, 3, 3, 2, 1]","[2, 0, 1, 0, 2]"


In [24]:
away_team_df[['latest_0', 'latest_1', 'latest_2', 'latest_3', 'latest_4']] = away_team_df['away_latest'].apply(split_list)
away_team_df[['home_0', 'home_1', 'home_2', 'home_3', 'home_4']] = away_team_df['away_latest'].apply(split_list)
away_team_df[['away_0', 'away_1', 'away_2', 'away_3', 'away_4']] = away_team_df['away_latest'].apply(split_list)
away_team_df = away_team_df.drop(['AwayTeam', 'away_latest', 'away_home', 'away_away'], axis=1)

In [25]:
away_team_df.head()

Unnamed: 0,FTAG,latest_0,latest_1,latest_2,latest_3,latest_4,home_0,home_1,home_2,home_3,home_4,away_0,away_1,away_2,away_3,away_4
0,0,4,0,0,1,1,4,0,0,1,1,4,0,0,1,1
1,2,2,3,4,2,0,2,3,4,2,0,2,3,4,2,0
2,0,0,1,1,2,0,0,1,1,2,0,0,1,1,2,0
3,3,0,1,1,1,0,0,1,1,1,0,0,1,1,1,0
4,0,2,0,0,1,3,2,0,0,1,3,2,0,0,1,3


## Model

In [27]:
def test_score(predictions, true_values):
    rounded_results = list(np.around(np.array(predictions),0))
    df = pd.DataFrame()
    df['predicted_scores'] = rounded_results
    df['true_values'] = true_values
    df['diff'] = df['predicted_scores'] - df['true_values']
    return len(df[df['diff'] != 0])/len(df)

In [28]:
def run_model(model_class, X_train, y_train, X_test, y_test):
    model = model_class
    model.fit(X_train, y_train)
    results = model.predict(X_test)
    acc = test_score(results, list(y_test))
    return model, acc

In [29]:
def process_model_data(home_team_df, away_team_df):
    y_home = home_team_df['FTHG']
    X_home = np.array(home_team_df.drop('FTHG',axis=1))
    y_away = away_team_df['FTAG']
    X_away = np.array(away_team_df.drop('FTAG',axis=1))

    X_train_home, X_test_home, y_train_home, y_test_home = train_test_split(X_home, y_home, 
                                                                            test_size=0.2, 
                                                                            random_state=55)
    print('Home Split:')
    print(f'Train: {len(X_train_home)}')
    print(f'Test: {len(X_test_home)}')
    print('\n')

    X_train_away, X_test_away, y_train_away, y_test_away = train_test_split(X_away, y_away, 
                                                                            test_size=0.2,
                                                                            random_state=55)

    print('Away Split:')
    print(f'Train:: {len(X_train_away)}')
    print(f'Test: {len(X_test_away)}')
    
    return X_train_home, X_test_home, y_train_home, y_test_home, X_train_away, X_test_away, y_train_away, y_test_away

In [30]:
def run_model_prediction(model_class, home_team_df, away_team_df):

    X_train_home, X_test_home, y_train_home, y_test_home, X_train_away, X_test_away, y_train_away, y_test_away = process_model_data(home_team_df, away_team_df)
    

    home_model, home_acc = run_model(model_class,
                                     X_train_home,
                                     y_train_home,
                                     X_test_home,
                                     y_test_home)
    print('Home Model Accurracy : ' + str(home_acc))


    away_model, away_acc = run_model(model_class,
                                     X_train_away,
                                     y_train_away,
                                     X_test_away,
                                     y_test_away)
    print('Away Model Accurracy : ' + str(away_acc))
    return home_model, away_model



### Linear Regression

In [32]:
model_class = LinearRegression()              
home_model, away_model = run_model_prediction(model_class, home_team_df, away_team_df)

Home Split:
Train: 3965
Test: 992


Away Split:
Train:: 3965
Test: 992
Home Model Accurracy : 0.6653225806451613
Away Model Accurracy : 0.6310483870967742


In [34]:
linear_check = input()

 no


In [None]:
if linear_check.lower() == 'yes':
   joblib.dump(home_model, '../score_models/league/linear/home_model.joblib')
   joblib.dump(away_model, '../score_models/league/linear/away_model.joblib')

#### XGBOOST

In [36]:
model_class = xgboost.XGBRegressor()              
home_model, away_model = run_model_prediction(model_class, home_team_df, away_team_df)

Home Split:
Train: 3965
Test: 992


Away Split:
Train:: 3965
Test: 992
Home Model Accurracy : 0.71875
Away Model Accurracy : 0.6491935483870968


In [38]:
xgboost_check = input()

 no


In [None]:
if xgboost_check.lower() == 'yes':
   joblib.dump(home_model, '../score_models/league/xgboost/home_model.joblib')
   joblib.dump(away_model, '../score_models/league/xgboost/away_model.joblib')

### Away Team

### Tensor Flow

In [40]:
X_train_home, X_test_home, y_train_home, y_test_home, X_train_away, X_test_away, y_train_away, y_test_away = process_model_data(home_team_df, away_team_df)

Home Split:
Train: 3965
Test: 992


Away Split:
Train:: 3965
Test: 992


In [42]:
def build_and_compile_model(norm, optimiser_value):
  model = keras.Sequential([
      norm,
      layers.Dense(64, activation='relu'),
      layers.Dense(64, activation='relu'),
      layers.Dense(1)
  ])

  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(optimiser_value))
  return model

In [44]:
def test_score_tensor(predictions, true_values):
    rounded_results = list(np.around(np.array(predictions),0))
    rounded_results = [0.0 if n <= 0 else n for n in rounded_results]  
    df = pd.DataFrame()
    df['predicted_scores'] = rounded_results
    df['true_values'] = list(true_values)
    df['diff'] = df['predicted_scores'] - df['true_values']
    return len(df[df['diff'] != 0])/len(df)

In [46]:
def run_tensor_regressor(X_train, y_train, X_test, y_test, epoch=100, verbose=0, validation_split=0.2, optimiser_value=0.001):

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    y_train = pd.DataFrame(y_train)

    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer.adapt(np.array(X_train))
    normalizer.mean.numpy()

    dnn_model = build_and_compile_model(normalizer, optimiser_value)

    history = dnn_model.fit(
        X_train,
        y_train,
        validation_split=0.2,
        verbose=0, epochs=epoch)


    test_predictions = dnn_model.predict(X_test).flatten()
    acc = test_score_tensor(test_predictions, y_test)
    print(f'Accuracy : {acc}' )
    return dnn_model

    

In [48]:
home_model = run_tensor_regressor(X_train_home, y_train_home, X_test_home, y_test_home, epoch=200, optimiser_value=0.0045)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Accuracy : 0.7358870967741935


In [62]:
away_model = run_tensor_regressor(X_train_away, y_train_away, X_test_away, y_test_away, epoch=200, optimiser_value=0.001)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Accuracy : 0.6673387096774194


In [64]:
tensor_check = input()

 yes


In [68]:
if tensor_check.lower() == 'yes':
   joblib.dump(home_model, '../score_models/league/tensorflow/home_model.joblib')
   joblib.dump(away_model, '../score_models/league/tensorflow/away_model.joblib')