In [1]:
# import necessary packages
import time
import pandas as pd
import numpy as np

# import selenium dependencies
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium import webdriver

from webdriver_manager.chrome import ChromeDriverManager
from IPython.display import display, Image

# import beautifulsoup for parsing web html
from bs4 import BeautifulSoup

# import ML dependencies
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb

In [2]:
# Build the Chrome session (`b`) that every scraping cell drives.
# Headless mode is intentionally off; add "--headless" to the tuple to enable it.
options = Options()
for chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)

# the Service is built from whatever ChromeDriverManager().install() returns
service = Service(ChromeDriverManager().install())
b = webdriver.Chrome(service=service, options=options)

In [3]:
# online databases to scrape
# fbref page: opened with b.get(...) to read the league table
fbref_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
# goal.com fixtures/results page: opened with b.get(...) to read per-matchday games
epl_sched_url = "https://www.goal.com/en-us/premier-league/fixtures-results/2kwbbcootiqqgmrzs6o5inle5"

In [4]:
# useful for debugging (get a snapshot of the screen)
def show_screen(width, height):
    """Resize the browser window, then screenshot it and display the image inline.

    Parameters
    ----------
    width, height
        Window size passed to ``b.set_window_size``.

    Side effects: resizes the shared driver ``b`` and overwrites ``out.png``
    in the current working directory.
    """
    # resize BEFORE capturing so the screenshot reflects the requested size
    b.set_window_size(width, height)
    b.save_screenshot("out.png")
    display(Image("out.png"))

In [5]:
# Scrape the fbref league table into `df`.
# The resulting frame is deliberately left "raw": it contains an empty first row and the
# cell values sit one column to the left of their headers (see the printed output); the
# cleaning cell further down relies on exactly that layout.
b.get(fbref_url) # open connection to fbref
time.sleep(5)
b.execute_script("window.scrollBy(0, 700);") # scroll down
time.sleep(3)
#show_screen(500,500)
epl_table = WebDriverWait(b, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "#results2024-202591_overall")) # wait (up to 10 s) until the table element is present in the DOM; presence does not imply visibility
)
rows = epl_table.find_elements(By.TAG_NAME, "tr") # extract rows of table
table_data = []

# if first row contains headers, extract headers separately
headers = []
header_row = rows[0].find_elements(By.TAG_NAME, "th")
if header_row:
    headers = [header.text for header in header_row]

header_cols = len(headers) if headers else 0

# iterate through each row
# NOTE: rows[0] (the header row) is iterated as well; when it has no <td> cells an empty
# list is appended, which shows up as the all-None row 0 in the printed frame.
for row in rows:
    # get all columns in this row
    # only <td> cells are collected here, while the headers above come from <th> cells
    # NOTE(review): presumably the leading cell of each body row is not a <td>, which would
    # explain why values appear shifted one column left of their headers — verify on the page
    cols = row.find_elements(By.TAG_NAME, "td")

    if cols:
        while len(cols) < header_cols:
            cols.append('')  # add empty string to match the header length

    # extract the text from each column (check for webelement objects)
    # padding entries are plain '' strings, so they are passed through unchanged
    cols_data = [col.text if isinstance(col, webdriver.remote.webelement.WebElement) else col for col in cols]

    # add row data to the table data list
    table_data.append(cols_data)

# create pandas dataframe
df = pd.DataFrame(table_data, columns=headers if headers else None)
print(df)

                 Rk Squad    MP     W     D     L    GF    GA    GD   Pts  \
0              None  None  None  None  None  None  None  None  None  None   
1         Liverpool    29    21     7     1    69    27   +42    70  2.41   
2           Arsenal    28    15    10     3    52    24   +28    55  1.96   
3   Nott'ham Forest    28    15     6     7    45    33   +12    51  1.82   
4           Chelsea    28    14     7     7    53    36   +17    49  1.75   
5   Manchester City    28    14     5     9    53    38   +15    47  1.68   
6     Newcastle Utd    28    14     5     9    47    38    +9    47  1.68   
7          Brighton    28    12    10     6    46    40    +6    46  1.64   
8       Aston Villa    29    12     9     8    41    45    -4    45  1.55   
9       Bournemouth    28    12     8     8    47    34   +13    44  1.57   
10           Fulham    28    11     9     8    41    38    +3    42  1.50   
11   Crystal Palace    28    10     9     9    36    33    +3    39  1.39   

In [6]:
# preview the first rows of the raw scraped frame
df.head()

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Last 5,Attendance,Top Team Scorer,Goalkeeper,Notes
0,,,,,,,,,,,,,,,,,,,,
1,Liverpool,29.0,21.0,7.0,1.0,69.0,27.0,42.0,70.0,2.41,65.0,25.1,39.8,1.37,W D W W W,60300.0,Mohamed Salah - 27,Alisson,,
2,Arsenal,28.0,15.0,10.0,3.0,52.0,24.0,28.0,55.0,1.96,43.8,24.4,19.4,0.69,W W L D D,60275.0,Kai Havertz - 9,David Raya,,
3,Nott'ham Forest,28.0,15.0,6.0,7.0,45.0,33.0,12.0,51.0,1.82,35.0,33.6,1.4,0.05,W L L D W,30080.0,Chris Wood - 18,Matz Sels,,
4,Chelsea,28.0,14.0,7.0,7.0,53.0,36.0,17.0,49.0,1.75,55.5,38.8,16.8,0.6,W L L W W,39610.0,Cole Palmer - 14,Robert Sánchez,,


In [7]:
# data cleaning and processing
# rotate the header labels one position to the left (the first label wraps to the end)
shifted_labels = [*df.columns[1:], df.columns[0]]

df = df.drop(index=0)      # remove the empty first row
df.columns = shifted_labels
df = df.iloc[:, :-2]       # discard the two trailing columns left over after the shift
df.head()

Unnamed: 0,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Last 5,Attendance,Top Team Scorer,Goalkeeper
1,Liverpool,29,21,7,1,69,27,42,70,2.41,65.0,25.1,39.8,1.37,W D W W W,60300,Mohamed Salah - 27,Alisson
2,Arsenal,28,15,10,3,52,24,28,55,1.96,43.8,24.4,19.4,0.69,W W L D D,60275,Kai Havertz - 9,David Raya
3,Nott'ham Forest,28,15,6,7,45,33,12,51,1.82,35.0,33.6,1.4,0.05,W L L D W,30080,Chris Wood - 18,Matz Sels
4,Chelsea,28,14,7,7,53,36,17,49,1.75,55.5,38.8,16.8,0.6,W L L W W,39610,Cole Palmer - 14,Robert Sánchez
5,Manchester City,28,14,5,9,53,38,15,47,1.68,49.7,37.4,12.3,0.44,L W L W L,52897,Erling Haaland - 20,Ederson


In [8]:
# save this df as the league table
# take an independent copy: `df` is an iloc slice, and a plain alias would make later
# column assignments on `table` write through to `df` (and can trigger pandas'
# SettingWithCopyWarning)
table = df.copy()
table.head()

Unnamed: 0,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Last 5,Attendance,Top Team Scorer,Goalkeeper
1,Liverpool,29,21,7,1,69,27,42,70,2.41,65.0,25.1,39.8,1.37,W D W W W,60300,Mohamed Salah - 27,Alisson
2,Arsenal,28,15,10,3,52,24,28,55,1.96,43.8,24.4,19.4,0.69,W W L D D,60275,Kai Havertz - 9,David Raya
3,Nott'ham Forest,28,15,6,7,45,33,12,51,1.82,35.0,33.6,1.4,0.05,W L L D W,30080,Chris Wood - 18,Matz Sels
4,Chelsea,28,14,7,7,53,36,17,49,1.75,55.5,38.8,16.8,0.6,W L L W W,39610,Cole Palmer - 14,Robert Sánchez
5,Manchester City,28,14,5,9,53,38,15,47,1.68,49.7,37.4,12.3,0.44,L W L W L,52897,Erling Haaland - 20,Ederson


In [9]:
# normalize squad names in season table to match data later
# each pair is (name as it appears in the league table, name to use instead)
team_name_mapping = dict([
    ('Manchester Utd', 'Manchester United'),
    ("Nott'ham Forest", 'Nottingham Forest'),
    ('Ipswich Town', 'Ipswich'),
    ('Wolves', 'Wolverhampton'),
    ('Leicester City', 'Leicester'),
    ('Newcastle Utd', 'Newcastle'),
])

# whole-value replacement; squads without an entry in the mapping are left untouched
normalized_squads = table['Squad'].replace(team_name_mapping)
table['Squad'] = normalized_squads
table

Unnamed: 0,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Last 5,Attendance,Top Team Scorer,Goalkeeper
1,Liverpool,29,21,7,1,69,27,42,70,2.41,65.0,25.1,39.8,1.37,W D W W W,60300,Mohamed Salah - 27,Alisson
2,Arsenal,28,15,10,3,52,24,28,55,1.96,43.8,24.4,19.4,0.69,W W L D D,60275,Kai Havertz - 9,David Raya
3,Nottingham Forest,28,15,6,7,45,33,12,51,1.82,35.0,33.6,1.4,0.05,W L L D W,30080,Chris Wood - 18,Matz Sels
4,Chelsea,28,14,7,7,53,36,17,49,1.75,55.5,38.8,16.8,0.6,W L L W W,39610,Cole Palmer - 14,Robert Sánchez
5,Manchester City,28,14,5,9,53,38,15,47,1.68,49.7,37.4,12.3,0.44,L W L W L,52897,Erling Haaland - 20,Ederson
6,Newcastle,28,14,5,9,47,38,9,47,1.68,46.3,35.4,10.9,0.39,L L W L W,52189,Alexander Isak - 19,Nick Pope
7,Brighton,28,12,10,6,46,40,6,46,1.64,41.6,39.3,2.3,0.08,L W W W W,32173,João Pedro - 8,Bart Verbruggen
8,Aston Villa,29,12,9,8,41,45,-4,45,1.55,41.2,40.0,1.1,0.04,D D W L W,41958,Ollie Watkins - 13,Emiliano Martínez
9,Bournemouth,28,12,8,8,47,34,13,44,1.57,51.3,37.9,13.4,0.48,L W L L D,11213,Justin Kluivert - 12,Kepa Arrizabalaga
10,Fulham,28,11,9,8,41,38,3,42,1.5,38.8,32.1,6.8,0.24,W W L W L,26550,Raúl Jiménez - 10,Bernd Leno


In [10]:
# start scraping goal.com for matchday data
b.get(epl_sched_url)
page_body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
WebDriverWait(b, 10).until(page_body_present)  # wait for site to load

# scroll to matchdays panel (located through an absolute XPath)
button_scroller = b.find_element(
    By.XPATH,
    '/html/body/div[1]/main/section[3]/div[3]/div[1]/div/div[1]/div/div/div',
)
b.execute_script(
    "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
    button_scroller,
)

matches = []  # for storing matches in the season

# function for scraping raw html for a given matchday number "m"
def get_matchday_data(m):
    """Click the "Game Week <m>" button and return the matchday container's outer HTML.

    Uses the shared driver ``b``. Any exception raised along the way is printed
    and swallowed, in which case the function returns ``None``.
    """
    scroll_into_view = "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});"
    try:
        # NOTE(review): contains() also matches longer labels that start with the same
        # text (e.g. "Game Week 1" vs "Game Week 10"); the first match is used
        button_locator = (By.XPATH, f'//button[contains(text(), "Game Week {m}")]')
        matchday_button = WebDriverWait(b, 10).until(
            EC.element_to_be_clickable(button_locator)
        )

        # bring the button to the middle of the viewport, then give the smooth scroll time to finish
        b.execute_script(scroll_into_view, matchday_button)
        time.sleep(6)
        matchday_button.click()

        # fixed pause so the selected matchday's matches can load
        time.sleep(3)

        # current matchday element, addressed by absolute XPath
        return b.find_element(
            By.XPATH, "/html/body/div[1]/main/section[3]/div[3]/div[1]/div/div[2]"
        ).get_attribute('outerHTML')
    except Exception as e:
        print(f"Error occurred: {e}")
    return None

# function for taking a list of games and normalizing/extracting important data, appending to all matches list
def add_matchday_data(games, out=None):
    """Extract one record per match from parsed match-day blocks and append it to a list.

    Parameters
    ----------
    games : iterable
        Parsed match-day elements exposing BeautifulSoup-style ``find`` and
        ``find_all`` methods.
    out : list, optional
        List that receives the records. When omitted, the notebook-level
        ``matches`` list is used, so existing one-argument calls are unchanged.

    Each appended record is a dict with the keys ``date``, ``home_team``,
    ``away_team``, ``score`` (``None`` when the match has no score element) and
    ``time`` (``'N/A'`` when the match has no time element).

    Raises
    ------
    AttributeError
        If a match-day block has no date heading, or a match has no home/away
        team element (``find`` returns ``None`` for them).
    """
    sink = matches if out is None else out

    for match_day in games:
        # date heading shared by every match listed under this match-day block
        date = match_day.find('span', class_='heading_name__Iq9xg').get_text(strip=True)

        for match_wrapper in match_day.find_all('a', class_='match_match__rxaZt'):
            home_team = match_wrapper.find('span', class_='match_home-team__bdyXf').get_text(strip=True)
            away_team = match_wrapper.find('span', class_='match_away-team__t_cs1').get_text(strip=True)

            # look each optional element up once; unplayed games have no score yet
            score_tag = match_wrapper.find('span', class_='match_score__EI49F')
            score = score_tag.get_text(strip=True) if score_tag is not None else None

            match_time_tag = match_wrapper.find('time', class_='match_time__0YBq5')
            match_time = match_time_tag.get_text(strip=True) if match_time_tag is not None else 'N/A'

            sink.append({
                'date': date,
                'home_team': home_team,
                'away_team': away_team,
                'score': score,
                'time': match_time
            })

# loop starting from matchday 1 to matchday 29 (upcoming matchday)
for m in range(1,30):
    print("adding data for matchday " + str(m)) # tracking progress
    raw_html = get_matchday_data(m) # raw html returned for a single match day
    if raw_html is None:
        # get_matchday_data prints the error and returns None on failure; None is not
        # parseable markup, so skip this matchday instead of crashing the whole scrape
        print("skipping matchday " + str(m) + ": no html returned")
        continue
    soup = BeautifulSoup(raw_html, 'html.parser') # parser object for html
    match_days = soup.find_all('div', class_='match-day_match-day__abKub') # collect matches data from parser
    add_matchday_data(match_days) # process matches for this matchday

adding data for matchday 1
adding data for matchday 2
adding data for matchday 3
adding data for matchday 4
adding data for matchday 5
adding data for matchday 6
adding data for matchday 7
adding data for matchday 8
adding data for matchday 9
adding data for matchday 10
adding data for matchday 11
adding data for matchday 12
adding data for matchday 13
adding data for matchday 14
adding data for matchday 15
adding data for matchday 16
adding data for matchday 17
adding data for matchday 18
adding data for matchday 19
adding data for matchday 20
adding data for matchday 21
adding data for matchday 22
adding data for matchday 23
adding data for matchday 24
adding data for matchday 25
adding data for matchday 26
adding data for matchday 27
adding data for matchday 28
adding data for matchday 29


In [11]:
# output the extracted matches, one line per scraped record
for record in matches:
    date, home, away, score, kickoff = (
        record[key] for key in ('date', 'home_team', 'away_team', 'score', 'time')
    )
    print(f"Date: {date}, Home Team: {home}, Away Team: {away}, Score: {score}, Time: {kickoff}")

Date: Friday 16 August, Home Team: Manchester United, Away Team: Fulham, Score: 1 - 0, Time: N/A
Date: Saturday 17 August, Home Team: Ipswich, Away Team: Liverpool, Score: 0 - 2, Time: N/A
Date: Saturday 17 August, Home Team: Nottingham Forest, Away Team: Bournemouth, Score: 1 - 1, Time: N/A
Date: Saturday 17 August, Home Team: Newcastle, Away Team: Southampton, Score: 1 - 0, Time: N/A
Date: Saturday 17 August, Home Team: Everton, Away Team: Brighton, Score: 0 - 3, Time: N/A
Date: Saturday 17 August, Home Team: Arsenal, Away Team: Wolverhampton, Score: 2 - 0, Time: N/A
Date: Saturday 17 August, Home Team: West Ham, Away Team: Aston Villa, Score: 1 - 2, Time: N/A
Date: Sunday 18 August, Home Team: Brentford, Away Team: Crystal Palace, Score: 2 - 1, Time: N/A
Date: Sunday 18 August, Home Team: Chelsea, Away Team: Manchester City, Score: 0 - 2, Time: N/A
Date: Monday 19 August, Home Team: Leicester, Away Team: Tottenham, Score: 1 - 1, Time: N/A
Date: Saturday 24 August, Home Team: Brighto

In [12]:
# save this df as the league schedule
# one row per scraped match; the columns come from the keys of the dicts collected in
# `matches` (date, home_team, away_team, score, time)
schedule = pd.DataFrame(matches)
schedule

Unnamed: 0,date,home_team,away_team,score,time
0,Friday 16 August,Manchester United,Fulham,1 - 0,
1,Saturday 17 August,Ipswich,Liverpool,0 - 2,
2,Saturday 17 August,Nottingham Forest,Bournemouth,1 - 1,
3,Saturday 17 August,Newcastle,Southampton,1 - 0,
4,Saturday 17 August,Everton,Brighton,0 - 3,
...,...,...,...,...,...
285,Saturday 15 March,Bournemouth,Brentford,,12:30
286,Sunday 16 March,Fulham,Tottenham,,08:30
287,Sunday 16 March,Arsenal,Chelsea,,08:30
288,Sunday 16 March,Leicester,Manchester United,,14:00


In [13]:
# feature weight vector below
# order: recent_form, season_form, attacking_threat, defensive_ability, x_factor
weights = np.array([1.3, 1.0, 1.8, 0.9, 0.7])

def get_team_features(team, season_table):
    """Compute the five weighted features for one squad from the season table.

    Parameters
    ----------
    team : str
        Squad name, matched exactly against ``season_table['Squad']``.
    season_table : pandas.DataFrame
        Must provide the columns ``Squad``, ``MP``, ``Pts``, ``xG``, ``xGA``,
        ``Last 5`` (whitespace-separated W/D/L results) and ``Top Team Scorer``
        (text ending in ``" - <goals>"``).

    Returns
    -------
    dict
        Keys ``recent_form``, ``season_form``, ``attacking_threat``,
        ``defensive_ability`` and ``x_factor``; each value is the raw per-game
        figure multiplied by the matching entry of ``weights``.

    Raises
    ------
    IndexError
        If no row of ``season_table`` has ``Squad == team``.
    """
    team_rows = season_table[season_table['Squad'] == team]
    if team_rows.empty:
        # same exception type that .iloc[0] would raise, but naming the missing squad
        raise IndexError(f"team {team!r} not found in season table 'Squad' column")
    team_data = team_rows.iloc[0]

    matches_played = int(team_data['MP'])

    # recent form: average points per listed result (3 for a win, 1 for a draw).
    # Dividing by the number of results actually listed keeps the average correct
    # when fewer than five games are available.
    points_per_result = {'W': 3, 'D': 1}
    recent_results = team_data['Last 5'].split()
    if recent_results:
        recent_form = sum(points_per_result.get(result, 0) for result in recent_results) / len(recent_results)
    else:
        recent_form = 0.0

    season_form = float(team_data['Pts']) / matches_played        # points per game
    attacking_threat = float(team_data['xG']) / matches_played    # xG per game
    defensive_ability = float(team_data['xGA']) / matches_played  # xGA per game

    # the goal count is whatever follows the last " - " in the scorer text
    top_scorer_goals = int(team_data['Top Team Scorer'].rsplit(' - ', 1)[-1])
    x_factor = top_scorer_goals / matches_played  # top scorer's goals per game

    # apply the weight vector element-wise
    weighted_features = np.array([
        recent_form,
        season_form,
        attacking_threat,
        defensive_ability,
        x_factor,
    ]) * weights

    feature_names = ('recent_form', 'season_form', 'attacking_threat', 'defensive_ability', 'x_factor')
    return dict(zip(feature_names, weighted_features))

In [14]:
# testing function with Liverpool
# (sanity check: displays the dict of weighted features for one squad from `table`)
get_team_features("Liverpool",table)

{'recent_form': 3.3800000000000003,
 'season_form': 2.413793103448276,
 'attacking_threat': 4.0344827586206895,
 'defensive_ability': 0.7789655172413794,
 'x_factor': 0.6517241379310345}

In [15]:
# uses the data to create features and target vectors
def prepare_data(schedule, season_table):
    """Build the feature matrix and goal-difference target from played matches.

    Parameters
    ----------
    schedule : pandas.DataFrame
        Needs ``home_team``, ``away_team`` and ``score`` columns; ``score`` is
        ``"<home> - <away>"`` for played games and missing (``None``/NaN) otherwise.
    season_table : pandas.DataFrame
        League table handed to ``get_team_features``.

    Returns
    -------
    (features, target) : tuple of numpy.ndarray
        ``features`` has one row per played match (home features followed by
        away features); ``target`` is home goals minus away goals.
    """
    vector_cache = {}

    def team_vector(team):
        # a team's features depend only on the season table, so compute them once per team
        if team not in vector_cache:
            vector_cache[team] = np.array(list(get_team_features(team, season_table).values()))
        return vector_cache[team]

    features = []
    target = []
    for idx, row in schedule.iterrows():
        # pd.isna covers both None and NaN, whichever pandas used for the missing score;
        # unplayed rows are skipped here and predicted later
        if pd.isna(row['score']):
            continue

        # combine the features into a single vector (home vs away)
        features.append(np.concatenate([team_vector(row['home_team']), team_vector(row['away_team'])]))

        # calculate the score difference as target (home goals - away goals)
        home_goals, away_goals = map(int, row['score'].split(' - '))
        target.append(home_goals - away_goals)

    # convert features and target into numpy arrays for machine learning
    features = np.array(features)
    target = np.array(target)

    return features, target

In [16]:
# get the numpy arrays for input to ML models
features, target = prepare_data(schedule, table)

# one seed shared by the split and by every stochastic estimator, so that re-running the
# notebook reproduces the same scores
SEED = 42

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=SEED)

# feature scaling to normalize X splits (statistics are fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# different models to explore
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Random Forest': RandomForestRegressor(random_state=SEED),
    'Gradient Boosting': GradientBoostingRegressor(random_state=SEED),
    'XGBoost': xgb.XGBRegressor(random_state=SEED),
    'LightGBM': lgb.LGBMRegressor(random_state=SEED)
}

In [17]:
# function to evaluate different models
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit every candidate, print its test-set MSE and return the lowest-error one.

    ``models`` maps a display name to an estimator with ``fit``/``predict``.
    Estimators are fitted in place. On equal errors the earlier entry wins;
    ``None`` is returned when no candidate scores below infinity (for example
    when ``models`` is empty).
    """
    winner, lowest_error = None, float('inf')

    for label in models:
        candidate = models[label]

        # train on the training split, score on the held-out split
        candidate.fit(X_train, y_train)
        error = mean_squared_error(y_test, candidate.predict(X_test))
        print(f"{label} - Mean Squared Error: {error}")

        # strict comparison keeps the first of several equally good models
        if error < lowest_error:
            winner, lowest_error = candidate, error

    return winner

# evaluate the models and select the best one
# (candidates are fit and scored on the StandardScaler-transformed splits)
best_model = evaluate_models(models, X_train_scaled, X_test_scaled, y_train, y_test)

Linear Regression - Mean Squared Error: 1.8876167188597293
Ridge Regression - Mean Squared Error: 1.8803453651547006
Random Forest - Mean Squared Error: 2.278086206896552
Gradient Boosting - Mean Squared Error: 2.338239320245923
XGBoost - Mean Squared Error: 2.8725660581236667
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 252, number of used features: 10
[LightGBM] [Info] Start training from score 0.023810
LightGBM - Mean Squared Error: 2.4238230380689503


In [18]:
# define parameter grid for GridSearchCV
# the three boosting models share one search space; each receives its own copy
_boosting_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

param_grids = {
    'Linear Regression': {},  # nothing to tune
    'Ridge Regression': {'alpha': [0.1, 1, 10, 100]},
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
    },
    **{
        booster: {param: list(values) for param, values in _boosting_grid.items()}
        for booster in ('Gradient Boosting', 'XGBoost', 'LightGBM')
    },
}

# function for hyperparameter tuning with GridSearchCV
def hyperparameter_tuning(model_name, model, param_grid, X_train, y_train):
    """Grid-search ``param_grid`` for ``model`` and return the best fitted estimator.

    The search uses 5-fold cross-validation scored by negative mean squared
    error and runs with ``n_jobs=-1``. ``model_name`` only appears in the two
    progress messages that are printed.
    """
    print(f"Tuning hyperparameters for {model_name}")

    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)

    print(f"Best hyperparameters for {model_name}: {search.best_params_}")
    return search.best_estimator_

In [19]:
# map estimator classes back to the display names used as keys in `models` / `param_grids`
model_names = {
    LinearRegression: 'Linear Regression',
    Ridge: 'Ridge Regression',
    RandomForestRegressor: 'Random Forest',
    GradientBoostingRegressor: 'Gradient Boosting',
    xgb.XGBRegressor: 'XGBoost',
    lgb.LGBMRegressor: 'LightGBM',
}

# name of the winner from the model comparison step
best_model_name = model_names[type(best_model)]

# tune hyperparameters for that model only
# NOTE(review): tuning and the evaluation below use the unscaled X_train / X_test, whereas
# the model comparison was run on the scaled arrays, so the two MSE figures are not
# measured on identical inputs
best_tuned_model = hyperparameter_tuning(
    model_name=best_model_name,
    model=best_model,
    param_grid=param_grids[best_model_name],
    X_train=X_train,
    y_train=y_train,
)

# held-out error of the tuned model
y_pred_tuned = best_tuned_model.predict(X_test)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
print(f"Mean Squared Error after Hyperparameter Tuning: {mse_tuned}")

Tuning hyperparameters for Ridge Regression
Best hyperparameters for Ridge Regression: {'alpha': 10}
Mean Squared Error after Hyperparameter Tuning: 1.8751756882290675


In [20]:
# reuse the tuned estimator for predictions
# GridSearchCV (with its default refit) has already fit best_estimator_ on X_train / y_train
# using the best parameters, so a second fit is redundant; reusing it also avoids reading
# `.alpha`, which only exists when the winning model happens to be a Ridge regressor
best_ridge_model = best_tuned_model

# predict scores for unplayed matches
def predict_match_score(row, season_table, model):
    """Predict a scoreline for one fixture from a goal-difference model.

    Parameters
    ----------
    row : mapping
        Needs ``home_team`` and ``away_team`` entries.
    season_table : pandas.DataFrame
        League table used to look up both teams' features.
    model : fitted regressor
        Predicts home goals minus away goals from the concatenated
        home + away feature vector.

    Returns
    -------
    tuple
        ``(home_team, away_team, home_goals, away_goals)``. Only the predicted
        margin is encoded: the side predicted to lose always gets 0 goals, and a
        margin that rounds to 0 yields 0 - 0.
    """
    # use the table that was passed in, not the notebook-level `table`
    home_features = get_team_features(row['home_team'], season_table)
    away_features = get_team_features(row['away_team'], season_table)

    # create feature vector for this matchup (single-row 2-D array for predict)
    home_vector = np.array(list(home_features.values()))
    away_vector = np.array(list(away_features.values()))
    feature_vector = np.concatenate([home_vector, away_vector]).reshape(1, -1)

    # predict the score difference (home_goals - away_goals)
    score_diff = model.predict(feature_vector)[0]

    # predict goals (round to nearest integer)
    home_goals = round(max(score_diff, 0))  # positive margin is credited to the home side
    away_goals = round(max(-score_diff, 0))  # negative margin is credited to the away side

    return row['home_team'], row['away_team'], home_goals, away_goals

In [21]:
# predict the scores for all matches with no score
predicted_scores = []

for idx, row in schedule.iterrows():
    # pd.isna treats None and NaN alike, whichever pandas used for the missing score
    if pd.isna(row['score']):
        home_team, away_team, home_goals, away_goals = predict_match_score(row, table, best_ridge_model)
        predicted_scores.append({'home_team': home_team, 'away_team': away_team, 'home_goals': home_goals, 'away_goals': away_goals})

# collect the predictions; explicit columns keep the frame well-formed even when there
# are no unplayed matches
predicted_df = pd.DataFrame(predicted_scores, columns=['home_team', 'away_team', 'home_goals', 'away_goals'])

# PREDICTED OUTCOMES FOR MATCHWEEK 29
predicted_df

Unnamed: 0,home_team,away_team,home_goals,away_goals
0,Southampton,Wolverhampton,0,1
1,Everton,West Ham,0,0
2,Manchester City,Brighton,0,0
3,Ipswich,Nottingham Forest,0,1
4,Bournemouth,Brentford,0,0
5,Fulham,Tottenham,0,0
6,Arsenal,Chelsea,0,0
7,Leicester,Manchester United,0,1
8,Newcastle,Crystal Palace,0,0


In [23]:
# ALTERNATIVELY, USE POISSON REGRESSOR FOR MORE REALISTIC PREDICTION OF ACTUAL SCORES (NOT NECESSARILY JUST OUTCOMES)

# updated function for preparing input data to Poisson model
# NOTE: this redefines prepare_data; from here on it returns three arrays instead of two
def prepare_data(schedule, season_table):
    """Build the feature matrix plus separate home/away goal counts from played matches.

    Parameters
    ----------
    schedule : pandas.DataFrame
        Needs ``home_team``, ``away_team`` and ``score`` columns; ``score`` is
        ``"<home> - <away>"`` for played games and missing (``None``/NaN) otherwise.
    season_table : pandas.DataFrame
        League table handed to ``get_team_features``.

    Returns
    -------
    (features, home_goals, away_goals) : tuple of numpy.ndarray
        One entry per played match, in schedule order.
    """
    vector_cache = {}

    def team_vector(team):
        # a team's features depend only on the season table, so compute them once per team
        if team not in vector_cache:
            vector_cache[team] = np.array(list(get_team_features(team, season_table).values()))
        return vector_cache[team]

    features = []
    home_goals = []
    away_goals = []

    for idx, row in schedule.iterrows():
        # pd.isna covers both None and NaN, whichever pandas used for the missing score
        if pd.isna(row['score']):
            continue

        # home features followed by away features
        features.append(np.concatenate([team_vector(row['home_team']), team_vector(row['away_team'])]))

        # extract goal counts
        home_goals_count, away_goals_count = map(int, row['score'].split(' - '))
        home_goals.append(home_goals_count)
        away_goals.append(away_goals_count)

    # convert lists to numpy arrays
    features = np.array(features)
    home_goals = np.array(home_goals)
    away_goals = np.array(away_goals)

    return features, home_goals, away_goals

from sklearn.linear_model import PoissonRegressor

# prepare data
features, home_goals, away_goals = prepare_data(schedule, table)

# train-test split: a single call partitions the features and both goal vectors with the
# same row assignment
(X_train, X_test,
 y_train_home, y_test_home,
 y_train_away, y_test_away) = train_test_split(
    features, home_goals, away_goals, test_size=0.1, random_state=42
)

# train Poisson models, one per side
home_goal_model = PoissonRegressor().fit(X_train, y_train_home)
away_goal_model = PoissonRegressor().fit(X_train, y_train_away)

# updated function for predicting home/away goals using Poisson model
def predict_match_score(row, season_table, home_model, away_model):
    """Predict both sides' goal counts for one fixture.

    ``home_model`` and ``away_model`` each receive the same single-row feature
    vector (home features followed by away features); their predictions are
    rounded to the nearest integer.

    Returns ``(home_team, away_team, home_goals, away_goals)``.
    """
    home_team, away_team = row['home_team'], row['away_team']

    # look up both teams, then lay the values out as one 2-D row for predict()
    home_values = list(get_team_features(home_team, season_table).values())
    away_values = list(get_team_features(away_team, season_table).values())
    feature_vector = np.concatenate(
        [np.array(home_values), np.array(away_values)]
    ).reshape(1, -1)

    # expected goals per side, home first
    expected_goals = [
        home_model.predict(feature_vector)[0],
        away_model.predict(feature_vector)[0],
    ]
    home_goals, away_goals = (int(round(value)) for value in expected_goals)

    return home_team, away_team, home_goals, away_goals

predicted_scores = []

for idx, row in schedule.iterrows():
    # predict only if score is missing; pd.isna treats None and NaN alike
    if pd.isna(row['score']):
        home_team, away_team, home_goals, away_goals = predict_match_score(row, table, home_goal_model, away_goal_model)
        predicted_scores.append({'home_team': home_team, 'away_team': away_team, 'home_goals': home_goals, 'away_goals': away_goals})

# PREDICTED SCORES FOR MATCHWEEK 29 (USING POISSON)
# explicit columns keep the frame well-formed even when there are no unplayed matches
predicted_df = pd.DataFrame(predicted_scores, columns=['home_team', 'away_team', 'home_goals', 'away_goals'])
predicted_df

Unnamed: 0,home_team,away_team,home_goals,away_goals
0,Southampton,Wolverhampton,1,2
1,Everton,West Ham,1,1
2,Manchester City,Brighton,1,1
3,Ipswich,Nottingham Forest,1,2
4,Bournemouth,Brentford,2,1
5,Fulham,Tottenham,1,1
6,Arsenal,Chelsea,1,1
7,Leicester,Manchester United,1,2
8,Newcastle,Crystal Palace,1,2
