# Making Predictions on Soccer Matches in the English Premier League using Machine Learning

Import the Python libraries `pandas`, `requests`, `BeautifulSoup`, and `sklearn`

In [144]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sklearn

Initialize the variables

In [145]:
matches = pd.read_csv("all_matches_1992_2022.csv", index_col=0)

In [146]:
# Find columns in DataFrame that are numeric, which can be used as predictors
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [147]:
# Convert `date` column to numeric using `pandas`
matches["date"] = pd.to_datetime(matches["date"])

matches["date"].dtype.name

'datetime64[ns]'

In [148]:
# Convert `venue` column to numeric value to create predictor for home-field advantage
matches["venue_code"] = matches["venue"].astype("category")

# Assign a numeric value to each category
matches["venue_code"] = matches["venue_code"].cat.codes

matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,notes,sh,sot,dist,fk,pk,pkatt,season,team,venue_code
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,,10.0,2.0,14.6,1.0,0.0,0.0,2022,Arsenal,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,,19.0,7.0,13.0,0.0,0.0,0.0,2022,Arsenal,1
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,,14.0,6.0,14.8,0.0,0.0,0.0,2022,Arsenal,0
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,,22.0,8.0,15.5,1.0,0.0,0.0,2022,Arsenal,1
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,,22.0,8.0,16.3,1.0,0.0,0.0,2022,Arsenal,1


In [149]:
# For each team, repeat the steps above to assign a numeric value for each opponent
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sh,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,10.0,2.0,14.6,1.0,0.0,0.0,2022,Arsenal,0,8
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,19.0,7.0,13.0,0.0,0.0,0.0,2022,Arsenal,1,13
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,14.0,6.0,14.8,0.0,0.0,0.0,2022,Arsenal,0,2
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,22.0,8.0,15.5,1.0,0.0,0.0,2022,Arsenal,1,10
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,22.0,8.0,16.3,1.0,0.0,0.0,2022,Arsenal,1,1


In [150]:
# Extract only the hour value from the `time` column in order to determine results of matches at different hours of the day
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sot,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,2.0,14.6,1.0,0.0,0.0,2022,Arsenal,0,8,20
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,7.0,13.0,0.0,0.0,0.0,2022,Arsenal,1,13,15
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,6.0,14.8,0.0,0.0,0.0,2022,Arsenal,0,2,17
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,8.0,15.5,1.0,0.0,0.0,2022,Arsenal,1,10,17
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,8.0,16.3,1.0,0.0,0.0,2022,Arsenal,1,1,19


In [151]:
# Assign a numeric value for each day of the week to determine results of matches at different days
matches["day_code"] = matches["date"].dt.dayofweek

matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,14.6,1.0,0.0,0.0,2022,Arsenal,0,8,20,4
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,13.0,0.0,0.0,0.0,2022,Arsenal,1,13,15,5
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,14.8,0.0,0.0,0.0,2022,Arsenal,0,2,17,5
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,15.5,1.0,0.0,0.0,2022,Arsenal,1,10,17,5
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,16.3,1.0,0.0,0.0,2022,Arsenal,1,1,19,2


In [152]:
# Assign a numeric value for the result (win/draw/loss)
matches["result_targets"] = matches["result"].astype("category").cat.codes

matches.head(20)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,result_targets
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,1.0,0.0,0.0,2022,Arsenal,0,8,20,4,2
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,0.0,0.0,0.0,2022,Arsenal,1,13,15,5,2
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,0.0,0.0,0.0,2022,Arsenal,0,2,17,5,2
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,1.0,0.0,0.0,2022,Arsenal,1,10,17,5,2
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,1.0,0.0,0.0,2022,Arsenal,1,1,19,2,2
5,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1.0,3.0,Manchester Utd,...,1.0,0.0,0.0,2022,Arsenal,0,16,16,6,1
7,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3.0,0.0,Brentford,...,0.0,0.0,0.0,2022,Arsenal,0,3,12,6,2
8,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3.0,1.0,Tottenham,...,1.0,0.0,0.0,2022,Arsenal,1,24,12,5,2
10,2022-10-09,16:30,Premier League,Matchweek 10,Sun,Home,W,3.0,2.0,Liverpool,...,0.0,1.0,1.0,2022,Arsenal,1,14,16,6,2
12,2022-10-16,14:00,Premier League,Matchweek 11,Sun,Away,W,1.0,0.0,Leeds United,...,1.0,0.0,0.0,2022,Arsenal,0,12,14,6,2


From `sklearn`, import a random forest classifier to make the initial predictions

In [153]:
from sklearn.ensemble import RandomForestClassifier as rfc

In [154]:
rf = rfc(n_estimators=50, min_samples_split=10, random_state=1)

Split the dataset into training and test data. The model will be trained on the training data, and predictions will be made on the test data.
For this, the training data will be all seasons preceding 2022 (current season), and the test data will be 2022 (current season)

In [155]:
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] >= '2022-01-01']

In [156]:
# Create a list of the predictors
predictors = ["venue_code", "opp_code", "hour", "day_code"]

Build a forest of trees from the training data using the `RandomForestClassifier.fit()` method. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.fit)

In [157]:
rf.fit(train[predictors], train["result_targets"])

Generate predictions using the `RandomForestClassifier.predict()` method. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.predict)

In [158]:
predictions = rf.predict(test[predictors])

Measure the precision of the predictions, using `accuracy_score`. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)

In [159]:
from sklearn.metrics import accuracy_score

Use `accuracy_score` to determine the accuracy of the model. 

In [160]:
accuracy = accuracy_score(test["result_targets"], predictions)

In [161]:
accuracy

0.4504643962848297

With the accuracy not being great, I will explore the dataset to see which data points were more/less accurate.

Create a new dataframe combining the test data and predictions

In [162]:
combined = pd.DataFrame(dict(actual=test["result_targets"], prediction=predictions))

In [163]:
# Use `pandas.crosstab()` method to create a table of predicted vs actual outcomes
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15,63,66
1,30,133,88
2,23,85,143


The resulting table states that the model was more accurate when predicting wins/losses, but inaccurate when predicting draws

In [164]:
from sklearn.metrics import precision_score

In [165]:
precision_score(test["result_targets"], predictions, average=None)

array([0.22058824, 0.47330961, 0.48148148])

Looking from the `precision_score` above, the percentage of correct predictions are stated for draws, losses, wins respectively.

The model can be improved by computing rolling averages based off of the average team stats in the last N matches. These averages can give the model information about previous matches in order to make a better prediction.

Create a dataframe for the teams

In [166]:
grouped_matches = matches.groupby("team")

Define the `rolling_averages` function to take in a group, existing columns, and new columns

In [167]:
def rolling_averages(group, columns, new_columns):
    group = group.sort_values("date")
    
    # Take the set of columns and compute the rolling average for the previous 3 iterations.
    # closed='left' indicates that the current week will not be included
    rolling_stats = group[columns].rolling(3, closed='left').mean()
    
    group[new_columns] = rolling_stats
    
    # If data is not available (ex. 3 weeks of data not available for Week 2), remove the missing values from group
    group = group.dropna(subset=new_columns)
    
    return group

Define columns that will be informative for calculating rolling averages (numeric) and create new columns to hold value of rolling average for each of these columns.

In [168]:
columns = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
new_columns = [f"{column}_rolling" for column in columns]

For each team, apply the `rolling_averages` function for the columns defined

In [169]:
matches_rolling = matches.groupby("team").apply(lambda group: rolling_averages(group, columns, new_columns))

In [170]:
# Remove the left-most index of team-name that was added on from previous step
matches_rolling = matches_rolling.droplevel("team")

# Assign unique index numbers for each match
matches_rolling.index = range(matches_rolling.shape[0])

matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,result_targets,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2017-09-09,15:00,Premier League,Matchweek 4,Sat,Home,W,3.0,0.0,Bournemouth,...,5,2,1.333333,2.666667,17.666667,5.333333,18.133333,0.000000,0.000000,0.000000
1,2017-09-17,13:30,Premier League,Matchweek 5,Sun,Away,D,0.0,0.0,Chelsea,...,6,0,1.000000,1.666667,14.333333,5.000000,16.766667,0.333333,0.000000,0.000000
2,2017-09-25,20:00,Premier League,Matchweek 6,Mon,Home,W,2.0,0.0,West Brom,...,0,2,1.000000,1.333333,12.000000,3.666667,16.566667,0.333333,0.000000,0.000000
3,2017-10-01,12:00,Premier League,Matchweek 7,Sun,Home,W,2.0,0.0,Brighton,...,6,2,1.666667,0.000000,14.333333,5.333333,17.400000,1.333333,0.333333,0.333333
4,2017-10-14,17:30,Premier League,Matchweek 8,Sat,Away,L,1.0,2.0,Watford,...,5,1,1.333333,0.000000,17.000000,5.000000,18.333333,1.666667,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3951,2022-10-08,15:00,Premier League,Matchweek 10,Sat,Away,L,0.0,3.0,Chelsea,...,5,1,0.333333,1.666667,9.333333,2.333333,18.033333,0.666667,0.000000,0.000000
3952,2022-10-15,15:00,Premier League,Matchweek 11,Sat,Home,W,1.0,0.0,Nott'ham Forest,...,5,2,0.000000,2.666667,9.666667,2.333333,18.600000,1.000000,0.000000,0.000000
3953,2022-10-18,20:15,Premier League,Matchweek 12,Tue,Away,L,1.0,2.0,Crystal Palace,...,1,1,0.333333,1.666667,10.333333,2.333333,17.933333,0.333333,0.333333,0.333333
3954,2022-10-23,14:00,Premier League,Matchweek 13,Sun,Home,L,0.0,4.0,Leicester City,...,6,1,0.666667,1.666667,8.666667,2.333333,16.366667,0.666667,0.333333,0.333333


With these new predictors, the model can be re-trained to hopefully provide more accurate predictions.

In [185]:
# Define a function that will split the dataset into training and test sets, make predictions, and return predictions

def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] >= '2022-01-01']
    
    rf.fit(train[predictors], train["result_targets"])
    
    predictions = rf.predict(test[predictors])
    
    combined = pd.DataFrame(dict(actual=test["result_targets"], predicted=predictions), index=test.index)
    precision = precision_score(test["result_targets"], predictions, average=None)
    
    return combined, precision

In [186]:
combined, precision = make_predictions(matches_rolling, predictors + new_columns)

The precision of the model with the new rolling_averages predictors increased, but only slightly.

In [187]:
precision

array([0.22222222, 0.52411576, 0.49508197])

In [189]:
# Add the date, team, opponent, and results into the `combined` dataframe
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [190]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
168,1,2,2022-01-01,Arsenal,Manchester City,L
169,0,2,2022-01-23,Arsenal,Burnley,D
170,2,2,2022-02-10,Arsenal,Wolves,W
171,2,2,2022-02-19,Arsenal,Brentford,W
172,2,2,2022-02-24,Arsenal,Wolves,W
...,...,...,...,...,...,...
3951,1,1,2022-10-08,Wolverhampton Wanderers,Chelsea,L
3952,2,2,2022-10-15,Wolverhampton Wanderers,Nott'ham Forest,W
3953,1,1,2022-10-18,Wolverhampton Wanderers,Crystal Palace,L
3954,1,1,2022-10-23,Wolverhampton Wanderers,Leicester City,L


To fix the issue of having the model predict that both teams may win, there needs to be a filter for predictions where the model believes there will be 1 winner and 1 loser.

In [207]:
# The names in the opponent column differ slightly, so this dictionary matches the name from 'team' with 'opponent' where it differs

class MissingDict(dict):
    # If key does not exist in dictionary, then assign key as given value
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton", 
    "Manchester United": "Manchester Utd", 
    "Newcastle United": "Newcastle Utd", 
    "Tottenham Hotspur": "Tottenham", 
    "West Ham United": "West Ham", 
    "Wolverhampton Wanderers": "Wolves"
}

mapping = MissingDict(**map_values)

In [208]:
mapping["Tottenham Hotspur"]

'Tottenham'

In [210]:
mapping["Manchester City"]

'Manchester City'

Rename the team names in the 'team' column to align with the names in the 'opponent' column

In [211]:
combined["new_team_name"] = combined["team"].map(mapping)

In [212]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team_name
168,1,2,2022-01-01,Arsenal,Manchester City,L,Arsenal
169,0,2,2022-01-23,Arsenal,Burnley,D,Arsenal
170,2,2,2022-02-10,Arsenal,Wolves,W,Arsenal
171,2,2,2022-02-19,Arsenal,Brentford,W,Arsenal
172,2,2,2022-02-24,Arsenal,Wolves,W,Arsenal
...,...,...,...,...,...,...,...
3951,1,1,2022-10-08,Wolverhampton Wanderers,Chelsea,L,Wolves
3952,2,2,2022-10-15,Wolverhampton Wanderers,Nott'ham Forest,W,Wolves
3953,1,1,2022-10-18,Wolverhampton Wanderers,Crystal Palace,L,Wolves
3954,1,1,2022-10-23,Wolverhampton Wanderers,Leicester City,L,Wolves


Merge the dataframe onto itself, aligning `new_team_name` with `opponent` while matching `date` to merge a single match on each side.

In [213]:
merge = combined.merge(combined, left_on=["date", "new_team_name"], right_on=["date", "opponent"])

In [214]:
merge

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_name_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_name_y
0,1,2,2022-01-01,Arsenal,Manchester City,L,Arsenal,2,2,Manchester City,Arsenal,W,Manchester City
1,0,2,2022-01-23,Arsenal,Burnley,D,Arsenal,0,1,Burnley,Arsenal,D,Burnley
2,2,2,2022-02-10,Arsenal,Wolves,W,Arsenal,1,1,Wolverhampton Wanderers,Arsenal,L,Wolves
3,2,2,2022-02-19,Arsenal,Brentford,W,Arsenal,1,1,Brentford,Arsenal,L,Brentford
4,2,2,2022-02-24,Arsenal,Wolves,W,Arsenal,1,1,Wolverhampton Wanderers,Arsenal,L,Wolves
...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,1,1,2022-10-08,Wolverhampton Wanderers,Chelsea,L,Wolves,2,0,Chelsea,Wolves,W,Chelsea
626,2,2,2022-10-15,Wolverhampton Wanderers,Nott'ham Forest,W,Wolves,1,1,Nottingham Forest,Wolves,L,Nottingham Forest
627,1,1,2022-10-18,Wolverhampton Wanderers,Crystal Palace,L,Wolves,2,0,Crystal Palace,Wolves,W,Crystal Palace
628,1,1,2022-10-23,Wolverhampton Wanderers,Leicester City,L,Wolves,2,1,Leicester City,Wolves,W,Leicester City
