# BUILDING A MACHINE LEARNING MODEL TO PREDICT MATCH RESULTS #

***Using the previously scraped Premier League data, we will try to build a model that predicts match results.***

In [None]:
import pandas as pd

**Read in the data from the scraped file**

In [None]:
# Load the scraped match data; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where that file exists.
matches = pd.read_csv(
    filepath_or_buffer=r"C:\Users\Admin\Downloads\matches.csv",
    index_col=0,
)

**Display the data**

In [None]:
# Preview the first few rows of the loaded data.
matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13.0,1.0,18.7,1.0,1,1,2024,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19.0,7.0,17.5,0.0,0,0,2024,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21.0,10.0,16.2,1.0,0,0,2024,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18.0,5.0,14.1,0.0,0,0,2024,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17.0,9.0,14.8,0.0,0,0,2024,Manchester City


**Compute descriptive statistics**

In [None]:
# (number of rows, number of columns) of the loaded frame.
matches.shape

(1520, 27)

In [None]:
# Count how many match rows each team has in the data.
matches['Team'].value_counts()

Manchester City             76
Crystal Palace              76
Southampton                 76
Leeds United                76
Leicester City              76
Everton                     76
West Ham United             76
Wolverhampton Wanderers     76
Arsenal                     76
Chelsea                     76
Brentford                   76
Tottenham Hotspur           76
Aston Villa                 76
Brighton and Hove Albion    76
Liverpool                   76
Newcastle United            76
Manchester United           76
Fulham                      38
Bournemouth                 38
Nottingham Forest           38
Burnley                     38
Watford                     38
Norwich City                38
Name: Team, dtype: int64

In [None]:
# Inspect the dtype pandas inferred for each column.
matches.dtypes

Date             object
Time             object
Comp             object
Round            object
Day              object
Venue            object
Result           object
GF                int64
GA                int64
Opponent         object
xG              float64
xGA             float64
Poss            float64
Attendance      float64
Captain          object
Formation        object
Referee          object
Match Report     object
Notes           float64
Sh              float64
SoT             float64
Dist            float64
FK              float64
PK                int64
PKatt             int64
Season            int64
Team             object
dtype: object

**Convert the `Date` column to a datetime dtype**

In [None]:
# Parse the Date strings into pandas datetimes so the column can be sorted
# chronologically and compared against date cutoffs later on.
matches['Date'] = pd.to_datetime(matches['Date'])

In [None]:
# Re-check the dtypes to confirm the Date conversion took effect.
matches.dtypes

Date            datetime64[ns]
Time                    object
Comp                    object
Round                   object
Day                     object
Venue                   object
Result                  object
GF                       int64
GA                       int64
Opponent                object
xG                     float64
xGA                    float64
Poss                   float64
Attendance             float64
Captain                 object
Formation               object
Referee                 object
Match Report            object
Notes                  float64
Sh                     float64
SoT                    float64
Dist                   float64
FK                     float64
PK                       int64
PKatt                    int64
Season                   int64
Team                    object
dtype: object

**Encode the predictor columns as numeric features and create the target column**

In [None]:
# Encode the venue (e.g. Home / Away) as an integer category code.
matches['venue_code'] = matches['Venue'].astype('category').cat.codes

In [None]:
# Encode each opponent name as an integer category code.
matches['opp_code'] = matches['Opponent'].astype('category').cat.codes

In [None]:
# Keep only the kick-off hour: strip everything from the first ':' onward
# (e.g. '16:30' -> 16) and convert the remainder to an integer.
matches['hour'] = matches['Time'].str.replace(':.+', '', regex=True).astype('int')

In [None]:
# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
matches['day_code'] = matches['Date'].dt.dayofweek

In [None]:
# Binary target: 1 when the team won ('W'), 0 for every other result.
matches['target'] = (matches['Result'] == 'W').astype('int')

In [None]:
# List all columns, including the newly added feature and target columns.
matches.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes', 'Sh', 'SoT', 'Dist', 'FK', 'PK',
       'PKatt', 'Season', 'Team', 'venue_code', 'opp_code', 'hour', 'day_code',
       'target'],
      dtype='object')

**Train the model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 50 trees; a node needs at least 10 samples before it may
# be split, and the seed is fixed so repeated runs build the same model.
rf = RandomForestClassifier(
    random_state=1,
    min_samples_split=10,
    n_estimators=50,
)

In [None]:
# Training set: every match up to and including the cutoff date. Using <= keeps
# matches played on 2022-05-22 itself; a strict < would drop them from both the
# training set and the test set (the test split takes Date > '2022-05-22').
train = matches[matches['Date'] <= '2022-05-22']

In [None]:
# Test set: every match played strictly after the cutoff date.
is_after_cutoff = matches['Date'] > '2022-05-22'
test = matches.loc[is_after_cutoff]

In [None]:
# Feature columns fed to the classifier.
predictors = [
    'venue_code',
    'opp_code',
    'hour',
    'day_code',
]

In [None]:
# Fit the forest on the training rows: predictor columns as X, win flag as y.
rf.fit(X=train[predictors], y=train['target'])

**Run the predictions and measure the accuracy score**

In [None]:
# Predict the win flag for every test row.
preds = rf.predict(X=test[predictors])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of test rows whose predicted win flag equals the actual one.
acc = accuracy_score(y_true=test['target'], y_pred=preds)

In [None]:
# Show the accuracy score.
acc

0.7644736842105263

In [None]:
# Put the actual outcomes and the predictions side by side.
combined = pd.DataFrame({
    'actual': test['target'],
    'prediction': preds,
})

**Build a confusion matrix of actual vs. predicted results**

In [None]:
# Cross-tabulate actual outcomes (rows) against predictions (columns).
pd.crosstab(combined['actual'], combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,400,67
1,112,181


In [None]:
# Group the match rows by team so each team's matches can be handled separately.
grouped_matches = matches.groupby('Team')

In [None]:
# Pull out one team's rows to try the rolling-average helper on.
group = grouped_matches.get_group('Manchester City')

In [None]:
def rolling_averages(group, cols, new_cols):
    """Add rolling means of the previous three matches to one team's rows.

    Parameters
    ----------
    group : DataFrame holding a 'Date' column and every column in ``cols``.
    cols : list of column names to average.
    new_cols : list of names for the resulting columns, in the same order as ``cols``.

    Returns
    -------
    A date-sorted copy of ``group`` with the ``new_cols`` added. Rows without
    three earlier matches (rolling value is NaN) are dropped.
    """
    ordered = group.sort_values('Date')
    # closed='left' leaves the current match out of its own window, so each
    # average only uses matches played before it.
    ordered[new_cols] = ordered[cols].rolling(3, closed='left').mean()
    return ordered.dropna(subset=new_cols)


In [None]:
# Per-match statistics to average, and the names of the derived rolling columns.
cols = [
    'GF', 'GA',
    'Sh', 'SoT', 'Dist',
    'FK', 'PK', 'PKatt',
]
new_cols = [name + '_rolling' for name in cols]

In [None]:
# Show the generated rolling-column names.
new_cols

['GF_rolling',
 'GA_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [None]:
# Try the helper on the single-team frame selected earlier.
rolling_averages(group, cols, new_cols)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,5,1,3.333333,0.333333,19.666667,6.000000,16.866667,0.666667,0.000000,0.000000
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,5,0,3.666667,0.000000,22.000000,7.333333,15.866667,0.333333,0.000000,0.000000
8,2021-09-25,12:30,Premier League,Matchweek 6,Sat,Away,W,1,0,Chelsea,...,5,1,2.000000,0.000000,22.000000,6.333333,15.166667,0.333333,0.000000,0.000000
10,2021-10-03,16:30,Premier League,Matchweek 7,Sun,Away,D,2,2,Liverpool,...,6,0,0.666667,0.000000,18.666667,4.000000,15.933333,0.333333,0.000000,0.000000
11,2021-10-16,15:00,Premier League,Matchweek 8,Sat,Home,W,2,0,Burnley,...,5,1,1.000000,0.666667,14.333333,2.333333,16.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Leeds United,...,5,1,3.000000,0.666667,13.666667,8.000000,15.433333,0.000000,0.333333,0.333333
54,2023-05-14,14:00,Premier League,Matchweek 36,Sun,Away,W,3,0,Everton,...,6,1,2.333333,0.666667,14.666667,7.000000,16.366667,0.666667,0.333333,0.666667
56,2023-05-21,16:00,Premier League,Matchweek 37,Sun,Home,W,1,0,Chelsea,...,6,1,2.666667,0.333333,14.000000,5.666667,18.100000,1.333333,0.000000,0.333333
57,2023-05-24,20:00,Premier League,Matchweek 32,Wed,Away,D,1,1,Brighton,...,2,0,2.000000,0.333333,13.666667,4.000000,18.933333,1.333333,0.000000,0.333333


In [None]:
# Compute the rolling averages team by team; apply() forwards the extra
# positional arguments (cols, new_cols) to rolling_averages for each group.
per_team = matches.groupby('Team')
matches_rolling = per_team.apply(rolling_averages, cols, new_cols)

In [None]:
# Remove the 'Team' index level added by groupby().apply(), leaving only the
# original row labels as the index.
# NOTE(review): re-running this cell on its own fails once the level is gone.
matches_rolling = matches_rolling.droplevel('Team')

In [None]:
# Inspect the frame with the rolling-average columns.
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1,0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1,0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3,1,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0,0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2,2,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,2023-04-29,15:00,Premier League,Matchweek 34,Sat,Away,L,0,6,Brighton,...,5,0,1.666667,0.666667,11.666667,4.666667,18.700000,0.666667,0.333333,0.333333
40,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,1,0,Aston Villa,...,5,1,1.000000,2.666667,11.333333,2.333333,18.800000,0.666667,0.333333,0.333333
41,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Away,L,0,2,Manchester Utd,...,5,0,1.000000,2.000000,8.000000,2.000000,17.766667,0.000000,0.333333,0.333333
42,2023-05-20,15:00,Premier League,Matchweek 37,Sat,Home,D,1,1,Everton,...,5,0,0.333333,2.666667,7.000000,1.333333,15.600000,0.000000,0.000000,0.000000


In [None]:
# The per-team row labels repeat across teams, so replace them with a unique
# 0..n-1 index.
row_count = matches_rolling.shape[0]
matches_rolling.index = pd.RangeIndex(row_count)

In [None]:
# Inspect the frame again after re-indexing.
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
0,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1,0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
1,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1,0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
2,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3,1,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
3,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0,0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
4,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2,2,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1446,2023-04-29,15:00,Premier League,Matchweek 34,Sat,Away,L,0,6,Brighton,...,5,0,1.666667,0.666667,11.666667,4.666667,18.700000,0.666667,0.333333,0.333333
1447,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,1,0,Aston Villa,...,5,1,1.000000,2.666667,11.333333,2.333333,18.800000,0.666667,0.333333,0.333333
1448,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Away,L,0,2,Manchester Utd,...,5,0,1.000000,2.000000,8.000000,2.000000,17.766667,0.000000,0.333333,0.333333
1449,2023-05-20,15:00,Premier League,Matchweek 37,Sat,Home,D,1,1,Everton,...,5,0,0.333333,2.666667,7.000000,1.333333,15.600000,0.000000,0.000000,0.000000


In [None]:
from sklearn.metrics import precision_score

In [None]:
def make_predictions(data, predictions, cutoff='2022-05-22'):
    """Fit the shared random forest on older matches and score it on newer ones.

    Parameters
    ----------
    data : DataFrame
        Match rows with a 'Date' column, a 'target' column and every column
        named in ``predictions``.
    predictions : list of str
        Names of the predictor (feature) columns to train and predict on.
        The parameter keeps its original name for backward compatibility.
    cutoff : str or datetime-like, default '2022-05-22'
        Rows dated on or before ``cutoff`` form the training set; rows dated
        after it form the test set.

    Returns
    -------
    combined : DataFrame
        'actual' and 'prediction' columns, indexed like the test rows of ``data``.
    precision : float
        Precision of the predicted wins on the test rows.

    Notes
    -----
    Uses the module-level ``rf`` estimator and refits it in place.
    """
    # Split the frame that was passed in (not the global ``matches``), so the
    # caller's extra feature columns are actually available. The two splits
    # share one cutoff and do not overlap, so no training row is evaluated.
    train = data[data['Date'] <= cutoff]
    test = data[data['Date'] > cutoff]
    rf.fit(train[predictions], train['target'])
    preds = rf.predict(test[predictions])
    combined = pd.DataFrame(dict(actual=test['target'], prediction=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [None]:
# Pass the rolling-average frame and the extended predictor list
# (base predictors plus the rolling columns) to make_predictions.
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [None]:
# Show the precision score returned by make_predictions.
precision

0.5409836065573771

In [None]:
# Show actual outcomes next to the predictions.
combined

Unnamed: 0,actual,prediction
1,1,0
2,1,0
3,0,1
4,1,0
5,1,1
...,...,...
38,0,0
39,0,0
40,0,0
41,0,0


In [None]:
# Attach match details by joining the two frames on their row index.
# NOTE(review): rows pair up correctly only if `combined` carries the same
# index labels as matches_rolling, so check the result for duplicated rows.
combined = combined.merge(matches_rolling[['Date', 'Team', 'Opponent', 'Result']], left_index=True, right_index=True)

In [None]:
# Inspect the predictions together with the attached match details.
combined

Unnamed: 0,actual,prediction,Date,Team,Opponent,Result
0,1,0,2021-09-11,Arsenal,Norwich City,W
0,0,0,2021-09-11,Arsenal,Norwich City,W
0,1,1,2021-09-11,Arsenal,Norwich City,W
0,1,0,2021-09-11,Arsenal,Norwich City,W
0,0,0,2021-09-11,Arsenal,Norwich City,W
...,...,...,...,...,...,...
59,0,0,2023-03-01,Arsenal,Everton,W
60,1,1,2023-03-04,Arsenal,Bournemouth,W
60,1,0,2023-03-04,Arsenal,Bournemouth,W
60,1,1,2023-03-04,Arsenal,Bournemouth,W


In [None]:
class MissingDict(dict):
    """dict that returns the looked-up key itself when the key is absent.

    Used as a name-translation table: teams with an entry are renamed, all
    others pass through unchanged instead of raising KeyError. A missing key
    is returned, not stored.
    """

    def __missing__(self, key):
        return key


# Full team name (as in the 'Team' column) -> short name used in the
# 'Opponent' column. Teams whose two spellings already agree need no entry.
map_values = {
    'Brighton and Hove Albion': 'Brighton',
    'Manchester United': 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    # The Opponent column abbreviates this club as "Nott'ham Forest"; without
    # this entry its rows can never match when joining Team against Opponent.
    'Nottingham Forest': "Nott'ham Forest",
    'Tottenham Hotspur': 'Tottenham',
    'West Ham United': 'West Ham',
    'Wolverhampton Wanderers': 'Wolves'
}
mapping = MissingDict(**map_values)


In [None]:
# Quick check that a mapped name is translated.
mapping['West Ham United']

'West Ham'

In [None]:
# Translate full team names to the short form used in the Opponent column;
# names without an entry in `mapping` pass through unchanged.
combined['new_team'] = combined['Team'].map(mapping)

**Compare the predictions vs match results**

In [None]:
# Inspect the frame with the normalised team-name column added.
combined

Unnamed: 0,actual,prediction,Date,Team,Opponent,Result,new_team
0,1,0,2021-09-11,Arsenal,Norwich City,W,Arsenal
0,0,0,2021-09-11,Arsenal,Norwich City,W,Arsenal
0,1,1,2021-09-11,Arsenal,Norwich City,W,Arsenal
0,1,0,2021-09-11,Arsenal,Norwich City,W,Arsenal
0,0,0,2021-09-11,Arsenal,Norwich City,W,Arsenal
...,...,...,...,...,...,...,...
59,0,0,2023-03-01,Arsenal,Everton,W,Arsenal
60,1,1,2023-03-04,Arsenal,Bournemouth,W,Arsenal
60,1,0,2023-03-04,Arsenal,Bournemouth,W,Arsenal
60,1,1,2023-03-04,Arsenal,Bournemouth,W,Arsenal


In [None]:
# Self-join: pair each row with the row for the same date whose Opponent
# equals this row's normalised team name (the other side of the same match).
merged = combined.merge(
    right=combined,
    left_on=['Date', 'new_team'],
    right_on=['Date', 'Opponent'],
)


In [None]:
# Check how many rows found a partner in the self-join (0 rows means none matched).
merged.shape

(0, 13)