In [1]:
import soccerdata as sd
import pandas as pd
import numpy as np

In [2]:
# Scraper handle limited to a single league and a single season; the printed
# docstring lists the parameters the constructor accepts.
fbref = sd.FBref(
    seasons=2022,
    leagues="ENG-Premier League",
)
print(fbref.__doc__)

Provides pd.DataFrames from data at http://fbref.com.

    Data will be downloaded as necessary and cached locally in
    ``~/soccerdata/data/FBref``.

    Parameters
    ----------
    leagues : string or iterable, optional
        IDs of leagues to include. For efficiently reading data from the Top-5
        European leagues, use "Big 5 European Leagues Combined".
    seasons : string, int or list, optional
        Seasons to include. Supports multiple formats.
        Examples: '16-17'; 2016; '2016-17'; [14, 15, 16]
    proxy : 'tor' or dict or list(dict) or callable, optional
        Use a proxy to hide your IP address. Valid options are:
            - "tor": Uses the Tor network. Tor should be running in
              the background on port 9050.
            - dict: A dictionary with the proxy to use. The dict should be
              a mapping of supported protocols to proxy addresses. For example::

                  {
                      'http': 'http://10.10.1.10:3128',
     

In [3]:
# Fetch the schedule for the league/season configured on `fbref` and preview
# the first rows.
epl_schedule = fbref.read_schedule()
epl_schedule.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,week,day,date,time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee,match_report,notes,game_id
league,season,game,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,/en/matches/e62f6e78/Crystal-Palace-Arsenal-Au...,,e62f6e78
ENG-Premier League,2223,2022-08-06 Bournemouth-Aston Villa,1.0,Sat,2022-08-06,15:00,Bournemouth,0.6,2–0,0.7,Aston Villa,11013.0,Vitality Stadium,Peter Bankes,/en/matches/877e3193/Bournemouth-Aston-Villa-A...,,877e3193
ENG-Premier League,2223,2022-08-06 Everton-Chelsea,1.0,Sat,2022-08-06,17:30,Everton,0.7,0–1,1.5,Chelsea,39254.0,Goodison Park,Craig Pawson,/en/matches/3a917cee/Everton-Chelsea-August-6-...,,3a917cee
ENG-Premier League,2223,2022-08-06 Fulham-Liverpool,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley,/en/matches/6713c1dc/Fulham-Liverpool-August-6...,,6713c1dc
ENG-Premier League,2223,2022-08-06 Leeds United-Wolves,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones,/en/matches/82702941/Leeds-United-Wolverhampto...,,82702941


In [5]:
# Trial request for one match only; this id is the game_id of the
# Bournemouth-Aston Villa row shown in the schedule preview.
single_match_id = "877e3193"
player_match_stats = fbref.read_player_match_stats(
    match_id=single_match_id,
    stat_type="misc",
)
player_match_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,#,Nation,Pos,Age,Min,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Aerial Duels,Aerial Duels,Aerial Duels,game_id
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,CrdY,CrdR,2CrdY,Fls,Fld,...,Int,TklW,PKwon,PKcon,OG,Recov,Won,Lost,Won%,Unnamed: 25_level_1
league,season,game,team,player,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
ENG-Premier League,2223,2022-08-06 Bournemouth-Aston Villa,Aston Villa,Boubacar Kamara,44.0,fr FRA,"CM,DM",22-256,81,0,0,0,5,2,...,0,1,0,0,0,10,3,1,75.0,877e3193
ENG-Premier League,2223,2022-08-06 Bournemouth-Aston Villa,Aston Villa,Cameron Archer,35.0,eng ENG,AM,21-016,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,877e3193
ENG-Premier League,2223,2022-08-06 Bournemouth-Aston Villa,Aston Villa,Danny Ings,9.0,eng ENG,FW,30-014,65,1,0,0,1,0,...,0,0,0,0,0,1,1,2,33.3,877e3193
ENG-Premier League,2223,2022-08-06 Bournemouth-Aston Villa,Aston Villa,Diego Carlos,3.0,br BRA,CB,29-144,90,0,0,0,1,0,...,0,0,0,0,0,8,0,3,0.0,877e3193
ENG-Premier League,2223,2022-08-06 Bournemouth-Aston Villa,Aston Villa,Douglas Luiz,6.0,br BRA,DM,24-089,9,1,0,0,1,0,...,0,0,0,0,0,0,0,0,,877e3193


In [8]:
# Same "misc" stat table, this time without a match_id filter; this rebinds
# `player_match_stats`, replacing the single-match frame loaded earlier.
player_match_stats = fbref.read_player_match_stats(stat_type="misc")
player_match_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,#,Nation,Pos,Age,Min,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Aerial Duels,Aerial Duels,Aerial Duels,game_id
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,CrdY,CrdR,2CrdY,Fls,Fld,...,Int,TklW,PKwon,PKcon,OG,Recov,Won,Lost,Won%,Unnamed: 25_level_1
league,season,game,team,player,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Aaron Ramsdale,1.0,eng ENG,GK,24-083,90,0,0,0,0,0,...,0,0,0,0,0,1,1,0,100.0,e62f6e78
ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Albert Sambi Lokonga,23.0,be BEL,"CM,RW",22-287,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,,e62f6e78
ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Ben White,4.0,eng ENG,RB,24-301,90,1,0,0,2,3,...,1,5,0,0,0,6,0,0,,e62f6e78
ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Bukayo Saka,7.0,eng ENG,RW,20-334,90,0,0,0,2,1,...,0,1,0,0,0,4,0,1,0.0,e62f6e78
ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Eddie Nketiah,14.0,eng ENG,FW,23-067,8,0,0,0,1,0,...,0,0,0,0,0,1,0,0,,e62f6e78


In [9]:
# Write the player stats to CSV; a later cell reads this file back with
# pd.read_csv.
player_match_stats.to_csv('PlayerBookings.csv') 

In [4]:
# Write the schedule to CSV; a later cell reads this file back with
# pd.read_csv.
epl_schedule.to_csv("MatchSchedule.csv")

In [2]:
# Reload the player stats from disk.
# NOTE(review): the frame written to PlayerBookings.csv displayed three
# column-header levels, while the columns read back here are single-level
# (e.g. "Aerial Duels Won") — the file may have been edited between runs;
# confirm before relying on Restart & Run All.
pms = pd.read_csv("PlayerBookings.csv")
pms.head()

Unnamed: 0,league,season,game,team,player,#,Nation,Pos,Age,Min,...,Int,TklW,PKwon,PKcon,OG,Recov,Aerial Duels Won,Aerial Duels Lost,Aerial Duels Won %,game_id
0,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Aaron Ramsdale,1,eng ENG,GK,24-083,90,...,0,0,0,0,0,1,1,0,100.0,e62f6e78
1,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Albert Sambi Lokonga,23,be BEL,"CM,RW",22-287,1,...,0,0,0,0,0,1,0,0,,e62f6e78
2,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Ben White,4,eng ENG,RB,24-301,90,...,1,5,0,0,0,6,0,0,,e62f6e78
3,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Bukayo Saka,7,eng ENG,RW,20-334,90,...,0,1,0,0,0,4,0,1,0.0,e62f6e78
4,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Eddie Nketiah,14,eng ENG,FW,23-067,8,...,0,0,0,0,0,1,0,0,,e62f6e78


In [3]:
# Inspect the dtype pandas inferred for each reloaded player-stat column.
pms.dtypes

league                 object
season                  int64
game                   object
team                   object
player                 object
#                       int64
Nation                 object
Pos                    object
Age                    object
Min                     int64
CrdY                    int64
CrdR                    int64
2CrdY                   int64
Fls                     int64
Fld                     int64
Off                     int64
Crs                     int64
Int                     int64
TklW                    int64
PKwon                   int64
PKcon                   int64
OG                      int64
Recov                   int64
Aerial Duels Won        int64
Aerial Duels Lost       int64
Aerial Duels Won %    float64
game_id                object
dtype: object

In [13]:
# Reload the schedule from disk and inspect the inferred dtypes; note that
# `date` and `time` come back as plain object (string) columns.
epl_schedule = pd.read_csv("MatchSchedule.csv")
epl_schedule.dtypes

league           object
season            int64
game             object
week            float64
day              object
date             object
time             object
home_team        object
home_xg         float64
score            object
away_xg         float64
away_team        object
attendance      float64
venue            object
referee          object
match_report     object
notes            object
game_id          object
dtype: object

In [14]:
# Preview the reloaded schedule.
epl_schedule.head()

Unnamed: 0,league,season,game,week,day,date,time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee,match_report,notes,game_id
0,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,/en/matches/e62f6e78/Crystal-Palace-Arsenal-Au...,,e62f6e78
1,ENG-Premier League,2223,2022-08-06 Bournemouth-Aston Villa,1.0,Sat,2022-08-06,15:00,Bournemouth,0.6,2–0,0.7,Aston Villa,11013.0,Vitality Stadium,Peter Bankes,/en/matches/877e3193/Bournemouth-Aston-Villa-A...,,877e3193
2,ENG-Premier League,2223,2022-08-06 Everton-Chelsea,1.0,Sat,2022-08-06,17:30,Everton,0.7,0–1,1.5,Chelsea,39254.0,Goodison Park,Craig Pawson,/en/matches/3a917cee/Everton-Chelsea-August-6-...,,3a917cee
3,ENG-Premier League,2223,2022-08-06 Fulham-Liverpool,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley,/en/matches/6713c1dc/Fulham-Liverpool-August-6...,,6713c1dc
4,ENG-Premier League,2223,2022-08-06 Leeds United-Wolves,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones,/en/matches/82702941/Leeds-United-Wolverhampto...,,82702941


In [15]:
# Remove schedule columns that are not carried into the merge with the
# player stats. The drop is in place, so re-running this cell without
# reloading the frame raises KeyError.
schedule_cols_to_drop = ['league', 'season', 'game', 'week', 'match_report', 'notes']
epl_schedule.drop(columns=schedule_cols_to_drop, inplace=True)
epl_schedule.head()

Unnamed: 0,day,date,time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee,game_id
0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,e62f6e78
1,Sat,2022-08-06,15:00,Bournemouth,0.6,2–0,0.7,Aston Villa,11013.0,Vitality Stadium,Peter Bankes,877e3193
2,Sat,2022-08-06,17:30,Everton,0.7,0–1,1.5,Chelsea,39254.0,Goodison Park,Craig Pawson,3a917cee
3,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley,6713c1dc
4,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones,82702941


In [18]:
# Attach the match-level schedule columns to every player row. A left join
# on game_id keeps every player row, including any without a schedule match.
merge = pms.merge(epl_schedule, how='left', on='game_id')
merge

Unnamed: 0,league,season,game,team,player,#,Nation,Pos,Age,Min,...,date,time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee
0,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Aaron Ramsdale,1,eng ENG,GK,24-083,90,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
1,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Albert Sambi Lokonga,23,be BEL,"CM,RW",22-287,1,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
2,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Ben White,4,eng ENG,RB,24-301,90,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
3,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Bukayo Saka,7,eng ENG,RW,20-334,90,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
4,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Eddie Nketiah,14,eng ENG,FW,23-067,8,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5613,ENG-Premier League,2223,2023-01-19 Manchester City-Tottenham,Tottenham Hotspur,Richarlison,9,br BRA,AM,25-254,13,...,2023-01-19,20:00,Manchester City,2.3,4–2,0.8,Tottenham,53088.0,Etihad Stadium,Simon Hooper
5614,ENG-Premier League,2223,2023-01-19 Manchester City-Tottenham,Tottenham Hotspur,Rodrigo Bentancur,30,uy URU,CM,25-208,74,...,2023-01-19,20:00,Manchester City,2.3,4–2,0.8,Tottenham,53088.0,Etihad Stadium,Simon Hooper
5615,ENG-Premier League,2223,2023-01-19 Manchester City-Tottenham,Tottenham Hotspur,Ryan Sessegnon,19,eng ENG,WB,22-246,22,...,2023-01-19,20:00,Manchester City,2.3,4–2,0.8,Tottenham,53088.0,Etihad Stadium,Simon Hooper
5616,ENG-Premier League,2223,2023-01-19 Manchester City-Tottenham,Tottenham Hotspur,Son Heung-min,7,kr KOR,AM,30-195,90,...,2023-01-19,20:00,Manchester City,2.3,4–2,0.8,Tottenham,53088.0,Etihad Stadium,Simon Hooper


In [21]:
# Preview the merged player + schedule frame.
merge.head()

Unnamed: 0,league,season,game,team,player,#,Nation,Pos,Age,Min,...,date,time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee
0,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Aaron Ramsdale,1,eng ENG,GK,24-083,90,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
1,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Albert Sambi Lokonga,23,be BEL,"CM,RW",22-287,1,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
2,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Ben White,4,eng ENG,RB,24-301,90,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
3,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Bukayo Saka,7,eng ENG,RW,20-334,90,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
4,ENG-Premier League,2223,2022-08-05 Crystal Palace-Arsenal,Arsenal,Eddie Nketiah,14,eng ENG,FW,23-067,8,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor


In [22]:
# Inspect the dtypes of the merged frame (player columns followed by
# schedule columns).
merge.dtypes


league                 object
season                  int64
game                   object
team                   object
player                 object
#                       int64
Nation                 object
Pos                    object
Age                    object
Min                     int64
CrdY                    int64
CrdR                    int64
2CrdY                   int64
Fls                     int64
Fld                     int64
Off                     int64
Crs                     int64
Int                     int64
TklW                    int64
PKwon                   int64
PKcon                   int64
OG                      int64
Recov                   int64
Aerial Duels Won        int64
Aerial Duels Lost       int64
Aerial Duels Won %    float64
game_id                object
day                    object
date                   object
time                   object
home_team              object
home_xg               float64
score                  object
away_xg   

In [23]:
# Discard identifier and stat columns that are not used further on. The drop
# is in place, so re-running this cell on the already-trimmed frame raises
# KeyError.
unused_cols = ['league', 'season', 'game', '#', 'Off', 'Crs', 'Int', 'PKwon', 'OG', 'Recov']
merge.drop(columns=unused_cols, inplace=True)
merge.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,date,time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor


In [24]:
# Work on an independent copy: a plain `matches = merge` only binds a second
# name to the same DataFrame, so every column added to `matches` below would
# also appear on `merge` and make its earlier displayed output stale.
matches = merge.copy()
matches.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,date,time,home_team,home_xg,score,away_xg,away_team,attendance,venue,referee
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor


In [25]:
# Parse the string `date` column into datetime64 so it can be compared
# against date cut-offs and used with the `.dt` accessor later on.
matches['date'] = pd.to_datetime(matches['date'])

In [26]:
# Confirm that `date` is now datetime64[ns].
matches.dtypes

team                          object
player                        object
Nation                        object
Pos                           object
Age                           object
Min                            int64
CrdY                           int64
CrdR                           int64
2CrdY                          int64
Fls                            int64
Fld                            int64
TklW                           int64
PKcon                          int64
Aerial Duels Won               int64
Aerial Duels Lost              int64
Aerial Duels Won %           float64
game_id                       object
day                           object
date                  datetime64[ns]
time                          object
home_team                     object
home_xg                      float64
score                         object
away_xg                      float64
away_team                     object
attendance                   float64
venue                         object
r

In [34]:
# Integer-encode the three club-name columns. Each column is encoded on its
# own, so a code is only meaningful relative to the labels of its source
# column.
for source_col, code_col in (
    ("home_team", "home_code"),
    ("away_team", "away_code"),
    ("team", "team_code"),
):
    matches[code_col] = matches[source_col].astype("category").cat.codes
matches.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,score,away_xg,away_team,attendance,venue,referee,venue_code,home_code,away_code,team_code
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0


In [37]:
# `matches1` is bound to the same DataFrame object as `matches`; no copy is
# made, so the new column is visible through both names.
matches1 = matches

# Flag is 1 when the row's team code equals the home-side code, else 0.
# NOTE(review): team_code and home_code come from separate categorical
# encodings of different columns (`team` vs `home_team`, whose labels differ,
# e.g. "Tottenham Hotspur" vs "Tottenham"); equal codes identify the same
# club only if both label sets sort in the same order — confirm.
plays_at_home = matches1['team_code'].eq(matches1['home_code'])
matches1["homeoraway_code"] = np.where(plays_at_home, 1, 0)
matches1.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,away_xg,away_team,attendance,venue,referee,venue_code,home_code,away_code,team_code,homeoraway_code
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0


In [38]:
# Checkpoint the frame (including the encoded columns added so far) to disk.
matches1.to_csv("Matches1.csv")

In [42]:
# Drop rows that have no kick-off time, then take an explicit copy so the
# column assignment below writes to an independent frame instead of a
# filtered slice (avoids pandas' SettingWithCopyWarning). `subset` is passed
# as a list, which every pandas version accepts.
matches1 = matches1.dropna(axis=0, subset=["time"]).copy()

# Keep only the hour part of "HH:MM" by stripping everything from the colon on.
matches1["hour"] = matches1["time"].str.replace(":.+", "", regex=True).astype("int")
matches1.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,attendance,venue,referee,venue_code,home_code,away_code,team_code,homeoraway_code,hour,day_code
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4


In [43]:
# From here on `matches` refers to the frame that has the `hour` column.
matches = matches1

# Numeric weekday of the match date (the Friday 2022-08-05 rows come out as 4).
matches["day_code"] = matches["date"].dt.weekday
matches.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,attendance,venue,referee,venue_code,home_code,away_code,team_code,homeoraway_code,hour,day_code
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,25286.0,Selhurst Park,Anthony Taylor,Crystal Palace,6,0,0,0,20,4


In [44]:
# Integer-encode player, referee and nationality labels, one categorical
# encoding per column.
for label_col, code_col in (
    ("player", "player_id_code"),
    ("referee", "ref_code"),
    ("Nation", "nation_code"),
):
    matches[code_col] = matches[label_col].astype("category").cat.codes
matches.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,venue_code,home_code,away_code,team_code,homeoraway_code,hour,day_code,player_id_code,ref_code,nation_code
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,Crystal Palace,6,0,0,0,20,4,2,2,18
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,Crystal Palace,6,0,0,0,20,4,12,2,5
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,Crystal Palace,6,0,0,0,20,4,46,2,18
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,Crystal Palace,6,0,0,0,20,4,62,2,18
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,Crystal Palace,6,0,0,0,20,4,132,2,18


In [45]:
# Re-check the dtypes after adding the encoded feature columns.
matches.dtypes

team                          object
player                        object
Nation                        object
Pos                           object
Age                           object
Min                            int64
CrdY                           int64
CrdR                           int64
2CrdY                          int64
Fls                            int64
Fld                            int64
TklW                           int64
PKcon                          int64
Aerial Duels Won               int64
Aerial Duels Lost              int64
Aerial Duels Won %           float64
game_id                       object
day                           object
date                  datetime64[ns]
time                          object
home_team                     object
home_xg                      float64
score                         object
away_xg                      float64
away_team                     object
attendance                   float64
venue                         object
r

In [52]:
# Binary label: 1 if the player received any card in the match, else 0.
# Counts are tested with `> 0` rather than `== 1`, so a row whose card count
# is above one in any of these columns is still labelled as booked.
card_columns = ['CrdY', 'CrdR', '2CrdY']
was_booked = matches[card_columns].gt(0).any(axis=1)
matches['target'] = np.where(was_booked, 1, 0)
matches.head()

Unnamed: 0,team,player,Nation,Pos,Age,Min,CrdY,CrdR,2CrdY,Fls,...,home_code,away_code,team_code,homeoraway_code,hour,day_code,player_id_code,ref_code,nation_code,target
0,Arsenal,Aaron Ramsdale,eng ENG,GK,24-083,90,0,0,0,0,...,6,0,0,0,20,4,2,2,18,0
1,Arsenal,Albert Sambi Lokonga,be BEL,"CM,RW",22-287,1,0,0,0,0,...,6,0,0,0,20,4,12,2,5,0
2,Arsenal,Ben White,eng ENG,RB,24-301,90,1,0,0,2,...,6,0,0,0,20,4,46,2,18,1
3,Arsenal,Bukayo Saka,eng ENG,RW,20-334,90,0,0,0,2,...,6,0,0,0,20,4,62,2,18,0
4,Arsenal,Eddie Nketiah,eng ENG,FW,23-067,8,0,0,0,1,...,6,0,0,0,20,4,132,2,18,0


In [55]:
# Save the frame with all encoded features and the `target` label.
matches.to_csv("MatchesReady.csv")

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
# Random forest with a fixed seed so repeated runs build the same trees.
rf = RandomForestClassifier(
    random_state=42,
    min_samples_split=10,
    n_estimators=100,
)

In [56]:
# Time-based split: rows dated strictly before the cut-off form the training set.
train = matches.loc[matches["date"] < "2022-12-25"]

In [57]:
# Evaluation rows: everything from the cut-off date onwards. `>=` is used so
# that, together with the strict `<` of the training split, rows dated
# exactly 2022-12-25 are not silently left out of both sets.
test = matches[matches["date"] >= "2022-12-25"]

In [58]:
# Feature columns fed to the classifier, in the order they are passed.
predictors = [
    "home_code",
    "away_code",
    "team_code",
    "homeoraway_code",
    "hour",
    "day_code",
    "player_id_code",
    "ref_code",
    "nation_code",
]

In [59]:
# Fit on the pre-cut-off rows; `fit` returns the estimator, which the
# notebook displays as the cell output.
X_train, y_train = train[predictors], train["target"]
rf.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=10, random_state=42)

In [60]:
# Predict the booking label for every test row using the same feature columns.
preds = rf.predict(test[predictors])

In [61]:
from sklearn.metrics import accuracy_score

In [63]:
# Share of test rows whose predicted label matches the actual one.
# Read this number with care: the crosstab at the end of the notebook has
# only a prediction=0 column, so the recorded 0.8697 is simply the share of
# un-booked rows (1121 of 1289), not evidence that bookings are detected.
acc = accuracy_score(y_true=test["target"], y_pred=preds)
acc

0.86966640806827

In [65]:
# Put actual labels and predictions side by side; `preds` is taken in the
# row order of `test`.
combined = pd.DataFrame({"actual": test["target"], "prediction": preds})

In [66]:
# Confusion table: rows are actual labels, columns are predicted labels.
# A label the model never predicts gets no column at all.
pd.crosstab(combined["actual"], combined["prediction"])

prediction,0
actual,Unnamed: 1_level_1
0,1121
1,168
