<a href="https://colab.research.google.com/github/Rajie8861/PredictorMatchML/blob/main/PartML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd


In [2]:
matches = pd.read_csv("/content/matches_data/matches.csv")  # Update this path accordingly



In [3]:
matches.head()


Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
1,2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
2,3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
3,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
4,6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [5]:
matches.shape

(1389, 28)

In [6]:
38*20*2

1520

In [7]:
matches["team"].value_counts()

Unnamed: 0_level_0,count
team,Unnamed: 1_level_1
Southampton,72
Brighton and Hove Albion,72
Manchester United,72
West Ham United,72
Newcastle United,72
Burnley,71
Leeds United,71
Crystal Palace,71
Manchester City,71
Wolverhampton Wanderers,71


In [8]:
matches["round"].value_counts()

Unnamed: 0_level_0,count
round,Unnamed: 1_level_1
Matchweek 1,39
Matchweek 16,39
Matchweek 34,39
Matchweek 32,39
Matchweek 31,39
Matchweek 29,39
Matchweek 28,39
Matchweek 26,39
Matchweek 25,39
Matchweek 24,39


In [9]:
matches.dtypes

Unnamed: 0,0
Unnamed: 0,int64
date,object
time,object
comp,object
round,object
day,object
venue,object
result,object
gf,float64
ga,float64


In [10]:
matches["date"] = pd.to_datetime(matches["date"])

In [11]:
matches.dtypes

Unnamed: 0,0
Unnamed: 0,int64
date,datetime64[ns]
time,object
comp,object
round,object
day,object
venue,object
result,object
gf,float64
ga,float64


In [12]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes
#This will convert to a categorical data type in pandas, then convert into integers with cat.codes --> Convert from string -> Category -> to Numbers

In [13]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [14]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
# currently is ex 16:30 we want to replace the : and minutes with nothing. Then conver the hour into an integer.

In [15]:
matches["day_code"] = matches["date"].dt.dayofweek
#Gives a number for each day of the week --> Gets the day of week property

In [16]:
# Start to train the inital ML model
#Target did the team win or not

In [17]:
matches["target"] = (matches["result"] == "W").astype("int")
# This will look at the result column in the data and see if the team got a W for that game and then
# convert it to an integer so the ML model can use it this will turn False into a 0 --> L or D or True into a 1 --> W

In [18]:
# Train the model
from sklearn.ensemble import RandomForestClassifier
# type of ML that can pick up nonliniarities

In [19]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
# n_est --> the # of individual decision trees we want each has slightly diff params, min_sample --> # of samples we want in the leaf of the tree before split of node
# ranndom_state --> If we run the forest multiple times the results stay same

In [20]:
train = matches[matches["date"] < '2022-01-01']
# take anything before 2022

In [21]:
test = matches[matches["date"] > '2022-01-01']
# anything in 2022

In [22]:
perdictors = ['venue_code', 'opp_code', 'hour', 'day_code']
# getting perdictors that we created

In [23]:
rf.fit(train[perdictors], train["target"])
# Fit the RandomForest Model and we train it with the perdictors trying to perdict the target

In [24]:
perds = rf.predict(test[perdictors])
# can perdict now with the perdict method passing in the test data and perdictors

In [25]:
# Figure out how to determine the accuracy for the test
from sklearn.metrics import accuracy_score
# looks at the % of time was your perdiction accurate

In [26]:
acc = accuracy_score(test["target"], perds)
#

In [27]:
acc

0.6123188405797102

In [28]:
# See in which situation was acc high vs low

In [29]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=perds))
#

In [30]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,31
1,76,28


In [31]:
from sklearn.metrics import precision_score

In [32]:
precision_score(test["target"], perds)

0.4745762711864407

In [33]:
grouped_matches = matches.groupby("team")

In [34]:
group = grouped_matches.get_group("Manchester City")

In [35]:
from os import close
# takes in a group, takes in a set of col we want to compute for
# takes in a set of new cols we want to assign the rolling avg to
def rolling_averages(group, cols, new_cols):
    # Sort by acs order to see the last 3 games played by team
    group = group.sort_values("date")
    rolling_states = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_states
    group = group.dropna(subset=new_cols)
    return group
#

In [36]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [37]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [38]:
rolling_averages(group, cols, new_cols)

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
632,5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,...,5,1,2.000000,2.333333,17.333333,4.666667,18.900000,1.333333,0.333333,0.333333
633,7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,...,5,0,1.333333,2.000000,17.333333,3.666667,17.733333,0.666667,0.000000,0.000000
634,9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,...,5,1,1.000000,0.666667,16.666667,4.333333,18.233333,0.666667,0.000000,0.000000
635,11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1.0,1.0,...,6,0,1.000000,0.333333,14.333333,6.666667,18.466667,1.000000,0.000000,0.000000
636,12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0.0,2.0,...,5,0,1.000000,0.666667,12.000000,5.666667,19.366667,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,42,2022-03-14,20:00,Premier League,Matchweek 29,Mon,Away,D,0.0,0.0,...,0,0,2.333333,1.333333,19.000000,7.000000,15.366667,0.333333,0.333333,0.333333
29,44,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Away,W,2.0,0.0,...,5,1,1.666667,0.333333,18.333333,7.333333,16.000000,0.333333,0.000000,0.000000
30,46,2022-04-10,16:30,Premier League,Matchweek 32,Sun,Home,D,2.0,2.0,...,6,0,2.000000,0.333333,20.000000,6.666667,16.133333,0.333333,0.000000,0.000000
31,49,2022-04-20,20:00,Premier League,Matchweek 30,Wed,Home,W,3.0,0.0,...,2,1,1.333333,0.666667,15.666667,4.666667,16.700000,0.333333,0.000000,0.000000


In [39]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [40]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,898,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,...,6,1,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
Arsenal,899,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,...,5,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
Arsenal,900,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,...,6,0,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
Arsenal,901,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
Arsenal,902,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,...,6,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,227,32,2022-03-13,14:00,Premier League,Matchweek 29,Sun,Away,W,1.0,0.0,...,6,1,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,228,33,2022-03-18,20:00,Premier League,Matchweek 30,Fri,Home,L,2.0,3.0,...,4,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,229,34,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,1.0,...,5,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,230,35,2022-04-08,20:00,Premier League,Matchweek 32,Fri,Away,L,0.0,1.0,...,4,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [41]:
matches_rolling = matches_rolling.droplevel('team')

In [42]:
matches_rolling

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
898,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,...,6,1,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
899,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,...,5,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
900,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,...,6,0,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
901,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
902,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,...,6,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,32,2022-03-13,14:00,Premier League,Matchweek 29,Sun,Away,W,1.0,0.0,...,6,1,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
228,33,2022-03-18,20:00,Premier League,Matchweek 30,Fri,Home,L,2.0,3.0,...,4,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
229,34,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,1.0,...,5,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
230,35,2022-04-08,20:00,Premier League,Matchweek 32,Fri,Away,L,0.0,1.0,...,4,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [43]:
matches_rolling.index = range(matches_rolling.shape[0])
#unique values for each

In [44]:
matches_rolling

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,...,6,1,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
1,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,...,5,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
2,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,...,6,0,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
3,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
4,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,...,6,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,32,2022-03-13,14:00,Premier League,Matchweek 29,Sun,Away,W,1.0,0.0,...,6,1,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
1313,33,2022-03-18,20:00,Premier League,Matchweek 30,Fri,Home,L,2.0,3.0,...,4,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
1314,34,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,1.0,...,5,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
1315,35,2022-04-08,20:00,Premier League,Matchweek 32,Fri,Away,L,0.0,1.0,...,4,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [45]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [46]:
combined, precision = make_predictions(matches_rolling, perdictors + new_cols)

In [47]:
precision

0.625

In [48]:
combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

Unnamed: 0,actual,predicted,date,team,opponent,result
55,0,0,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
1312,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W
1313,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L
1314,1,0,2022-04-02,Wolverhampton Wanderers,Aston Villa,W
1315,0,0,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L
