In [4]:
import pandas as pd
# Load the match log; the first CSV column is used as the row index.
matches = pd.read_csv("matches_long.csv", index_col=0)

# Sanity check on size: 38 * 20 * 2 = 1520 rows are expected, which matches the
# row count shown below. Games per team can be checked with
# matches["team"].value_counts().
matches.shape

(1520, 27)

In [None]:
# Data cleaning: the goal is to end up with numeric columns only, so start by
# inspecting what type each column currently has.

matches.dtypes  # dtype of every column

In [None]:
# Convert the "date" column to datetimes so later cells can compare dates and
# use the .dt accessor.
matches = matches.assign(date=pd.to_datetime(matches["date"]))
matches.dtypes

In [19]:
# Derive numeric versions of the categorical / text columns and add them as new
# columns next to the originals:
#   venue_code - category code of "venue" (0 = Away, 1 = Home in the output below)
#   opp_code   - category code of "opponent"
#   hour       - kick-off hour, i.e. "time" with everything from the colon onward removed
#   day_code   - day of the week taken from "date" (Monday = 0)
#   target     - 1 when the result is a win ("W"), otherwise 0
matches = matches.assign(
    venue_code=matches["venue"].astype("category").cat.codes,
    opp_code=matches["opponent"].astype("category").cat.codes,
    hour=matches["time"].str.replace(":.+", "", regex=True).astype("int"),
    day_code=matches["date"].dt.dayofweek,
    target=(matches["result"] == "W").astype("int"),
)

matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,0.0,0,0,2024,Manchester City,0,5,20,4,1
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,0.0,0,0,2024,Manchester City,1,16,20,5,1
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,2.0,0,1,2024,Manchester City,0,18,14,6,1
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,0.0,1,1,2024,Manchester City,1,9,15,5,1
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,1.0,0,0,2024,Manchester City,0,21,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2023-04-30,14:00,Premier League,Matchweek 34,Sun,Away,L,1,3,Newcastle Utd,...,0.0,0,0,2023,Southampton,0,16,14,6,0
43,2023-05-08,20:00,Premier League,Matchweek 35,Mon,Away,L,3,4,Nott'ham Forest,...,0.0,1,1,2023,Southampton,0,17,20,0,0
44,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Home,L,0,2,Fulham,...,0.0,0,0,2023,Southampton,1,9,15,5,0
45,2023-05-21,14:00,Premier League,Matchweek 37,Sun,Away,L,1,3,Brighton,...,1.0,0,0,2023,Southampton,0,4,14,6,0


In [20]:
# Random forest classifier: a tree ensemble, good at picking up non-linearities.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# n_estimators is the number of trees. A higher min_samples_split makes
# overfitting less likely, at some cost in accuracy. random_state fixes the
# seed so reruns build the same model.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# Time-ordered data, so the training set must be the older matches.
# The test mask uses >= so matches played on the cut-off date itself are kept:
# the previous "<" / ">" pair left them out of both train and test.
split_date = pd.Timestamp("2024-01-01")
train = matches[matches["date"] < split_date]
test = matches[matches["date"] >= split_date]

predictors = ["venue_code", "opp_code", "hour", "day_code"]  # features used by the model
rf.fit(train[predictors], train["target"])  # learn target from the predictors
preds = rf.predict(test[predictors])

acc = accuracy_score(test["target"], preds)
acc  # an earlier run (before the split fix) reported about 62%

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Put the true labels and the predictions side by side, then cross-tabulate
# them into an actual-vs-predicted table.
combined = pd.DataFrame({"actual": test["target"], "predicted": preds})
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

In [None]:
from sklearn.metrics import precision_score

# Precision of the predictions against the true labels.
# An earlier run gave 49.6%; the aim is roughly 75%.
precision_score(y_true=test["target"], y_pred=preds)

In [None]:
# Improve results by using rolling averages.
# Start by pulling out one team's rows to use as a sample group.

grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Manchester City")
group

In [None]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of ``cols`` over the preceding matches to ``group``.

    The rows are sorted by ``"date"``. For each row, every column in ``cols``
    is averaged over the ``window`` rows immediately before it.
    ``closed="left"`` leaves the current row out of its own window, so only
    earlier matches contribute and no future information is used. The window
    counts rows (matches), not calendar time.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process (one team's matches when used with ``groupby``). Must
        contain a ``"date"`` column and every column in ``cols``. The frame
        passed in is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the rolling-average columns, in the same order as ``cols``.
    window : int, default 3
        Number of preceding rows included in each average.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``group`` with ``new_cols`` added. Rows with a
        missing value in any of ``new_cols`` are dropped, which removes the
        earliest rows that do not yet have ``window`` prior matches.
    """
    group = group.sort_values("date")  # returns a new frame, so the caller's data is untouched
    rolling_stats = group[cols].rolling(window, closed="left").mean()
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)

In [None]:
# Detailed per-match statistic columns that each get a rolling-average feature.
cols = [
    "gf", "ga", "xg", "xga", "poss",
    "sh", "sot", "dist", "fk", "pk", "pkatt",
    "totdist", "prgdist", "err", "touches",
    "def 3rd", "att 3rd",
]
# Each rolling-average column is named after its source column plus "_rolling".
new_cols = [stat + "_rolling" for stat in cols]

# To try the helper on the single-team sample only:
# rolling_averages(group, cols, new_cols)

In [None]:
# Apply the rolling-average helper to each team's matches separately.
matches_rolling = (
    matches
    .groupby("team")
    .apply(lambda team_games: rolling_averages(team_games, cols, new_cols))
)
matches_rolling