In [14]:
import glob
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Folder holding one CSV per season, with file names like "EPL_2019_2020.csv".
path = "data/epl/"

# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of the combined frame identical from run to run.
files = sorted(glob.glob(path + "*.csv"))
if not files:
    # Fail here with an actionable message instead of letting pd.concat raise
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Load every season and tag its rows with a "Season" label taken from the file name.
dfs = []
for f in files:
    season_df = pd.read_csv(f)
    # Parse the base name only, so directory components (and any underscores
    # in them) cannot leak into the label: "EPL_2019_2020.csv" -> "2019/2020".
    stem = os.path.splitext(os.path.basename(f))[0]
    name_parts = stem.split("_")
    if len(name_parts) < 2:
        raise ValueError(
            f"Cannot derive a season from file name {f!r}; "
            "expected something like 'EPL_2019_2020.csv'"
        )
    season_df["Season"] = name_parts[-2] + "/" + name_parts[-1]
    dfs.append(season_df)

# Stack all seasons into one frame with a fresh 0..n-1 index.
epl_data = pd.concat(dfs, ignore_index=True)

print(epl_data.shape)
epl_data.head()

In [None]:
# Load the match dataset and turn the date_GMT text column into real datetimes
# (the format string matches values shaped like "Aug 05 2022 - 08:00pm").
matches_csv = 'Training_DatasetV3.csv'
kickoff_format = '%b %d %Y - %I:%M%p'

df = pd.read_csv(matches_csv)
df = df.assign(date_GMT=pd.to_datetime(df['date_GMT'], format=kickoff_format))
df

In [None]:
# Encode both team columns against ONE shared, sorted list of team names.
# Encoding each column on its own would give the same club different integers
# in home_team_code and away_team_code whenever the two columns do not contain
# exactly the same set of names.
team_names = sorted(
    set(df["home_team_name"].dropna()) | set(df["away_team_name"].dropna())
)
team_dtype = pd.CategoricalDtype(categories=team_names)
df["home_team_code"] = df["home_team_name"].astype(team_dtype).cat.codes
df["away_team_code"] = df["away_team_name"].astype(team_dtype).cat.codes

# Integer code per stadium, and weekday of the match (Monday=0 ... Sunday=6).
df["stadium_code"] = df["stadium_name"].astype("category").cat.codes
df["day_of_week"] = df["date_GMT"].dt.dayofweek

# Numeric label for the classifier: W -> 1, L -> 0, D -> 2. Any other value of
# "Result" (including a missing one) becomes NaN.
# NOTE(review): presumably "Result" is recorded from the home team's
# perspective — confirm against the dataset description.
df["target"] = df["Result"].map({"W": 1, "L": 0, "D": 2})

# Preview only the first rows rather than dumping the whole frame.
df.head()

In [None]:
# Chronological split: fit on matches before TRAIN_END, evaluate on matches
# from TRAIN_END up to (but not including) TEST_END.
TRAIN_END = '2022-08-05'
TEST_END = '2024-04-01'

train = df[df["date_GMT"] < TRAIN_END]
test = df[(df["date_GMT"] >= TRAIN_END) & (df["date_GMT"] < TEST_END)]

# Feature columns fed to the model.
predictors = [
    "home_team_code", "away_team_code", "stadium_code", "day_of_week",
    "Game Week", "home_ppg", "away_ppg",
]

# random_state pins the forest's bootstrap sampling and feature sub-sampling,
# so the accuracy and every downstream prediction are identical on each
# Restart & Run All.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=42)
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])


In [None]:
# Share of test matches whose outcome code was predicted correctly.
# Despite its name, `error` holds the accuracy (higher is better), not an error rate.
error = accuracy_score(y_true=test["target"], y_pred=preds)
print(error)

In [None]:
# Confusion table: one row per actual outcome code, one column per predicted code.
outcome_pairs = {"actual": test["target"], "predicted": preds}
combined = pd.DataFrame(outcome_pairs)
pd.crosstab(combined["actual"], combined["predicted"])


In [None]:
def fixtures_involving(fixtures, team):
    """Return the rows of `fixtures` in which `team` plays at home or away."""
    plays = (fixtures['home_team_name'] == team) | (fixtures['away_team_name'] == team)
    return fixtures[plays]


# Matches from the end of the evaluation window onwards. `>=` rather than `>`,
# so a row stamped exactly 2024-04-01 00:00 is not lost between the test set
# (which stops strictly before that date) and this one.
# .copy() makes `future` an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
future = df[df["date_GMT"] >= '2024-04-01'].copy()
predictions = rf.predict(future[predictors])
future["Predictions"] = predictions

# Per-club views of the predicted fixtures.
LiverPool = fixtures_involving(future, 'Liverpool')
Manchester_City = fixtures_involving(future, 'Manchester City')
Arsenal = fixtures_involving(future, 'Arsenal')

In [None]:
def predicted_points(team_matches):
    """Convert a club's predicted outcomes into league points.

    Counts the values in the 'Predictions' column of `team_matches` and awards
    3 points per prediction coded 1 and 1 point per prediction coded 2; every
    other code scores nothing. An outcome that never occurs contributes 0.

    NOTE(review): every fixture is scored as if the prediction were from this
    club's own perspective. If the label actually encodes the HOME side's
    result, the club's away fixtures need win/loss swapped — confirm.
    """
    outcome_counts = team_matches['Predictions'].value_counts()
    return outcome_counts.get(1, 0) * 3 + outcome_counts.get(2, 0) * 1


# Liverpool points
Liverpool_Points_from_Remaining_Matches = predicted_points(LiverPool)
print("Liverpool_Points_from_Remaining_Matches =", Liverpool_Points_from_Remaining_Matches)

# Manchester City points
Manchester_City_Points_from_Remaining_Matches = predicted_points(Manchester_City)
print("Manchester_City_Points_from_Remaining_Matches =", Manchester_City_Points_from_Remaining_Matches)

# Arsenal points (now includes the 1 point per draw, same as the other two clubs)
Arsenal_Points_from_Remaining_Matches = predicted_points(Arsenal)
print("Arsenal_Points_from_Remaining_Matches =", Arsenal_Points_from_Remaining_Matches)

In [None]:
# Projected final totals: points predicted for the remaining fixtures plus a
# fixed per-club offset.
# NOTE(review): 67 / 64 / 65 are presumably the points each club had already
# earned at the prediction cut-off — confirm against the league table.
season_totals = (
    ("Liverpool_Points_End_of_League=", Liverpool_Points_from_Remaining_Matches, 67),
    ("Manchester_City_Points_End_of_League=", Manchester_City_Points_from_Remaining_Matches, 64),
    ("Arsenal_Points_End_of_League=", Arsenal_Points_from_Remaining_Matches, 65),
)
for label, remaining_points, base_points in season_totals:
    print(label, remaining_points + base_points)