# In [None]:
import pandas as pd

# Load the match log; the first CSV column is used as the row index.
matches = pd.read_csv("data.csv", index_col=0)
# Optional exploration: matches["Round"].value_counts()

# Inspect the column dtypes. Object (string) columns are not usable as model
# inputs directly, so the relevant ones are converted to numbers/datetimes below.
matches.dtypes

# Build every model-ready column in one ordered pass. `assign` evaluates its
# keyword arguments in order and each callable sees the columns produced by
# the earlier ones, so Day_Code reads the already-parsed Date.
matches = matches.assign(
    # Date: object -> datetime so it can be compared and decomposed.
    Date=lambda df: pd.to_datetime(df["Date"]),
    # Venue as an integer category code (lets the model account for home-field advantage).
    Venue_Code=lambda df: df["Venue"].astype("category").cat.codes,
    # Opponent as an integer category code.
    Opponent_Code=lambda df: df["Opponent"].astype("category").cat.codes,
    # Kick-off hour: strip the ":MM" part of Time and keep the hour as an int,
    # in case teams perform differently at different times of day.
    Hour=lambda df: df["Time"].str.replace(":.+", "", regex=True).astype("int"),
    # Day of the week as an integer taken from the parsed Date.
    # (Alternative: matches["Day"].astype("category").cat.codes -- encodes the
    # weekday too, but yields different numbers.)
    Day_Code=lambda df: df["Date"].dt.dayofweek,
    # Prediction target: 1 if the team won, 0 for any other result.
    Target=lambda df: (df["Result"] == "W").astype("int"),
)

from sklearn.ensemble import RandomForestClassifier

# A random forest can pick up non-linear relationships between predictors and
# target (e.g. Opponent_Code is an arbitrary label, not an ordered quantity).
#   n_estimators      -- number of decision trees in the forest
#   min_samples_split -- minimum number of samples a node must hold before it
#                        may be split; a higher value makes overfitting less
#                        likely at the cost of a looser fit to the training data
#   random_state      -- fixed seed, so the same data gives the same result
rf = RandomForestClassifier(n_estimators=1000, min_samples_split=300, random_state=1)

# Chronological split: train on the past, evaluate on later matches, so the
# score reflects predicting future games rather than re-predicting seen ones.
# The test side uses >= so matches played exactly on the cutoff date are not
# silently dropped from both sets.
# NOTE(review): make_predictions splits at "2023-07-01" while this split uses
# "2022-07-01" -- confirm which cutoff is intended.
train = matches[matches["Date"] < "2022-07-01"]
test = matches[matches["Date"] >= "2022-07-01"]

# Columns used as model inputs.
predictors = ["Venue_Code", "Opponent_Code", "Hour", "Day_Code"]

# Fit on the training rows, then predict the held-out rows.
rf.fit(train[predictors], train["Target"])
preds = rf.predict(test[predictors])

from sklearn.metrics import accuracy_score, precision_score

# Share of held-out matches whose outcome was predicted correctly.
accuracy = accuracy_score(test["Target"], preds)

# Actual vs. predicted labels side by side, then their confusion table
# (rows: actual outcome, columns: predicted outcome).
combined = pd.DataFrame({"actual": test["Target"], "prediction": preds})
pd.crosstab(combined["actual"], combined["prediction"])

# Precision: of the matches predicted as wins, the fraction that really were wins.
precision_score(test["Target"], preds)

# Group the matches by team so that per-team rolling averages can be computed.
# NOTE(review): `grouped_matches` is not referenced by the rest of the visible
# code (the rolling-average step calls matches.groupby("Team") again) --
# confirm whether this variable is still needed.
grouped_matches = matches.groupby("Team")

def rolling_avs(group, cols, new_cols, window=3):
  """Add rolling means of the previous matches to one team's rows.

  Args:
    group: DataFrame of matches (one team's rows); must contain a "Date"
      column and every column named in `cols`.
    cols: names of the columns to average.
    new_cols: names of the columns that receive the rolling means, in the
      same order as `cols`.
    window: number of preceding matches to average over (default 3).

  Returns:
    A new DataFrame sorted by "Date" with `new_cols` added. Rows that do not
    have `window` earlier matches to average over are dropped. The input
    frame is not modified.
  """
  # Chronological order, so the window really covers the *previous* matches.
  group = group.sort_values("Date")
  # closed="left" leaves the current match out of its own average, so a row
  # never sees information from the game it is meant to predict.
  rolling_stats = group[cols].rolling(window, closed="left").mean()
  group[new_cols] = rolling_stats
  # The first `window` rows have no complete history and are NaN; drop them.
  return group.dropna(subset=new_cols)

# Per-match statistics to average, and the names of the derived columns.
cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt", "xG", "xGA"]
new_cols = [f"{c}_rolling" for c in cols]

# Compute the rolling averages team by team and stack the results.
# Iterating over the groups (instead of groupby(...).apply) hands each call the
# full per-team frame, so the "Team" column is always kept in the result;
# groupby.apply operating on the grouping column is deprecated in newer pandas.
# Groups are visited in sorted team order, each already sorted by date.
matches_rolling = pd.concat(
    [rolling_avs(team_matches, cols, new_cols) for _, team_matches in matches.groupby("Team")]
)
# Replace the original (non-unique) index with a unique 0..n-1 index so that
# each row can be addressed by a single label.
matches_rolling.index = range(matches_rolling.shape[0])

def make_predictions(data, predictors, cutoff="2023-07-01"):
  """Fit the module-level random forest on past matches and score it on later ones.

  Args:
    data: DataFrame with a "Date" column, a "Target" column and every column
      named in `predictors`.
    predictors: list of column names used as model inputs.
    cutoff: date separating training rows (strictly before) from test rows
      (on or after). Defaults to "2023-07-01".

  Returns:
    (combined, precision): `combined` is a DataFrame indexed like the test
    rows with columns "actual" and "prediction"; `precision` is the precision
    score of the predictions on the test rows.

  Note:
    Refits the module-level `rf` in place.
  """
  train = data[data["Date"] < cutoff]
  # >= so matches played exactly on the cutoff date are not dropped from both sets.
  test = data[data["Date"] >= cutoff]
  rf.fit(train[predictors], train["Target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame({"actual": test["Target"], "prediction": preds}, index=test.index)
  precision = precision_score(test["Target"], preds)
  return combined, precision

# Re-run the model with the rolling-average columns added to the base predictors.
combined, precision = make_predictions(matches_rolling, [*predictors, *new_cols])

# Attach match details to each prediction by matching row index against
# matches_rolling.
combined = combined.merge(
    matches_rolling.loc[:, ["Date", "Team", "Opponent", "Result"]],
    left_index=True,
    right_index=True,
)

class MissingDict(dict):
  """A dict whose item lookup returns the key itself when the key is absent.

  Only `d[key]` is affected; `in` and `.get()` behave as for a normal dict,
  and a missing key is not inserted.
  """

  def __missing__(self, key):
    return key

# Names that appear in the Team column, mapped to the spelling used for the
# same club in the Opponent column.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
    "Nottingham Forest": "Nott'ham Forest",
}

# Lookup that returns the mapped name when there is one and the original name
# otherwise, so unmapped teams never turn into missing values.
mapping = MissingDict(map_values)

# Normalise team names so they can be matched against the Opponent column.
combined["New_Team"] = combined["Team"].map(mapping)

# Self-merge: pair each team's row (_x) with the row of its opponent in the
# same match (_y), i.e. the row whose Opponent on that Date equals New_Team.
merged = combined.merge(combined, left_on=["Date", "New_Team"], right_on=["Date", "Opponent"])

# Keep only the matches where the two sides' predictions agree with each other
# (one predicted to win, the other not) -- the model is most confident there.
win_preds = merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] == 0)]["actual_x"].value_counts()
loss_preds = merged[(merged["prediction_x"] == 0) & (merged["prediction_y"] == 1)]["actual_x"].value_counts()

# .get(key, 0): value_counts() has no entry for an outcome that never occurred,
# and a bare .get(key) would return None and break the arithmetic.
correct = loss_preds.get(0, 0) + win_preds.get(1, 0)
total = loss_preds.get(0, 0) + loss_preds.get(1, 0) + win_preds.get(0, 0) + win_preds.get(1, 0)
# Guard against an empty selection instead of dividing by zero.
accuracy = correct / total if total else float("nan")
# `accuracy` is a fraction in [0, 1]; format it as a real percentage.
print(f"Final Accuracy: {accuracy:.2%}")