In [2]:
import pandas as pd

In [4]:
matches = pd.read_csv("matches.csv", index_col=0)

In [4]:
matches["Team"].value_counts()

Team
penrith                91
melbourne              90
parramatta             90
south-sydney           90
brisbane               89
sydney                 89
cronulla               89
warriors               89
canberra               89
newcastle              88
north-qld              88
st-george-illawarra    87
manly                  86
gold-coast             86
canterbury             86
wests-tigers           86
dolphins               60
Name: count, dtype: int64

In [5]:
matches = matches.drop(0)

In [5]:
matches["Season"].value_counts()

Season
2023    528
2024    509
2022    436
Name: count, dtype: int64

In [6]:
matches["Season"]

0     2024
1     2024
2     2024
3     2024
4     2024
      ... 
21    2022
22    2022
23    2022
24    2022
25    2022
Name: Season, Length: 1473, dtype: int64

In [57]:
matches.dtypes

Date           object
Day            object
Competition    object
Unnamed: 3     object
Opposition     object
Unnamed: 5     object
Score          object
Score.1        object
Score.2        object
Captain(s)     object
Season          int32
Team           object
dtype: object

In [6]:
def fill_missing_months(dates):
    last_month = None
    filled_dates = []
    for date in dates:
        if len(date.split()) == 2:
            last_month = date.split()[0]
            filled_dates.append(date)
        else:
            filled_dates.append(f"{last_month} {date}")
    return filled_dates

In [7]:
matches['Date'] = matches['Date'].astype(str)

In [8]:
matches["Date"] = fill_missing_months(matches['Date'])

In [9]:
matches=matches.dropna(subset=["Day"])
matches.head()

Unnamed: 0,Date,Day,Competition,Unnamed: 3,Opposition,Unnamed: 5,Score,Score.1,Score.2,Captain(s),Season,Team
1,Feb 15,Thu 7:00pm,NPC (1),A *,Canterbury,L,12,-,24,,2024,melbourne
2,Feb 24,Sat 2:45pm,NPC (2),H *,Newcastle,W,28,-,10,H Grant,2024,melbourne
3,Mar 8,Fri 8:05pm,Round 1,H,Penrith,W,8,-,0,H Grant,2024,melbourne
4,Mar 16,Sat 7:35pm,Round 2,H,Warriors,W,30,-,26,H Grant,2024,melbourne
5,Mar 24,Sun 6:15pm,Round 3,A,Newcastle,L,12,-,14,H Grant,2024,melbourne


In [10]:
matches["Date"] = matches["Date"] + " " + matches["Season"].astype(str)

In [11]:
matches["Date"] = pd.to_datetime(matches["Date"], format='%b %d %Y')

In [12]:
matches.dtypes

Date           datetime64[ns]
Day                    object
Competition            object
Unnamed: 3             object
Opposition             object
Unnamed: 5             object
Score                  object
Score.1                object
Score.2                object
Captain(s)             object
Season                  int64
Team                   object
dtype: object

In [13]:
matches["venue_code"] = matches["Unnamed: 3"].astype("category").cat.codes

In [14]:
matches["Unnamed: 3"] = matches["Unnamed: 3"].replace("A *", "A")
matches["Unnamed: 3"] = matches["Unnamed: 3"].replace("H *", "H")

In [15]:
matches["opp_code"] = matches["Opposition"].astype("category").cat.codes

In [16]:
matches["Day"] = matches["Day"].str[3:]

In [17]:
matches["Hour"] = matches["Day"].str.replace(":.+", "", regex=True)

In [18]:
matches["Hour"] = pd.to_numeric(matches["Hour"], errors='coerce')

In [19]:
matches["Hour"] = matches["Hour"].replace("", "NAN")
matches=matches.dropna(subset=["Hour"])

In [20]:
matches=matches.dropna(subset=["Hour"])

In [21]:
unique_hours = matches["Hour"].unique()
print(unique_hours)

[7. 2. 8. 6. 5. 4. 3. 1.]


In [22]:
matches["Hour"] = matches["Hour"].astype(int)

In [23]:
matches.dtypes

Date           datetime64[ns]
Day                    object
Competition            object
Unnamed: 3             object
Opposition             object
Unnamed: 5             object
Score                  object
Score.1                object
Score.2                object
Captain(s)             object
Season                  int64
Team                   object
venue_code               int8
opp_code                 int8
Hour                    int32
dtype: object

In [24]:
matches

Unnamed: 0,Date,Day,Competition,Unnamed: 3,Opposition,Unnamed: 5,Score,Score.1,Score.2,Captain(s),Season,Team,venue_code,opp_code,Hour
1,2024-02-15,7:00pm,NPC (1),A,Canterbury,L,12,-,24,,2024,melbourne,1,2,7
2,2024-02-24,2:45pm,NPC (2),H,Newcastle,W,28,-,10,H Grant,2024,melbourne,3,8,2
3,2024-03-08,8:05pm,Round 1,H,Penrith,W,8,-,0,H Grant,2024,melbourne,2,11,8
4,2024-03-16,7:35pm,Round 2,H,Warriors,W,30,-,26,H Grant,2024,melbourne,2,16,7
5,2024-03-24,6:15pm,Round 3,A,Newcastle,L,12,-,14,H Grant,2024,melbourne,0,8,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,2022-08-07,4:05pm,Round 21,H,Newcastle,L,10,-,14,J Tamou,2022,wests-tigers,2,8,4
22,2022-08-13,5:30pm,Round 22,H,Cronulla,L,12,-,36,J Tamou,2022,wests-tigers,3,3,5
23,2022-08-20,7:35pm,Round 23,A,Sydney,L,6,-,72,J Tamou,2022,wests-tigers,0,15,7
24,2022-08-28,2:00pm,Round 24,H,St Geo Illa,L,22,-,24,A Doueihi,2022,wests-tigers,2,13,2


In [25]:
matches["Day_Code"] = matches["Date"].dt.dayofweek

In [26]:
matches.rename(columns={"Unnamed: 3": "Home", "Unnamed: 5" : "Result"}, inplace=True)

In [27]:
matches["target"] = (matches["Result"] == "W").astype("int")

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [30]:
train = matches[matches["Date"] < '2023-05-01']

In [31]:
test = matches[matches["Date"] > '2023-05-01']

In [164]:
predictors = ["venue_code", "opp_code", "Hour", "Day_Code"]

In [33]:
rf.fit(train[predictors], train["target"])

In [34]:
preds = rf.predict(test[predictors])

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
acc = accuracy_score(test["target"], preds)

In [37]:
acc

0.5369595536959554

In [38]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [39]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,239,188
1,144,146


In [40]:
from sklearn.metrics import precision_score

In [41]:
precision_score(test["target"], preds)

0.437125748502994

In [45]:
matches["Score"] = pd.to_numeric(matches["Score"], errors='coerce')
matches["Score.2"] = pd.to_numeric(matches["Score.2"], errors='coerce')

In [46]:
matches['Score'] = matches['Score'].replace("-", "pd.NA")
matches['Score.2'] = matches['Score.2'].replace("-", "pd.NA")


In [53]:
matches=matches.dropna(subset=["Score"])
matches=matches.dropna(subset=["Score.2"])

In [48]:
matches["Score"] = matches["Score"].astype(int)
matches["Score.2"] = matches["Score.2"].astype(int)

In [49]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(3,closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [50]:
cols = ["Score", "Score.2"]
new_cols = [f"{c}_rolling" for c in cols]

In [51]:
new_cols

['Score_rolling', 'Score.2_rolling']

In [55]:
grouped_matches = matches.groupby("Team")

In [56]:
group = grouped_matches.get_group("manly")

In [75]:
new_cols

['Score_rolling', 'Score.2_rolling']

In [101]:
matches["Score_rolling"] = matches_rolling["Score_rolling"]
matches["Score.2_rolling"] = matches_rolling["Score.2_rolling"]

In [59]:
group

Unnamed: 0,Date,Day,Competition,Home,Opposition,Result,Score,Score.1,Score.2,Captain(s),Season,Team,venue_code,opp_code,Hour,Day_Code,target
1,2024-02-17,3:45pm,NPC (1),A,Sydney,L,22,-,36,T Sipley,2024,manly,1,15,3,5,0
2,2024-02-24,5:55pm,NPC (2),H,Brisbane,L,14,-,40,T Sipley,2024,manly,3,0,5,5,0
3,2024-03-02,6:30pm,Round 1,H,Souths,W,36,-,24,D Cherry-Evans,2024,manly,3,12,6,5,1
4,2024-03-17,4:05pm,Round 2,H,Sydney,W,21,-,14,D Cherry-Evans,2024,manly,2,15,4,6,1
5,2024-03-24,4:05pm,Round 3,A,Parramatta,L,24,-,28,D Cherry-Evans,2024,manly,0,10,4,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,2022-08-05,7:55pm,Round 21,H,Parramatta,L,20,-,36,D Cherry-Evans,2022,manly,2,10,7,4,0
22,2022-08-14,4:05pm,Round 22,A,Gold Coast,L,24,-,44,D Cherry-Evans,2022,manly,0,5,4,6,0
23,2022-08-20,5:30pm,Round 23,H,Cronulla,L,6,-,40,D Cherry-Evans,2022,manly,2,3,5,5,0
24,2022-08-27,3:00pm,Round 24,A,Canberra,L,6,-,48,D Cherry-Evans,2022,manly,0,1,3,5,0


In [60]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,Date,Day,Competition,Home,Opposition,Result,Score,Score.1,Score.2,Captain(s),Season,Team,venue_code,opp_code,Hour,Day_Code,target,Score_rolling,Score.2_rolling
4,2022-04-02,5:30pm,Round 4,H,Canberra,W,25,-,6,D Cherry-Evans,2022,manly,3,1,5,5,1,10.333333,22.000000
5,2022-04-07,7:50pm,Round 5,A,Newcastle,W,30,-,6,D Cherry-Evans,2022,manly,0,8,7,3,1,16.666667,14.666667
6,2022-04-16,5:30pm,Round 6,H,Gold Coast,W,26,-,18,D Cherry-Evans,2022,manly,2,5,5,5,1,22.666667,8.000000
7,2022-04-21,7:50pm,Round 7,A,Cronulla,L,22,-,34,D Cherry-Evans,2022,manly,0,3,7,3,0,27.000000,10.000000
8,2022-04-29,7:55pm,Round 8,A,Souths,L,22,-,40,D Cherry-Evans,2022,manly,1,12,7,4,0,26.000000,19.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,2024-05-16,2:00pm,Round 15,H,St Geo Illa,W,30,-,14,D Cherry-Evans,2024,manly,2,13,2,3,1,23.333333,29.333333
13,2024-05-17,8:05pm,Round 11,H,Brisbane,L,12,-,13,D Cherry-Evans,2024,manly,3,0,8,4,0,25.333333,25.333333
18,2024-05-22,7:35pm,Round 16,A,Souths,L,0,-,14,L Brooks,2024,manly,0,12,7,2,0,22.000000,19.000000
14,2024-05-24,8:00pm,Round 12,H,Melbourne,W,26,-,20,D Cherry-Evans,2024,manly,2,7,8,4,1,14.000000,13.666667


In [91]:
new_cols

['Score_rolling', 'Score.2_rolling']

In [61]:
matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [111]:
matches_rolling

Unnamed: 0,Date,Day,Competition,Home,Opposition,Result,Score,Score.1,Score.2,Captain(s),Season,Team,venue_code,opp_code,Hour,Day_Code,target,Score_rolling,Score.2_rolling
0,2022-04-02,3:00pm,Round 4,A,Warriors,L,6,-,20,A Reynolds,2022,brisbane,0,16,3,5,0,13.000000,17.333333
1,2022-04-08,7:55pm,Round 5,H,Sydney,L,20,-,24,A Reynolds,2022,brisbane,2,15,7,4,0,11.333333,22.666667
2,2022-04-15,7:55pm,Round 6,A,Penrith,L,12,-,40,A Reynolds,2022,brisbane,0,11,7,4,0,12.666667,27.333333
3,2022-04-22,7:55pm,Round 7,H,Canterbury,W,34,-,14,A Reynolds,2022,brisbane,2,2,7,4,1,12.666667,28.000000
4,2022-04-28,7:50pm,Round 8,H,Cronulla,W,16,-,7,A Reynolds,2022,brisbane,2,3,7,3,1,22.000000,26.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,2024-05-19,6:25pm,Round 11,H,Dolphins,L,12,-,24,A Koroisau,2024,wests-tigers,3,4,6,6,0,15.333333,28.666667
1112,2024-05-23,4:05pm,Round 16,H,Canberra,W,48,-,24,A Koroisau,2024,wests-tigers,2,1,4,3,1,14.666667,18.000000
1113,2024-05-24,6:00pm,Round 12,A,North Qld,L,28,-,42,A Koroisau,2024,wests-tigers,0,9,6,4,0,26.000000,19.333333
1114,2024-05-30,6:15pm,Round 17,A,Sydney,L,6,-,40,A Koroisau,2024,wests-tigers,0,15,6,3,0,29.333333,30.000000


In [63]:
matches_rolling = matches_rolling.droplevel("Team")

In [65]:
matches_rolling.index = range(matches_rolling.shape[0])

In [117]:
def make_predictions(data, predictors):
    train = data[data["Date"] < '2023-05-01']
    test = data[data["Date"] > '2023-05-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [118]:
matches_rolling

Unnamed: 0,Date,Day,Competition,Home,Opposition,Result,Score,Score.1,Score.2,Captain(s),Season,Team,venue_code,opp_code,Hour,Day_Code,target,Score_rolling,Score.2_rolling
0,2022-04-02,3:00pm,Round 4,A,Warriors,L,6,-,20,A Reynolds,2022,brisbane,0,16,3,5,0,13.000000,17.333333
1,2022-04-08,7:55pm,Round 5,H,Sydney,L,20,-,24,A Reynolds,2022,brisbane,2,15,7,4,0,11.333333,22.666667
2,2022-04-15,7:55pm,Round 6,A,Penrith,L,12,-,40,A Reynolds,2022,brisbane,0,11,7,4,0,12.666667,27.333333
3,2022-04-22,7:55pm,Round 7,H,Canterbury,W,34,-,14,A Reynolds,2022,brisbane,2,2,7,4,1,12.666667,28.000000
4,2022-04-28,7:50pm,Round 8,H,Cronulla,W,16,-,7,A Reynolds,2022,brisbane,2,3,7,3,1,22.000000,26.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,2024-05-19,6:25pm,Round 11,H,Dolphins,L,12,-,24,A Koroisau,2024,wests-tigers,3,4,6,6,0,15.333333,28.666667
1112,2024-05-23,4:05pm,Round 16,H,Canberra,W,48,-,24,A Koroisau,2024,wests-tigers,2,1,4,3,1,14.666667,18.000000
1113,2024-05-24,6:00pm,Round 12,A,North Qld,L,28,-,42,A Koroisau,2024,wests-tigers,0,9,6,4,0,26.000000,19.333333
1114,2024-05-30,6:15pm,Round 17,A,Sydney,L,6,-,40,A Koroisau,2024,wests-tigers,0,15,6,3,0,29.333333,30.000000


In [165]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [126]:
precision

0.5749128919860628

In [122]:
combined = combined.merge(matches_rolling[["Date", "Team", "Opposition", "Result"]], left_index=True, right_index=True)

In [138]:
class MissingDict(dict):
    __missing__ = lambda self, key:key
    
map_values = {
    "brisbane": "Brisbane",
    "wests-tigers": "Wests Tigers",
    "canterbury": "Canterbury",
    "dolphins": "Dolphins",
    "manly": "Manly",
    "gold-coast": "Gold Coast",
    "melbourne": "Melbourne",
    "penrith": "Penrith",
    "sydney": "Sydney",
    "cronulla": "Cronulla",
    "north-qld": "North Qld",
    "newcastle": "Newcastle",
    "st-george-illawarra": "St Geo Illa",
    "canberra": "Canberra",
    "south-sydney": "South",
    "warriors": "Warriors",
    "parramatta": "Parramatta",
}
mapping = MissingDict(**map_values)

In [140]:
mapping["brisbane"]

'Brisbane'

In [142]:
combined["new_team"] = combined["Team"].map(mapping)

In [143]:
combined

Unnamed: 0,actual,prediction,Date,Team,Opposition,Result,new_team
32,1,0,2023-05-05,brisbane,Manly,W,Brisbane
33,0,0,2023-05-11,brisbane,Melbourne,L,Brisbane
34,0,0,2023-05-18,brisbane,Penrith,L,Brisbane
35,1,1,2023-05-27,brisbane,Warriors,W,Brisbane
36,1,0,2023-06-03,brisbane,Cronulla,W,Brisbane
...,...,...,...,...,...,...,...
1111,0,1,2024-05-19,wests-tigers,Dolphins,L,Wests Tigers
1112,1,1,2024-05-23,wests-tigers,Canberra,W,Wests Tigers
1113,0,1,2024-05-24,wests-tigers,North Qld,L,Wests Tigers
1114,0,0,2024-05-30,wests-tigers,Sydney,L,Wests Tigers


In [144]:
merged = combined.merge(combined, left_on=["Date", "new_team"], right_on=["Date", "Opposition"])

In [145]:
merged

Unnamed: 0,actual_x,prediction_x,Date,Team_x,Opposition_x,Result_x,new_team_x,actual_y,prediction_y,Team_y,Opposition_y,Result_y,new_team_y
0,1,0,2023-05-05,brisbane,Manly,W,Brisbane,0,0,manly,Brisbane,L,Manly
1,0,0,2023-05-11,brisbane,Melbourne,L,Brisbane,1,0,melbourne,Brisbane,W,Melbourne
2,0,0,2023-05-18,brisbane,Penrith,L,Brisbane,1,1,penrith,Brisbane,W,Penrith
3,1,1,2023-05-27,brisbane,Warriors,W,Brisbane,0,0,warriors,Brisbane,L,Warriors
4,1,0,2023-06-03,brisbane,Cronulla,W,Brisbane,0,0,cronulla,Brisbane,L,Cronulla
...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0,0,2024-05-11,wests-tigers,Newcastle,L,Wests Tigers,1,1,newcastle,Wests Tigers,W,Newcastle
476,1,1,2024-05-15,wests-tigers,Gold Coast,W,Wests Tigers,0,1,gold-coast,Wests Tigers,L,Gold Coast
477,0,1,2024-05-19,wests-tigers,Dolphins,L,Wests Tigers,1,1,dolphins,Wests Tigers,W,Dolphins
478,0,1,2024-05-24,wests-tigers,North Qld,L,Wests Tigers,1,1,north-qld,Wests Tigers,W,North Qld


In [146]:
merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] == 0)]["actual_x"].value_counts()

actual_x
1    86
0    53
Name: count, dtype: int64

In [147]:
86/(86+53)

0.6187050359712231