#**Preparing Data for Machine Learning**
All of the data is transformed to integer values which can be used for machine learning.

In [1]:
import pandas as pd

matches = pd.read_csv("matches.csv")

In [2]:
matches["date"] = pd.to_datetime(matches["date"])

In [3]:
matches["location_code"] = matches["unnamed: 3_level_1"].notna().astype(int) # 1 for away, 0 for home
matches["opp_code"] = matches["opp"].astype("category").cat.codes # assign a number to each team
matches["team_code"] = matches["team"].astype("category").cat.codes # assign a number to each team
matches["ot_code"] = matches["ot"].notna().astype(int) # 1 for overtime/shoutout finish, 0 for regulation finish
matches["target"] = (matches["rslt"] == "W").astype("int") # 1 for win, 0 for loss

# Sort so calculation works correctly
matches = matches.sort_values(["team", "date"])

# Check if the teams are playing back to back games
matches["back2back_code"] = (
    matches.groupby("team")["date"]
    .diff()   # difference with previous row
    .dt.days
    .eq(1)    # True if exactly 1 day difference
    .astype(int)  # Convert True/False → 1/0
)

matches = matches.sort_values(["opp", "date"])
matches["back2back_opp_code"] = (
    matches.groupby("opp")["date"]
    .diff()   # difference with previous row
    .dt.days
    .eq(1)    # True if exactly 1 day difference
    .astype(int)  # Convert True/False → 1/0
)

list_columns_to_convert = ["gf", "ga", "sog_for", "pim_for", "ppg_for", "ppo_for", "shg_for", "fow", "fol", "sog_against", "pim_against", "ppg_against", "ppo_against", "shg_against"]
for column in list_columns_to_convert:
  matches[column] = pd.to_numeric(matches[column], errors="coerce")
matches

Unnamed: 0.1,Unnamed: 0,rk,gtm,date,unnamed: 3_level_1,opp,rslt,gf,ga,ot,...,pdo,season,team,location_code,opp_code,team_code,ot_code,target,back2back_code,back2back_opp_code
43176,46871,1,1,2006-10-06,@,ANA,L,3,4,,...,,2007,LAK,1,0,12,0,0,0,0
42276,45894,3,3,2006-10-09,@,ANA,L,0,2,,...,,2007,STL,1,0,24,0,0,0,0
43671,47408,4,4,2006-10-11,@,ANA,W,5,4,SO,...,,2007,NYI,1,0,17,1,1,1,0
43590,47320,5,5,2006-10-15,@,ANA,W,4,3,SO,...,,2007,DAL,1,0,8,1,1,1,0
42853,46520,6,6,2006-10-18,@,ANA,L,1,4,,...,,2007,DET,1,0,9,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,528,78,78,2025-04-10,@,WSH,L,4,5,SO,...,97.3,2025,CAR,1,34,3,1,0,0,0
2620,2843,79,79,2025-04-12,,WSH,W,7,0,,...,129.2,2025,CBJ,0,34,4,0,1,0,0
2621,2844,80,80,2025-04-13,@,WSH,W,4,1,,...,117.8,2025,CBJ,1,34,4,0,1,1,1
1720,1866,81,81,2025-04-15,,WSH,L,1,3,,...,96.0,2025,NYI,0,34,17,0,0,0,0


In [4]:
matches.dtypes

Unnamed: 0,0
Unnamed: 0,int64
rk,int64
gtm,int64
date,datetime64[ns]
unnamed: 3_level_1,object
opp,object
rslt,object
gf,int64
ga,int64
ot,object


#**Building Machine Learning Model**
Using RandomForestClassifier to build a model based off of the nhl game data.

In [5]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=600, max_depth=None, min_samples_split=20, min_samples_leaf=3, class_weight="balanced_subsample", random_state=1)
train = matches[matches["date"] < '2024-10-04']
test = matches[matches["date"] >= '2024-10-04']
predictors = ["opp_code", "location_code", "back2back_code", "back2back_opp_code", "team_code"]
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])

#**Test the Model's Performance**
Test the model using sklearn metrics such as the accuracy score and the precision score.

In [6]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(test["target"], preds)
print(acc)

combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

from sklearn.metrics import precision_score
precision_score(test["target"], preds)

0.5411585365853658


0.541095890410959

#**Improving the Model**
Improving the accuracy of the model by adding additional data collumns which use the rolling averages for different game metrics:


* gf : Goals For (number of goals the team scored in that game).

* ga : Goals Against (number of goals the opponent scored).

* sog : Shots on Goal (shots directed at the net).

* pim : Penalty Minutes (total minutes spent in the penalty box by the team).

* ppg : Power Play Goals (goals scored while the team had a man advantage).

* ppo : Power Play Opportunities (number of scoring chances the team had on the power play).

* shg : Short-Handed Goals (goals scored while the team was down a man due to a penalty).

* fow : Faceoffs Won (number of faceoffs the team won).

* fol : Faceoffs Lost (number of faceoffs the team lost).

* ot_code : Overtime Code (game did not end in regulation, decided in overtime, or shootout).


In [7]:
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("MTL")

def rolling_averages(group, cols, new_cols, window = 6): # window is the number of games from the past to use
  group = group.sort_values("date")
  rolling_stats = group[cols].rolling(window, closed='left').mean()
  group[new_cols] = rolling_stats
  group = group.dropna(subset=new_cols)
  return group

cols = ["gf", "ga", "sog_for", "pim_for", "ppg_for", "ppo_for", "shg_for", "fow", "fol", "ot_code", "sog_against", "pim_against", "ppg_against", "ppo_against", "shg_against"]
new_cols = [f"{c}_rolling" for c in cols]

new_cols

rolling_averages(group, cols, new_cols)

matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

matches_rolling = matches_rolling.droplevel("team")

matches_rolling.index = range(matches_rolling.shape[0])

matches_rolling

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0.1,Unnamed: 0,rk,gtm,date,unnamed: 3_level_1,opp,rslt,gf,ga,ot,...,ppo_for_rolling,shg_for_rolling,fow_rolling,fol_rolling,ot_code_rolling,sog_against_rolling,pim_against_rolling,ppg_against_rolling,ppo_against_rolling,shg_against_rolling
0,24274,7,7,2015-10-24,@,MIN,L,0,3,,...,3.000000,0.000000,31.000000,28.166667,0.166667,31.166667,8.333333,0.333333,2.833333,0.000000
1,24275,8,8,2015-10-26,@,CHI,L,0,1,OT,...,3.000000,0.000000,30.000000,30.000000,0.166667,28.833333,8.333333,0.333333,3.500000,0.166667
2,24276,9,9,2015-10-27,@,DAL,L,3,4,,...,3.166667,0.000000,32.500000,30.333333,0.166667,28.666667,9.333333,0.333333,3.666667,0.166667
3,24277,10,10,2015-10-29,@,STL,L,1,2,,...,3.333333,0.000000,32.000000,31.166667,0.166667,29.500000,9.666667,0.333333,3.166667,0.333333
4,24279,11,11,2015-11-01,,NSH,W,4,2,,...,3.166667,0.000000,31.000000,32.333333,0.166667,31.000000,9.833333,0.333333,3.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24203,2486,78,78,2025-04-10,,CAR,W,5,4,SO,...,2.500000,0.333333,28.500000,27.833333,0.000000,23.833333,16.166667,1.166667,3.500000,0.000000
24204,2487,79,79,2025-04-12,@,CBJ,L,0,7,,...,2.666667,0.333333,26.333333,30.166667,0.166667,26.000000,17.333333,1.000000,3.333333,0.000000
24205,2488,80,80,2025-04-13,,CBJ,L,1,4,,...,2.666667,0.166667,27.500000,29.833333,0.166667,26.500000,21.000000,0.666667,2.833333,0.000000
24206,2489,81,81,2025-04-15,@,NYI,W,3,1,,...,2.666667,0.000000,30.500000,29.500000,0.166667,26.333333,21.000000,0.500000,2.833333,0.000000


In [8]:
def make_predictions(data, predictors):
  train = data[data["date"] < '2024-10-04'] # Use the past seasons to train the data
  test = data[data["date"] >= '2024-10-04'] # Test with the most recent season
  rf.fit(train[predictors], train["target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
  precision = precision_score(test["target"], preds)
  return combined, precision

#**Test the Improved Model's Performance**

In [9]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)
precision

0.5851063829787234

In [10]:
combined = combined.merge(matches_rolling[["date", "team", "opp", "rslt"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,predicted,date,team,opp,rslt
695,1,0,2024-10-12,ANA,SJS,W
696,0,0,2024-10-13,ANA,VEG,L
697,1,0,2024-10-16,ANA,UTA,W
698,0,0,2024-10-18,ANA,COL,L
699,0,0,2024-10-20,ANA,LAK,L
...,...,...,...,...,...,...
24203,1,1,2025-04-10,WSH,CAR,W
24204,0,0,2025-04-12,WSH,CBJ,L
24205,0,1,2025-04-13,WSH,CBJ,L
24206,1,0,2025-04-15,WSH,NYI,W


In [11]:
merged = combined.merge(combined, left_on=["date", "team"], right_on=["date", "opp"])
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opp_x,rslt_x,actual_y,predicted_y,team_y,opp_y,rslt_y
0,1,0,2024-10-12,ANA,SJS,W,0,0,SJS,ANA,L
1,0,0,2024-10-13,ANA,VEG,L,1,1,VEG,ANA,W
2,1,0,2024-10-16,ANA,UTA,W,0,1,UTA,ANA,L
3,0,0,2024-10-18,ANA,COL,L,1,1,COL,ANA,W
4,0,0,2024-10-20,ANA,LAK,L,1,1,LAK,ANA,W
...,...,...,...,...,...,...,...,...,...,...,...
2619,1,1,2025-04-10,WSH,CAR,W,0,1,CAR,WSH,L
2620,0,0,2025-04-12,WSH,CBJ,L,1,0,CBJ,WSH,W
2621,0,1,2025-04-13,WSH,CBJ,L,1,0,CBJ,WSH,W
2622,1,0,2025-04-15,WSH,NYI,W,0,0,NYI,WSH,L


In [12]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

Unnamed: 0_level_0,count
actual_x,Unnamed: 1_level_1
1,521
0,313


In [13]:
# This shows the model's accuracy when it predicts that the home team is going to win
# and that the away team will lose (for the same game when both perspectives align).
(521/(521+313))

0.6247002398081535

In [14]:
merged.to_csv("predictions.csv")