In [789]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [790]:
cricket_data = pd.read_csv("Cricket-all-teams-all-matches.csv")
cricket_data

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format
0,India,Pakistan,drawn,,Bengaluru,"Dec 8-12, 2007",Test # 1852,Test
1,India,Pakistan,drawn,,Eden Gardens,"Nov 30-Dec 4, 2007",Test # 1850,Test
2,India,Pakistan,India,6 wickets,Delhi,"Nov 22-26, 2007",Test # 1849,Test
3,Pakistan,India,Pakistan,341 runs,Karachi,"Jan 29-Feb 1, 2006",Test # 1783,Test
4,Pakistan,India,drawn,,Faisalabad,"Jan 21-25, 2006",Test # 1782,Test
...,...,...,...,...,...,...,...,...
7788,Australia,ICC World XI,Australia,210 runs,Sydney,"Oct 14-17, 2005",Test # 1768,Test
7789,Australia,ICC World XI,Australia,156 runs,Melbourne (Docklands),"Oct 9, 2005",ODI # 2284,ODI
7790,Australia,ICC World XI,Australia,55 runs,Melbourne (Docklands),"Oct 7, 2005",ODI # 2283,ODI
7791,Australia,ICC World XI,Australia,93 runs,Melbourne (Docklands),"Oct 5, 2005",ODI # 2282,ODI


In [791]:
# Filter for Test matches only (as the dataset covers other formats too)
test_data = cricket_data.loc[cricket_data['Format'] == 'Test ', :]


test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2581 entries, 0 to 7788
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Team 1      2581 non-null   object
 1   Team 2      2581 non-null   object
 2   Winner      2581 non-null   object
 3   Margin      1787 non-null   object
 4   Ground      2581 non-null   object
 5   Match Date  2581 non-null   object
 6   Scorecard   2581 non-null   object
 7   Format      2581 non-null   object
dtypes: object(8)
memory usage: 181.5+ KB


In [792]:
test_data.shape

(2581, 8)

In [793]:
#Removing Missing / Drawn Values
test_data = test_data.dropna()
test_data.shape

(1787, 8)

In [794]:
#Add Year Column
test_data['Year'] = test_data['Match Date'].str[-4:]
test_data.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format,Year
2,India,Pakistan,India,6 wickets,Delhi,"Nov 22-26, 2007",Test # 1849,Test,2007
3,Pakistan,India,Pakistan,341 runs,Karachi,"Jan 29-Feb 1, 2006",Test # 1783,Test,2006
6,India,Pakistan,Pakistan,168 runs,Bengaluru,"Mar 24-28, 2005",Test # 1743,Test,2005
7,India,Pakistan,India,195 runs,Eden Gardens,"Mar 16-20, 2005",Test # 1741,Test,2005
9,Pakistan,India,India,inns & 131 runs,Rawalpindi,"Apr 13-16, 2004",Test # 1697,Test,2004


In [795]:
#Re-formatting Scorecard to show just the number (we can sort it by date that way)
test_data['Scorecard'] = test_data['Scorecard'].apply(lambda x: int(x.replace('Test #', '')))
#Sorting by date (order of scorecard variable)
test_data = test_data.sort_values('Scorecard', ascending=True)
test_data.head(10)

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format,Year
3945,Australia,England,Australia,45 runs,Melbourne,"Mar 15-19, 1877",1,Test,1877
3944,Australia,England,England,4 wickets,Melbourne,"Mar 31-Apr 4, 1877",2,Test,1877
3943,Australia,England,Australia,10 wickets,Melbourne,"Jan 2-4, 1879",3,Test,1879
3942,England,Australia,England,5 wickets,The Oval,"Sep 6-8, 1880",4,Test,1880
3940,Australia,England,Australia,5 wickets,Sydney,"Feb 17-21, 1882",6,Test,1882
3939,Australia,England,Australia,6 wickets,Sydney,"Mar 3-7, 1882",7,Test,1882
3937,England,Australia,Australia,7 runs,The Oval,"Aug 28-29, 1882",9,Test,1882
3936,Australia,England,Australia,9 wickets,Melbourne,"Dec 30, 1882 - Jan 2, 1883",10,Test,1883
3935,Australia,England,England,inns & 27 runs,Melbourne,"Jan 19-22, 1883",11,Test,1883
3934,Australia,England,England,69 runs,Sydney,"Jan 26-30, 1883",12,Test,1883


In [796]:
#Filter Dataframe to just matches from 2000
test_data['Year'] = test_data['Year'].astype(int)
test_data = test_data[(test_data['Year'] >= 2000)]
test_data.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format,Year
1739,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1481,Test,2000
5898,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",1482,Test,2000
5897,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",1483,Test,2000
2583,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",1484,Test,2000
1149,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",1485,Test,2000


In [797]:
#Re-formatting the Scorecard to start from 1
i = 1
for index, row in test_data.iterrows():
    test_data.loc[index, 'Scorecard'] = i
    i = i + 1
#Renaming Scorecard to Match ID for clarity
test_data = test_data.rename(columns={'Scorecard': 'Match ID', 'Team 1': 'HomeTeam', 'Team 2': 'AwayTeam'})
test_data.head()

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year
1739,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000
5898,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000
5897,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000
2583,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000
1149,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000


In [798]:
# Create a DataFrame to store individual team results for easier rolling calculations
team_results_list = []
for index, row in test_data.iterrows():
    # Home Team's perspective
    team_results_list.append({
        'Year': row["Year"],
        'Match ID': row['Match ID'],
        'Team': row['HomeTeam'],
        'IsHome': 1,
        'Result': 1 if row['Winner'] == row['HomeTeam'] else 0 # 1 for win, 0 for loss
    })
    # Away Team's perspective
    team_results_list.append({
        'Year': row["Year"],
        'Match ID': row['Match ID'],
        'Team': row['AwayTeam'],
        'IsHome': 0,
        'Result': 1 if row['Winner'] == row['AwayTeam'] else 0 # 1 for win, 0 for loss
    })

team_results = pd.DataFrame(team_results_list)
team_results = team_results.sort_values(by=['Match ID']).reset_index(drop=True)
team_results.head(10)

Unnamed: 0,Year,Match ID,Team,IsHome,Result
0,2000,1,Australia,1,1
1,2000,1,India,0,0
2,2000,2,South Africa,1,1
3,2000,2,England,0,0
4,2000,3,South Africa,1,0
5,2000,3,England,0,1
6,2000,4,India,1,0
7,2000,4,South Africa,0,1
8,2000,5,Sri Lanka,0,1
9,2000,5,Pakistan,1,0


In [799]:
team_results['RollingForm'] = team_results.groupby('Team')['Result'].transform(
    lambda x: x.rolling(window=5, min_periods=1).mean().shift(1))

# Fill NaN values (for teams with fewer than 5 prior matches) with a default (0.5 for average form)
team_results['RollingForm'] = team_results['RollingForm'].fillna(0.5)

team_results

Unnamed: 0,Year,Match ID,Team,IsHome,Result,RollingForm
0,2000,1,Australia,1,1,0.5
1,2000,1,India,0,0,0.5
2,2000,2,South Africa,1,1,0.5
3,2000,2,England,0,0,0.5
4,2000,3,South Africa,1,0,1.0
...,...,...,...,...,...,...
1747,2025,874,Australia,0,1,0.8
1748,2025,875,Sri Lanka,1,0,0.4
1749,2025,875,Australia,0,1,0.8
1750,2025,876,Zimbabwe,1,0,0.0


In [800]:
home_form = team_results[team_results['IsHome'] == 1].rename(columns={'Team': 'HomeTeam', 'RollingForm': 'HomeTeamForm'})[['Match ID', 'HomeTeam', 'HomeTeamForm']]
away_form = team_results[team_results['IsHome'] == 0].rename(columns={'Team': 'AwayTeam', 'RollingForm': 'AwayTeamForm'})[['Match ID', 'AwayTeam', 'AwayTeamForm']]
home_form

Unnamed: 0,Match ID,HomeTeam,HomeTeamForm
0,1,Australia,0.5
2,2,South Africa,0.5
4,3,South Africa,1.0
6,4,India,0.0
9,5,Pakistan,0.5
...,...,...,...
1742,872,Pakistan,0.4
1744,873,Pakistan,0.6
1746,874,Sri Lanka,0.6
1748,875,Sri Lanka,0.4


In [801]:
#Merging above dataframes with the main dataframe (test_data)
test_data = pd.merge(test_data, home_form, on=['Match ID', 'HomeTeam'], how='left')
test_data = pd.merge(test_data, away_form, on=['Match ID', 'AwayTeam'], how='left')

test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8


In [802]:
test_data['home_advantage'] = 1 #always set it to 1 because home team is being predicted
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1


In [803]:
test_data['HomeTeamWin'] = (test_data['Winner'] == test_data['HomeTeam']).astype(int)
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0


In [804]:
#Calculating ELO
starting_ELO = 1000
home_boost = 30 #How many Elo points home advantage is worth

#Initialize Elo ratings for all unique teams
unique_team_names = pd.concat([test_data['HomeTeam'], test_data['AwayTeam']]).unique()
elo_ratings = {team: starting_ELO for team in unique_team_names}
unique_team_names

array(['Australia', 'South Africa', 'India', 'Pakistan', 'New Zealand',
       'West Indies', 'England', 'Sri Lanka', 'Zimbabwe', 'Bangladesh',
       'Ireland', 'Afghanistan', 'ICC World XI'], dtype=object)

In [805]:
games_played_count = {team: 0 for team in unique_team_names}
games_played_count

{'Australia': 0,
 'South Africa': 0,
 'India': 0,
 'Pakistan': 0,
 'New Zealand': 0,
 'West Indies': 0,
 'England': 0,
 'Sri Lanka': 0,
 'Zimbabwe': 0,
 'Bangladesh': 0,
 'Ireland': 0,
 'Afghanistan': 0,
 'ICC World XI': 0}

In [806]:
# Function to determine max ELO gain based on games played
def get_max_ELO_gain(games_played, recent_form):
    if games_played < 10:
        max_gain = 30 # Higher ELO gain for new teams
    elif games_played < 30:
        max_gain = 15 # Moderate ELO gain for developing teams
    else:
        max_gain = 5 # Lower ELO gain for established teams

    form_multiplier = 1.5 - recent_form

    max_gain = max_gain * form_multiplier

    return max(3, min(max_gain, 50))

In [807]:
#Lists to store Elo ratings for each match
home_team_elos_at_match = []
away_team_elos_at_match = []

In [808]:
#Iterate through matches to update ELO ratings
for index, row in test_data.iterrows():
    home_team = row['HomeTeam']
    away_team = row['AwayTeam']
    winner = row['Winner']

    #Get current ELO ratings
    home_elo = elo_ratings[home_team]
    away_elo = elo_ratings[away_team]

    #Store ELO ratings *before* the match
    home_team_elos_at_match.append(home_elo)
    away_team_elos_at_match.append(away_elo)

    #Apply Home Advantage Boost
    home_elo = home_elo + home_boost

    # Calculate expected win probabilities
    # E_A = 1 / (1 + 10^((R_B - R_A) / 400))
    expected_home_win = 1 / (1 + 10**((away_elo - home_elo) / 400))
    expected_away_win = 1 / (1 + 10**((home_elo - away_elo) / 400))

    #Store the real winner
    score_home = 0
    score_away = 0

    if winner == home_team:
        score_home = 1
        score_away = 0
    elif winner == away_team:
        score_home = 0
        score_away = 1

    # Get dynamic ELO gain for each team based on their games played count
    ELO_gain_home = get_max_ELO_gain(games_played_count[home_team], row['HomeTeamForm'])
    ELO_gain_away = get_max_ELO_gain(games_played_count[away_team], row['AwayTeamForm'])


    #Update ELO ratings (we will make the rating change more pronounced if they win despite not being expected to)
    new_home_elo = home_elo + ELO_gain_home * (score_home - expected_home_win)
    new_away_elo = away_elo + ELO_gain_away * (score_away - expected_away_win)

    #Update ELO list
    elo_ratings[home_team] = new_home_elo
    elo_ratings[away_team] = new_away_elo

    # Increment games played count for both teams
    games_played_count[home_team] += 1
    games_played_count[away_team] += 1

#Add the ELO ratings to the dataframe
test_data['Home ELO'] = home_team_elos_at_match
test_data['Away ELO'] = away_team_elos_at_match

In [809]:
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1,1000.000000,1000.000000
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1,1000.000000,1000.000000
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0,1043.708005,986.291995
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0,986.291995,1064.359815
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0,1000.000000,1000.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1,2632.531730,3175.695167
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0,2667.759218,3169.517226
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0,3724.849056,4778.836080
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0,3754.836695,4778.845694


In [810]:
elo_ratings

{'Australia': 4778.857113198396,
 'South Africa': 4222.209757450245,
 'India': 4012.6432580313563,
 'Pakistan': 2697.4799674716364,
 'New Zealand': 3583.270432854897,
 'West Indies': 3169.920588026037,
 'England': 4915.265237283735,
 'Sri Lanka': 3784.8187506021804,
 'Zimbabwe': 1992.201786474903,
 'Bangladesh': 2767.2134256404247,
 'Ireland': 1144.418191277682,
 'Afghanistan': 1242.1143687533124,
 'ICC World XI': 999.9161687695103}

In [811]:
#Elo Difference (Home Team Elo - Away Team Elo)
test_data['elo_difference'] = test_data['Home ELO'] - test_data['Away ELO']
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO,elo_difference
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1,1000.000000,1000.000000,0.000000
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1,1000.000000,1000.000000,0.000000
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0,1043.708005,986.291995,57.416010
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0,986.291995,1064.359815,-78.067820
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0,1000.000000,1000.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1,2632.531730,3175.695167,-543.163437
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0,2667.759218,3169.517226,-501.758007
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0,3724.849056,4778.836080,-1053.987025
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0,3754.836695,4778.845694,-1024.009000


In [812]:
# Create one-hot encoded columns for HomeTeam
home_team_ohe = pd.get_dummies(test_data['HomeTeam'], prefix='HomeTeam', dtype=int)
# Create one-hot encoded columns for AwayTeam
away_team_ohe = pd.get_dummies(test_data['AwayTeam'], prefix='AwayTeam', dtype=int)

home_team_ohe

Unnamed: 0,HomeTeam_Afghanistan,HomeTeam_Australia,HomeTeam_Bangladesh,HomeTeam_England,HomeTeam_India,HomeTeam_Ireland,HomeTeam_New Zealand,HomeTeam_Pakistan,HomeTeam_South Africa,HomeTeam_Sri Lanka,HomeTeam_West Indies,HomeTeam_Zimbabwe
0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
871,0,0,0,0,0,0,0,1,0,0,0,0
872,0,0,0,0,0,0,0,1,0,0,0,0
873,0,0,0,0,0,0,0,0,0,1,0,0
874,0,0,0,0,0,0,0,0,0,1,0,0


In [813]:
# Ensure all unique teams are present in both OHE dataframes, fill missing with 0s
# This is crucial for consistent feature sets during training and prediction
all_teams_sorted = sorted(unique_team_names)
for team in all_teams_sorted:
    if f'HomeTeam_{team}' not in home_team_ohe.columns:
        home_team_ohe[f'HomeTeam_{team}'] = 0
    if f'AwayTeam_{team}' not in away_team_ohe.columns:
        away_team_ohe[f'AwayTeam_{team}'] = 0

# Sort columns alphabetically to ensure consistent order
home_team_ohe = home_team_ohe.reindex(sorted(home_team_ohe.columns), axis=1)
away_team_ohe = away_team_ohe.reindex(sorted(away_team_ohe.columns), axis=1)

In [814]:
#Create Interaction Features to add complexity
test_data['elo_diff_x_home_form'] = test_data['elo_difference'] * test_data['HomeTeamForm']
test_data['elo_diff_x_away_form'] = test_data['elo_difference'] * test_data['AwayTeamForm']
test_data['home_form_x_away_form'] = test_data['HomeTeamForm'] * test_data['AwayTeamForm']

test_data.head()

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO,elo_difference,elo_diff_x_home_form,elo_diff_x_away_form,home_form_x_away_form
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1,1000.0,1000.0,0.0,0.0,0.0,0.25
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1,1000.0,1000.0,0.0,0.0,0.0,0.25
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0,1043.708005,986.291995,57.41601,57.41601,0.0,0.0
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0,986.291995,1064.359815,-78.06782,-0.0,-39.03391,0.0
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0,1000.0,1000.0,0.0,0.0,0.0,0.25


In [815]:
#Merge OHE's with the rest of the features
numerical_features = test_data[['HomeTeamForm', 'AwayTeamForm', 'home_advantage', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form']]

X = pd.concat([numerical_features, home_team_ohe, away_team_ohe], axis=1)

y = test_data['HomeTeamWin'] = (test_data['Winner'] == test_data['HomeTeam']).astype(int)

In [816]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")


Training set size: 700 samples
Testing set size: 176 samples


In [817]:
# Scale numerical features
scaler = StandardScaler()
X_train[['HomeTeamForm', 'AwayTeamForm', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form']] = scaler.fit_transform(X_train[['HomeTeamForm', 'AwayTeamForm', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form']])

X_test[['HomeTeamForm', 'AwayTeamForm', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form']] = scaler.transform(X_test[['HomeTeamForm', 'AwayTeamForm', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form']])

X_train.head(10)

Unnamed: 0,HomeTeamForm,AwayTeamForm,home_advantage,elo_difference,elo_diff_x_home_form,elo_diff_x_away_form,home_form_x_away_form,HomeTeam_Afghanistan,HomeTeam_Australia,HomeTeam_Bangladesh,...,AwayTeam_England,AwayTeam_ICC World XI,AwayTeam_India,AwayTeam_Ireland,AwayTeam_New Zealand,AwayTeam_Pakistan,AwayTeam_South Africa,AwayTeam_Sri Lanka,AwayTeam_West Indies,AwayTeam_Zimbabwe
570,-1.028671,0.365645,1,0.332003,-0.056077,0.554659,-0.589101,0,0,0,...,0,0,0,0,0,1,0,0,0,0
120,0.340608,0.365645,1,-0.211106,-0.313553,-0.03908,0.586303,0,0,0,...,0,0,1,0,0,0,0,0,0,0
639,0.340608,0.365645,1,-0.750817,-0.878136,-0.629105,0.586303,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,-1.713311,1.753857,1,-0.116922,-0.211407,0.061361,-1.176804,0,0,0,...,0,0,0,0,0,0,0,1,0,0
170,1.025248,0.365645,1,0.227136,0.26365,0.440017,1.174005,0,0,0,...,0,0,0,0,1,0,0,0,0,0
87,1.709888,1.753857,1,-0.487802,-0.864062,-0.614397,3.720714,0,0,0,...,0,0,0,0,0,0,0,1,0,0
325,-1.028671,-0.328461,1,-0.949442,-0.50291,-0.541609,-0.785002,0,0,0,...,1,0,0,0,0,0,0,0,0,0
238,0.340608,1.753857,1,-0.62062,-0.741938,-0.856396,1.761707,0,0,0,...,0,0,0,0,0,0,0,0,0,0
872,0.340608,-1.022567,1,-0.785651,-0.914575,-0.177283,-0.589101,0,0,0,...,0,0,0,0,0,0,0,0,1,0
282,1.025248,-1.022567,1,0.549646,0.713479,0.30931,-0.393201,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [818]:
model = LogisticRegression(random_state=1, solver = 'liblinear', C = 0.01)
model.fit(X_train, y_train)

In [819]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of Home Team winning (class 1)

accuracy = accuracy_score(y_test, y_pred)
training_accuracy = accuracy_score(y_train, model.predict(X_train))

print(f"Training Accuracy: {training_accuracy * 100:.2f}%")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Training Accuracy: 72.00%
Test Accuracy: 73.30%


In [820]:
model.coef_

array([[ 0.25010818, -0.19989951,  0.18390873,  0.23059125,  0.06738066,
         0.11450182, -0.00535097, -0.00071015,  0.14339075, -0.11816444,
         0.01452271,  0.        ,  0.13872198,  0.00452976,  0.05266633,
         0.01163168,  0.04940332,  0.0024749 , -0.02461438, -0.08994375,
        -0.00743929, -0.03240966,  0.0778239 ,  0.04680491,  0.00249887,
        -0.03397993,  0.00763892,  0.03402473, -0.07206547, -0.03376217,
         0.04476846,  0.08314061,  0.06686485]])

In [821]:
model.intercept_

array([0.18390873])

In [822]:
y_pred_proba

array([0.57600879, 0.55572127, 0.36970261, 0.65571511, 0.66757172,
       0.76716074, 0.70133423, 0.61709643, 0.48392385, 0.50996501,
       0.49316403, 0.83749482, 0.70393866, 0.66790907, 0.50778009,
       0.4922177 , 0.43784311, 0.45971263, 0.68375035, 0.66544493,
       0.3922392 , 0.53121084, 0.29269355, 0.45407009, 0.7140875 ,
       0.5047954 , 0.61996275, 0.82814917, 0.48736829, 0.45823867,
       0.4159792 , 0.72664217, 0.74894928, 0.74556623, 0.70942188,
       0.4998897 , 0.58753055, 0.71347774, 0.74169689, 0.80134657,
       0.57731005, 0.35225296, 0.54757562, 0.45740894, 0.73363381,
       0.31843787, 0.36029157, 0.70406153, 0.18211225, 0.65198526,
       0.67716128, 0.66016584, 0.7027824 , 0.77657978, 0.4329108 ,
       0.56612838, 0.80282944, 0.7708596 , 0.66313386, 0.69982596,
       0.40871966, 0.58689569, 0.61049205, 0.745835  , 0.57044552,
       0.51071488, 0.29108336, 0.56884718, 0.56991466, 0.37640614,
       0.75442691, 0.52338653, 0.6546608 , 0.66957722, 0.83344

In [823]:
def predict_match(home_team, away_team):

    #Calculating win probability for IND vs PAK
    home_elo_current = elo_ratings.get(f'{home_team}', starting_ELO)  # Get latest ELO, or initial if not found
    away_elo_current = elo_ratings.get(f'{away_team}', starting_ELO)

    print(f"{home_team} (ELO: {home_elo_current:.0f}) vs {away_team} (ELO: {away_elo_current:.0f})")

    #Getting their recent form
    home_team_home_games = test_data[test_data['HomeTeam'] == f'{home_team}'].sort_values(by='Match ID', ascending=False)
    home_recent_form = home_team_home_games['HomeTeamForm'].iloc[0]

    away_team_home_games = test_data[test_data['AwayTeam'] == f'{away_team}'].sort_values(by='Match ID', ascending=False)
    away_recent_form = away_team_home_games['AwayTeamForm'].iloc[0]

    print(f"{home_team} Recent Form: Won{home_recent_form * 5: .0f} of last 5 home games")
    print(f"{away_team} Recent Form: Won{away_recent_form * 5: .0f} of last 5 away games")

    #Calculate features for this new match
    new_elo_difference = home_elo_current - away_elo_current
    new_home_advantage = 1
    new_recent_form_home = home_recent_form
    new_recent_form_away = away_recent_form

    #New Interaction Features
    new_elo_diff_x_home_form = new_elo_difference * new_recent_form_home
    new_elo_diff_x_away_form = new_elo_difference * new_recent_form_away
    new_home_form_x_away_form = new_recent_form_home * new_recent_form_away

    #Create a DataFrame for prediction
    numeric_features = pd.DataFrame(
        [[new_recent_form_home, new_recent_form_away, new_home_advantage, new_elo_difference, new_elo_diff_x_home_form, new_elo_diff_x_away_form, new_home_form_x_away_form]],
        columns=['HomeTeamForm', 'AwayTeamForm', 'home_advantage', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form'])

    numeric_features[['HomeTeamForm', 'AwayTeamForm', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form']] = scaler.transform(numeric_features[['HomeTeamForm', 'AwayTeamForm', 'elo_difference', 'elo_diff_x_home_form', 'elo_diff_x_away_form', 'home_form_x_away_form']])

    #Add OHE features
    #Set all teams to 0
    new_home_team_ohe = pd.DataFrame(0, index=[0], columns=[f'HomeTeam_{team}' for team in all_teams_sorted])
    new_away_team_ohe = pd.DataFrame(0, index=[0], columns=[f'AwayTeam_{team}' for team in all_teams_sorted])

    if f'HomeTeam_{home_team}' in new_home_team_ohe.columns:
        new_home_team_ohe[f'HomeTeam_{home_team}'] = 1
    if f'AwayTeam_{away_team}' in new_away_team_ohe.columns:
        new_away_team_ohe[f'AwayTeam_{away_team}'] = 1

    # Combine all features for prediction
    new_match_features = pd.concat([numeric_features, new_home_team_ohe, new_away_team_ohe], axis=1)


    #Predict
    prob_home_win = model.predict_proba(new_match_features)[:, 1][0]
    outcome = f"{home_team} Wins" if prob_home_win > 0.5 else f"{away_team} Wins"

    print(f"\nPredicted probability of {home_team} winning: {prob_home_win * 100:.2f}%")
    print(f"Predicted probability of {away_team} winning: {100 - prob_home_win * 100:.2f}%")
    print(f"Predicted outcome: {outcome}")

In [824]:
predict_match('England', 'Australia')

England (ELO: 4915) vs Australia (ELO: 4779)
England Recent Form: Won 5 of last 5 home games
Australia Recent Form: Won 4 of last 5 away games

Predicted probability of England winning: 64.85%
Predicted probability of Australia winning: 35.15%
Predicted outcome: England Wins
