In [374]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [375]:
cricket_data = pd.read_csv("Cricket-all-teams-all-matches.csv")
cricket_data

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format
0,India,Pakistan,drawn,,Bengaluru,"Dec 8-12, 2007",Test # 1852,Test
1,India,Pakistan,drawn,,Eden Gardens,"Nov 30-Dec 4, 2007",Test # 1850,Test
2,India,Pakistan,India,6 wickets,Delhi,"Nov 22-26, 2007",Test # 1849,Test
3,Pakistan,India,Pakistan,341 runs,Karachi,"Jan 29-Feb 1, 2006",Test # 1783,Test
4,Pakistan,India,drawn,,Faisalabad,"Jan 21-25, 2006",Test # 1782,Test
...,...,...,...,...,...,...,...,...
7788,Australia,ICC World XI,Australia,210 runs,Sydney,"Oct 14-17, 2005",Test # 1768,Test
7789,Australia,ICC World XI,Australia,156 runs,Melbourne (Docklands),"Oct 9, 2005",ODI # 2284,ODI
7790,Australia,ICC World XI,Australia,55 runs,Melbourne (Docklands),"Oct 7, 2005",ODI # 2283,ODI
7791,Australia,ICC World XI,Australia,93 runs,Melbourne (Docklands),"Oct 5, 2005",ODI # 2282,ODI


In [376]:
# Filter for Test matches only (as the dataset covers other formats too)
test_data = cricket_data.loc[cricket_data['Format'] == 'Test ', :]


test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2581 entries, 0 to 7788
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Team 1      2581 non-null   object
 1   Team 2      2581 non-null   object
 2   Winner      2581 non-null   object
 3   Margin      1787 non-null   object
 4   Ground      2581 non-null   object
 5   Match Date  2581 non-null   object
 6   Scorecard   2581 non-null   object
 7   Format      2581 non-null   object
dtypes: object(8)
memory usage: 181.5+ KB


In [377]:
test_data.shape

(2581, 8)

In [378]:
#Removing Missing / Drawn Values
test_data = test_data.dropna()
test_data.shape

(1787, 8)

In [379]:
#Add Year Column
test_data['Year'] = test_data['Match Date'].str[-4:]
test_data.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format,Year
2,India,Pakistan,India,6 wickets,Delhi,"Nov 22-26, 2007",Test # 1849,Test,2007
3,Pakistan,India,Pakistan,341 runs,Karachi,"Jan 29-Feb 1, 2006",Test # 1783,Test,2006
6,India,Pakistan,Pakistan,168 runs,Bengaluru,"Mar 24-28, 2005",Test # 1743,Test,2005
7,India,Pakistan,India,195 runs,Eden Gardens,"Mar 16-20, 2005",Test # 1741,Test,2005
9,Pakistan,India,India,inns & 131 runs,Rawalpindi,"Apr 13-16, 2004",Test # 1697,Test,2004


In [380]:
#Re-formatting Scorecard to show just the number (we can sort it by date that way)
test_data['Scorecard'] = test_data['Scorecard'].apply(lambda x: int(x.replace('Test #', '')))
#Sorting by date (order of scorecard variable)
test_data = test_data.sort_values('Scorecard', ascending=True)
test_data.head(10)

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format,Year
3945,Australia,England,Australia,45 runs,Melbourne,"Mar 15-19, 1877",1,Test,1877
3944,Australia,England,England,4 wickets,Melbourne,"Mar 31-Apr 4, 1877",2,Test,1877
3943,Australia,England,Australia,10 wickets,Melbourne,"Jan 2-4, 1879",3,Test,1879
3942,England,Australia,England,5 wickets,The Oval,"Sep 6-8, 1880",4,Test,1880
3940,Australia,England,Australia,5 wickets,Sydney,"Feb 17-21, 1882",6,Test,1882
3939,Australia,England,Australia,6 wickets,Sydney,"Mar 3-7, 1882",7,Test,1882
3937,England,Australia,Australia,7 runs,The Oval,"Aug 28-29, 1882",9,Test,1882
3936,Australia,England,Australia,9 wickets,Melbourne,"Dec 30, 1882 - Jan 2, 1883",10,Test,1883
3935,Australia,England,England,inns & 27 runs,Melbourne,"Jan 19-22, 1883",11,Test,1883
3934,Australia,England,England,69 runs,Sydney,"Jan 26-30, 1883",12,Test,1883


In [381]:
#Filter Dataframe to just matches from 2000
test_data['Year'] = test_data['Year'].astype(int)
test_data = test_data[(test_data['Year'] >= 2000)]
test_data.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Format,Year
1739,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1481,Test,2000
5898,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",1482,Test,2000
5897,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",1483,Test,2000
2583,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",1484,Test,2000
1149,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",1485,Test,2000


In [382]:
#Re-formatting the Scorecard to start from 1
i = 1
for index, row in test_data.iterrows():
    test_data.loc[index, 'Scorecard'] = i
    i = i + 1
#Renaming Scorecard to Match ID for clarity
test_data = test_data.rename(columns={'Scorecard': 'Match ID', 'Team 1': 'HomeTeam', 'Team 2': 'AwayTeam'})
test_data.head()

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year
1739,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000
5898,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000
5897,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000
2583,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000
1149,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000


In [383]:
# Create a DataFrame to store individual team results for easier rolling calculations
team_results_list = []
for index, row in test_data.iterrows():
    # Home Team's perspective
    team_results_list.append({
        'Year': row["Year"],
        'Match ID': row['Match ID'],
        'Team': row['HomeTeam'],
        'IsHome': 1,
        'Result': 1 if row['Winner'] == row['HomeTeam'] else 0 # 1 for win, 0 for loss
    })
    # Away Team's perspective
    team_results_list.append({
        'Year': row["Year"],
        'Match ID': row['Match ID'],
        'Team': row['AwayTeam'],
        'IsHome': 0,
        'Result': 1 if row['Winner'] == row['AwayTeam'] else 0 # 1 for win, 0 for loss
    })

team_results = pd.DataFrame(team_results_list)
team_results = team_results.sort_values(by=['Match ID']).reset_index(drop=True)
team_results.head(10)

Unnamed: 0,Year,Match ID,Team,IsHome,Result
0,2000,1,Australia,1,1
1,2000,1,India,0,0
2,2000,2,South Africa,1,1
3,2000,2,England,0,0
4,2000,3,South Africa,1,0
5,2000,3,England,0,1
6,2000,4,India,1,0
7,2000,4,South Africa,0,1
8,2000,5,Sri Lanka,0,1
9,2000,5,Pakistan,1,0


In [384]:
team_results['RollingForm'] = team_results.groupby('Team')['Result'].transform(
    lambda x: x.rolling(window=5, min_periods=1).mean().shift(1))

# Fill NaN values (for teams with fewer than 5 prior matches) with a default (0.5 for average form)
team_results['RollingForm'] = team_results['RollingForm'].fillna(0.5)

team_results

Unnamed: 0,Year,Match ID,Team,IsHome,Result,RollingForm
0,2000,1,Australia,1,1,0.5
1,2000,1,India,0,0,0.5
2,2000,2,South Africa,1,1,0.5
3,2000,2,England,0,0,0.5
4,2000,3,South Africa,1,0,1.0
...,...,...,...,...,...,...
1747,2025,874,Australia,0,1,0.8
1748,2025,875,Sri Lanka,1,0,0.4
1749,2025,875,Australia,0,1,0.8
1750,2025,876,Zimbabwe,1,0,0.0


In [385]:
home_form = team_results[team_results['IsHome'] == 1].rename(columns={'Team': 'HomeTeam', 'RollingForm': 'HomeTeamForm'})[['Match ID', 'HomeTeam', 'HomeTeamForm']]
away_form = team_results[team_results['IsHome'] == 0].rename(columns={'Team': 'AwayTeam', 'RollingForm': 'AwayTeamForm'})[['Match ID', 'AwayTeam', 'AwayTeamForm']]

In [386]:
#Merging above dataframes with the main dataframe (test_data)
test_data = pd.merge(test_data, home_form, on=['Match ID', 'HomeTeam'], how='left')
test_data = pd.merge(test_data, away_form, on=['Match ID', 'AwayTeam'], how='left')

test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8


In [387]:
test_data['home_advantage'] = 1 #always set it to 1 because home team is being predicted
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1


In [388]:
test_data['HomeTeamWin'] = (test_data['Winner'] == test_data['HomeTeam']).astype(int)
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0


In [389]:
#Calculating ELO
starting_ELO = 1500
max_ELO_gain = 30
home_boost = 50 #How many Elo points home advantage is worth

#Initialize Elo ratings for all unique teams
unique_team_names = pd.concat([test_data['HomeTeam'], test_data['AwayTeam']]).unique()
elo_ratings = {team: starting_ELO for team in unique_team_names}
unique_team_names

array(['Australia', 'South Africa', 'India', 'Pakistan', 'New Zealand',
       'West Indies', 'England', 'Sri Lanka', 'Zimbabwe', 'Bangladesh',
       'Ireland', 'Afghanistan', 'ICC World XI'], dtype=object)

In [390]:
#Lists to store Elo ratings for each match
home_team_elos_at_match = []
away_team_elos_at_match = []

In [391]:
#Iterate through matches to update ELO ratings
for index, row in test_data.iterrows():
    home_team = row['HomeTeam']
    away_team = row['AwayTeam']
    winner = row['HomeTeamWin']

    #Get current ELO ratings
    home_elo = elo_ratings[home_team]
    away_elo = elo_ratings[away_team]

    #Store ELO ratings *before* the match
    home_team_elos_at_match.append(home_elo)
    away_team_elos_at_match.append(away_elo)

    #Apply Home Advantage Boost
    home_elo = home_elo + home_boost

    # Calculate expected win probabilities
    # E_A = 1 / (1 + 10^((R_B - R_A) / 400))
    expected_home_win = 1 / (1 + 10**((away_elo - home_elo) / 400))
    expected_away_win = 1 / (1 + 10**((home_elo - away_elo) / 400))

    #Store the real winner
    score_home = 0
    score_away = 0

    if winner == home_team:
        score_home = 1
        score_away = 0
    elif winner == away_team:
        score_home = 0
        score_away = 1

    #Update ELO ratings (we will make the rating change more pronounced if they win despite not being expected to)
    new_home_elo = home_elo + max_ELO_gain * (score_home - 1.2 * expected_home_win)
    new_away_elo = away_elo + max_ELO_gain * (score_away - 1.2 * expected_away_win)

    #Update ELO list
    elo_ratings[home_team] = new_home_elo
    elo_ratings[away_team] = new_away_elo

#Add the ELO ratings to the dataframe
test_data['Home ELO'] = home_team_elos_at_match
test_data['Away ELO'] = away_team_elos_at_match

In [392]:
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0,1529.427328,1484.572672
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0,1484.572672,1556.631645
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0,1500.000000,1500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1,2539.003643,2630.580497
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0,2573.147440,2610.436700
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0,2703.159316,2919.004293
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0,2743.153313,2893.010296


In [393]:
#Elo Difference (Home Team Elo - Away Team Elo)
test_data['elo_difference'] = test_data['Home ELO'] - test_data['Away ELO']
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO,elo_difference
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000,0.000000
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000,0.000000
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0,1529.427328,1484.572672,44.854656
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0,1484.572672,1556.631645,-72.058973
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0,1500.000000,1500.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1,2539.003643,2630.580497,-91.576854
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0,2573.147440,2610.436700,-37.289260
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0,2703.159316,2919.004293,-215.844977
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0,2743.153313,2893.010296,-149.856982


In [394]:
X = test_data[['HomeTeamForm', 'AwayTeamForm', 'home_advantage', 'elo_difference']]
y = test_data['HomeTeamWin']

In [395]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")


Training set size: 700 samples
Testing set size: 176 samples


In [396]:
# Scale numerical features
scaler = StandardScaler()
X_train[['HomeTeamForm', 'AwayTeamForm', 'elo_difference']] = scaler.fit_transform(X_train[['HomeTeamForm', 'AwayTeamForm', 'elo_difference']])
X_test[['HomeTeamForm', 'AwayTeamForm', 'elo_difference']] = scaler.transform(X_test[['HomeTeamForm', 'AwayTeamForm', 'elo_difference']])

X_train

Unnamed: 0,HomeTeamForm,AwayTeamForm,home_advantage,elo_difference
570,-1.028671,0.365645,1,0.699909
120,0.340608,0.365645,1,0.089197
639,0.340608,0.365645,1,-1.190523
6,-1.713311,1.753857,1,0.055090
170,1.025248,0.365645,1,-0.419911
...,...,...,...,...
9,-0.001712,0.018592,1,-0.145545
599,-1.028671,1.059751,1,0.314511
27,-0.857511,-1.716673,1,-0.122680
691,1.709888,-1.022567,1,0.710029


In [397]:
model = LogisticRegression(random_state=1, solver = 'liblinear')
model.fit(X_train, y_train)

In [398]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of Home Team winning (class 1)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7045


In [399]:
model.coef_

array([[ 0.48906607, -0.41580749,  0.27987751,  0.2914978 ]])

In [400]:
model.intercept_

array([0.27987751])

In [401]:
y_pred_proba

array([0.59430265, 0.56031307, 0.32777972, 0.71976238, 0.72690038,
       0.74368852, 0.80836532, 0.62992566, 0.51056196, 0.44318879,
       0.47435741, 0.89146738, 0.79442901, 0.85539876, 0.68255638,
       0.40681296, 0.47498119, 0.57543532, 0.62617828, 0.66677321,
       0.27493071, 0.38007431, 0.22747392, 0.38969193, 0.68699064,
       0.5331039 , 0.57292291, 0.90165216, 0.62927558, 0.33847984,
       0.65506771, 0.75001285, 0.90169856, 0.76679194, 0.80941641,
       0.55636947, 0.59736204, 0.75420075, 0.67311364, 0.88537702,
       0.55839546, 0.32287323, 0.52995403, 0.3822412 , 0.65718616,
       0.3044138 , 0.44326322, 0.67151531, 0.22519644, 0.68134959,
       0.77547993, 0.68419831, 0.75805903, 0.87675551, 0.3644941 ,
       0.5838771 , 0.85287605, 0.8015368 , 0.77648754, 0.64858554,
       0.3843274 , 0.65298933, 0.55995244, 0.89728824, 0.50430136,
       0.43646448, 0.26010601, 0.54788878, 0.44354423, 0.40218884,
       0.72419108, 0.70676602, 0.73155016, 0.72080701, 0.79124

In [402]:
#Calculating win probability for IND vs PAK
ind_elo_current = elo_ratings.get('India', starting_ELO) # Get latest ELO, or initial if not found
pak_elo_current = elo_ratings.get('Pakistan', starting_ELO)

print(f"India (ELO: {ind_elo_current:.0f}) vs Pakistan (ELO: {pak_elo_current:.0f})")

#Getting their recent form
ind_home_games = test_data[test_data['HomeTeam'] == 'India'].sort_values(by = 'Match ID', ascending = False)
ind_recent_form = ind_home_games['HomeTeamForm'].iloc[0]

pak_home_games = test_data[test_data['AwayTeam'] == 'Pakistan'].sort_values(by = 'Match ID', ascending = False)
pak_recent_form = pak_home_games['AwayTeamForm'].iloc[0]

print(f"India Recent Form: Won {ind_recent_form * 5: .0f} of last 5 games")
print(f"Pakistan Recent Form: Won {pak_recent_form * 5: .0f} of last 5 games")

India (ELO: 2860) vs Pakistan (ELO: 2604)
India Recent Form: Won  3 of last 5 games
Pakistan Recent Form: Won  2 of last 5 games


In [403]:
#Calculate features for this new match
new_elo_difference = ind_elo_current - pak_elo_current
new_home_advantage = 1
new_recent_form_home = ind_recent_form
new_recent_form_away = pak_recent_form

#Create a DataFrame for prediction
new_match_features = pd.DataFrame([[new_recent_form_home, new_recent_form_away, new_home_advantage, new_elo_difference]],
                                  columns=['HomeTeamForm', 'AwayTeamForm', 'home_advantage', 'elo_difference'])

new_match_features

Unnamed: 0,HomeTeamForm,AwayTeamForm,home_advantage,elo_difference
0,0.6,0.4,1,255.397251


In [404]:
#Predict
prob_home_win = model.predict_proba(new_match_features)[:, 1][0]
prediction = model.predict(new_match_features)[0]
outcome = "Home Team Wins" if prediction == 1 else "Home Team Loses"

print(f"  Predicted probability of India winning: {prob_home_win:.4f}")
print(f"  Predicted outcome: {outcome}")

  Predicted probability of India winning: 1.0000
  Predicted outcome: Home Team Wins


In [405]:
#Looking at above, the model is giving an extreme probability. Let's add an extra feature to help with this.

In [406]:
#Adding More Features to help with fixing extreme probability
test_data['Victory Type'] = np.where(test_data['Margin'].str.contains("wickets", na=False), test_data['Margin'], np.nan)

test_data['Victory Type'] = test_data['Victory Type'].replace('NaN', 'Runs', regex=True)
test_data['Victory Type'] = test_data['Victory Type'].fillna('Runs')

test_data.loc[test_data['Victory Type'].str.contains("wickets"), 'Victory Type'] = "Wickets"

test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO,elo_difference,Victory Type
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000,0.000000,Runs
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000,0.000000,Runs
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0,1529.427328,1484.572672,44.854656,Wickets
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0,1484.572672,1556.631645,-72.058973,Wickets
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0,1500.000000,1500.000000,0.000000,Wickets
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1,2539.003643,2630.580497,-91.576854,Runs
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0,2573.147440,2610.436700,-37.289260,Runs
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0,2703.159316,2919.004293,-215.844977,Runs
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0,2743.153313,2893.010296,-149.856982,Wickets


In [407]:
#REMEMBER:
# 0 -> Win By Wickets (Bowled First)
# 1 -> Win By Runs (Batted First)

test_data['Victory Type'] = (test_data['Victory Type'] == "Runs").astype(int)
test_data

Unnamed: 0,HomeTeam,AwayTeam,Winner,Margin,Ground,Match Date,Match ID,Format,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO,elo_difference,Victory Type
0,Australia,India,Australia,inns & 141 runs,Sydney,"Jan 2-4, 2000",1,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000,0.000000,1
1,South Africa,England,South Africa,inns & 37 runs,Cape Town,"Jan 2-5, 2000",2,Test,2000,0.5,0.5,1,1,1500.000000,1500.000000,0.000000,1
2,South Africa,England,England,2 wickets,Centurion,"Jan 14-18, 2000",3,Test,2000,1.0,0.0,1,0,1529.427328,1484.572672,44.854656,0
3,India,South Africa,South Africa,4 wickets,Wankhede,"Feb 24-26, 2000",4,Test,2000,0.0,0.5,1,0,1484.572672,1556.631645,-72.058973,0
4,Pakistan,Sri Lanka,Sri Lanka,2 wickets,Rawalpindi,"Feb 26-Mar 1, 2000",5,Test,2000,0.5,0.5,1,0,1500.000000,1500.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,Pakistan,West Indies,Pakistan,127 runs,Multan,"Jan 17-19, 2025",872,Test,2025,0.4,0.2,1,1,2539.003643,2630.580497,-91.576854,1
872,Pakistan,West Indies,West Indies,120 runs,Multan,"Jan 25-27, 2025",873,Test,2025,0.6,0.2,1,0,2573.147440,2610.436700,-37.289260,1
873,Sri Lanka,Australia,Australia,inns & 242 runs,Galle,"Jan 29-Feb 1, 2025",874,Test,2025,0.6,0.8,1,0,2703.159316,2919.004293,-215.844977,1
874,Sri Lanka,Australia,Australia,9 wickets,Galle,"Feb 6-9, 2025",875,Test,2025,0.4,0.8,1,0,2743.153313,2893.010296,-149.856982,0


In [408]:
test_data.describe()

Unnamed: 0,Match ID,Year,HomeTeamForm,AwayTeamForm,home_advantage,HomeTeamWin,Home ELO,Away ELO,elo_difference,Victory Type
count,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0,876.0
mean,438.5,2012.105023,0.499715,0.498916,1.0,0.619863,2133.624429,2102.651029,30.9734,0.664384
std,253.023714,7.520208,0.29198,0.287317,0.0,0.485698,383.949873,384.277357,218.574713,0.472475
min,1.0,2000.0,0.0,0.0,1.0,0.0,1455.941027,1467.290658,-973.821636,0.0
25%,219.75,2005.0,0.2,0.2,1.0,0.0,1803.668001,1783.33687,-80.297319,0.0
50%,438.5,2012.0,0.6,0.6,1.0,1.0,2122.307504,2064.456461,17.69657,1.0
75%,657.25,2018.0,0.8,0.8,1.0,1.0,2468.810491,2420.606824,120.32128,1.0
max,876.0,2025.0,1.0,1.0,1.0,1.0,2911.005973,2934.294807,1154.864563,1.0


In [409]:
X = test_data[['HomeTeamForm', 'AwayTeamForm', 'home_advantage', 'elo_difference', 'Victory Type']]
y = test_data['HomeTeamWin']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1, stratify=y)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

X_train


Training set size: 700 samples
Testing set size: 176 samples


Unnamed: 0,HomeTeamForm,AwayTeamForm,home_advantage,elo_difference,Victory Type
570,0.20,0.6,1,189.012458,0
120,0.60,0.6,1,52.479803,0
639,0.60,0.6,1,-233.618738,1
6,0.00,1.0,1,44.854656,1
170,0.80,0.6,1,-61.338024,0
...,...,...,...,...,...
9,0.50,0.5,1,0.000000,1
599,0.20,0.8,1,102.851736,1
27,0.25,0.0,1,5.111725,0
691,1.00,0.2,1,191.274943,1


In [410]:
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of Home Team winning (class 1)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6818
