In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# 1. Load CSVs (already uploaded)
deliveries = pd.read_csv("deliveries.csv")
matches = pd.read_csv("matches.csv")

In [3]:
deliveries.head(1)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,


In [4]:
matches.head(1)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen


In [5]:
deliveries.isnull().sum()

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batter                   0
bowler                   0
non_striker              0
batsman_runs             0
extra_runs               0
total_runs               0
extras_type         246795
is_wicket                0
player_dismissed    247970
dismissal_kind      247970
fielder             251566
dtype: int64

In [6]:
matches.isna().sum()

id                    0
season                0
city                 51
date                  0
match_type            0
player_of_match       5
venue                 0
team1                 0
team2                 0
toss_winner           0
toss_decision         0
winner                5
result                0
result_margin        19
target_runs           3
target_overs          3
super_over            0
method             1074
umpire1               0
umpire2               0
dtype: int64

In [7]:
matches[matches['winner'].isnull()]

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
241,501265,2011,Delhi,2011-05-21,League,,Feroz Shah Kotla,Delhi Daredevils,Pune Warriors,Delhi Daredevils,bat,,no result,,,,N,,SS Hazare,RJ Tucker
485,829763,2015,Bangalore,2015-04-29,League,,M Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,,no result,,,,N,,JD Cloete,PG Pathak
511,829813,2015,Bangalore,2015-05-17,League,,M Chinnaswamy Stadium,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,field,,no result,,188.0,20.0,N,,HDPK Dharmasena,K Srinivasan
744,1178424,2019,Bengaluru,2019-04-30,League,,M.Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,,no result,,63.0,5.0,N,,NJ Llong,UV Gandhe
994,1359519,2023,Lucknow,2023-05-03,League,,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,Lucknow Super Giants,Chennai Super Kings,Chennai Super Kings,field,,no result,,,,N,,AK Chaudhary,NA Patwardhan


In [8]:
# droping those rows where winner is null
matches = matches.dropna(subset=['winner'])

In [9]:
# 2. Prepare match outcome data
matches_clean = matches.rename(columns={"id": "match_id"}).copy()

In [10]:
# creating a column which indicates if team1 won the match
matches_clean["won"] = (matches_clean["winner"] == matches_clean["team1"]).astype(int)


In [11]:
# seprate the data of first inngs from deliveries
first_innings = deliveries[deliveries["inning"] == 1].copy()

In [12]:
# aggeragating the data of first inngs


agg_features = first_innings.groupby("match_id").agg(
    total_runs_1st_inns=("total_runs", "sum"),
    wickets_1st_inns=("is_wicket", "sum"),
    dot_balls=("total_runs", lambda x: (x == 0).sum()),
    total_balls=("ball", "count"),
    powerplay_runs=("total_runs", lambda x: x[first_innings["over"] <= 6].sum()),
    death_overs_runs=("total_runs", lambda x: x[first_innings["over"] >= 16].sum()),
    extras_given=("extra_runs", "sum")
).reset_index()

In [13]:
# generating a column wich indicates dot balls ratio


agg_features["dot_ball_ratio"] = agg_features["dot_balls"] / agg_features["total_balls"]
agg_features.drop(columns=["dot_balls", "total_balls"], inplace=True)

In [14]:

# 4. Merge match info with delivery dataset
match_model_df = matches_clean.merge(agg_features, on="match_id")

In [15]:
pd.set_option('display.max_columns', None)  # Show all columns in the DataFrame

match_model_df.head(2)

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2,won,total_runs_1st_inns,wickets_1st_inns,powerplay_runs,death_overs_runs,extras_given,dot_ball_ratio
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen,0,222,3,68,64,17,0.290323
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri,0,240,5,62,71,6,0.274194


In [16]:
# 5. Add context columns
match_model_df["chasing_team"] = match_model_df["team2"]
match_model_df["bowling_team"] = match_model_df["team1"]
match_model_df["venue"] = match_model_df["venue"]


In [17]:
match_model_df.head(2)

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2,won,total_runs_1st_inns,wickets_1st_inns,powerplay_runs,death_overs_runs,extras_given,dot_ball_ratio,chasing_team,bowling_team
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen,0,222,3,68,64,17,0.290323,Kolkata Knight Riders,Royal Challengers Bangalore
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri,0,240,5,62,71,6,0.274194,Chennai Super Kings,Kings XI Punjab


In [18]:
# 6. Compute team strengths
match_stats = matches_clean.copy()
match_stats["team1_win"] = (match_stats["team1"] == match_stats["winner"]).astype(int)
match_stats["team2_win"] = (match_stats["team2"] == match_stats["winner"]).astype(int)

team1_strength = match_stats.groupby("team1")["team1_win"].mean().reset_index().rename(
    columns={"team1": "team", "team1_win": "chasing_strength"})
match_model_df = match_model_df.merge(team1_strength, left_on="chasing_team", right_on="team", how="left").drop(columns=["team"])


In [19]:
match_model_df.head(2)

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2,won,total_runs_1st_inns,wickets_1st_inns,powerplay_runs,death_overs_runs,extras_given,dot_ball_ratio,chasing_team,bowling_team,chasing_strength
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen,0,222,3,68,64,17,0.290323,Kolkata Knight Riders,Royal Challengers Bangalore,0.53719
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri,0,240,5,62,71,6,0.274194,Chennai Super Kings,Kings XI Punjab,0.585938


### Some column engineering

In [20]:
# 7. Head-to-head win rate
matches_stats = matches.copy()
matches_stats["match_id"] = matches_stats["id"]
matches_stats["date"] = pd.to_datetime(matches_stats["date"], errors='coerce')
matches_stats = matches_stats.dropna(subset=["winner", "team1", "team2"])
matches_stats["chasing_team"] = matches_stats["team2"]
matches_stats["bowling_team"] = matches_stats["team1"]
matches_stats["chasing_win"] = (matches_stats["chasing_team"] == matches_stats["winner"]).astype(int)


In [21]:
h2h_df = matches_stats.groupby(["chasing_team", "bowling_team"]).agg(
    total_matches=("match_id", "count"),
    wins=("chasing_win", "sum")
).reset_index()
h2h_df["h2h_winrate"] = h2h_df["wins"] / h2h_df["total_matches"]
match_model_df = match_model_df.merge(h2h_df[["chasing_team", "bowling_team", "h2h_winrate"]],
                                       on=["chasing_team", "bowling_team"], how="left")

In [22]:
h2h_df.head(2)

Unnamed: 0,chasing_team,bowling_team,total_matches,wins,h2h_winrate
0,Chennai Super Kings,Deccan Chargers,3,2,0.666667
1,Chennai Super Kings,Delhi Capitals,5,3,0.6


In [23]:
# 8. Venue win rate
venue_df = matches_stats.groupby(["chasing_team", "venue"]).agg(
    venue_matches=("match_id", "count"),
    venue_wins=("chasing_win", "sum")
).reset_index()
venue_df["venue_winrate"] = venue_df["venue_wins"] / venue_df["venue_matches"]
match_model_df = match_model_df.merge(venue_df[["chasing_team", "venue", "venue_winrate"]],
                                       on=["chasing_team", "venue"], how="left")

In [24]:
match_model_df.head(2)

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2,won,total_runs_1st_inns,wickets_1st_inns,powerplay_runs,death_overs_runs,extras_given,dot_ball_ratio,chasing_team,bowling_team,chasing_strength,h2h_winrate,venue_winrate
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen,0,222,3,68,64,17,0.290323,Kolkata Knight Riders,Royal Challengers Bangalore,0.53719,0.526316,0.5
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri,0,240,5,62,71,6,0.274194,Chennai Super Kings,Kings XI Punjab,0.585938,0.636364,0.75


In [25]:

# 9. Recent form: last 5 matches
matches_stats = matches_stats.sort_values(["chasing_team", "date"])
recent_form = matches_stats[["match_id", "chasing_team", "date", "chasing_win"]].copy()
recent_form["chasing_recent_winrate"] = (
    recent_form.groupby("chasing_team")["chasing_win"]
    .shift()
    .rolling(window=5)
    .mean()
    .reset_index(drop=True)
)

In [26]:

match_model_df = match_model_df.merge(
    recent_form[["match_id", "chasing_recent_winrate"]],
    on="match_id", how="left"
)

match_model_df.head(2)

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2,won,total_runs_1st_inns,wickets_1st_inns,powerplay_runs,death_overs_runs,extras_given,dot_ball_ratio,chasing_team,bowling_team,chasing_strength,h2h_winrate,venue_winrate,chasing_recent_winrate
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,,Asad Rauf,RE Koertzen,0,222,3,68,64,17,0.290323,Kolkata Knight Riders,Royal Challengers Bangalore,0.53719,0.526316,0.5,
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,241.0,20.0,N,,MR Benson,SL Shastri,0,240,5,62,71,6,0.274194,Chennai Super Kings,Kings XI Punjab,0.585938,0.636364,0.75,


In [27]:
# Drop rows with missing values
model_data = match_model_df.dropna(subset=[
    "chasing_team", "bowling_team", "venue", "target_runs", "total_runs_1st_inns",
    "dot_ball_ratio", "extras_given", "wickets_1st_inns", "chasing_strength",
    "h2h_winrate", "venue_winrate"
]).copy()

# Fill missing recent form
model_data["chasing_recent_winrate"] = model_data["chasing_recent_winrate"].fillna(
    model_data["chasing_recent_winrate"].median()
)


In [28]:

# Label encoding
le_team = LabelEncoder()
le_venue = LabelEncoder()

model_data["chasing_team_encoded"] = le_team.fit_transform(model_data["chasing_team"])
model_data["bowling_team_encoded"] = le_team.transform(model_data["bowling_team"])
model_data["venue_encoded"] = le_venue.fit_transform(model_data["venue"])


In [29]:

# Save mappings for decoding
team_mapping = dict(zip(le_team.classes_, le_team.transform(le_team.classes_)))
venue_mapping = dict(zip(le_venue.classes_, le_venue.transform(le_venue.classes_)))
reverse_team_mapping = {v: k for k, v in team_mapping.items()}
reverse_venue_mapping = {v: k for k, v in venue_mapping.items()}

In [30]:
# Features and target
features = [
    "chasing_team_encoded", "bowling_team_encoded", "venue_encoded",
    "target_runs", "total_runs_1st_inns", "dot_ball_ratio", "extras_given",
    "wickets_1st_inns", "chasing_strength", "h2h_winrate", "venue_winrate",
    "chasing_recent_winrate"
]
X = model_data[features]
y = model_data["won"]

In [31]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [33]:
# Predict and evaluate
y_pred = clf.predict(X_test)

In [34]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7339449541284404


In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73       110
           1       0.72      0.75      0.74       108

    accuracy                           0.73       218
   macro avg       0.73      0.73      0.73       218
weighted avg       0.73      0.73      0.73       218



In [36]:
# Create a sample match input using real encoded values
sample_input = {
    "chasing_team": "Mumbai Indians",
    "bowling_team": "Chennai Super Kings",
    "venue": "Wankhede Stadium",
    "target_runs": 175,
    "total_runs_1st_inns": 174,
    "dot_ball_ratio": 0.30,
    "extras_given": 12,
    "wickets_1st_inns": 6,
    "chasing_strength": 0.62,
    "h2h_winrate": 0.55,
    "venue_winrate": 0.60,
    "chasing_recent_winrate": 0.6
}

# Encode input
encoded_sample = {
    "chasing_team_encoded": team_mapping.get(sample_input["chasing_team"], -1),
    "bowling_team_encoded": team_mapping.get(sample_input["bowling_team"], -1),
    "venue_encoded": venue_mapping.get(sample_input["venue"], -1),
    "target_runs": sample_input["target_runs"],
    "total_runs_1st_inns": sample_input["total_runs_1st_inns"],
    "dot_ball_ratio": sample_input["dot_ball_ratio"],
    "extras_given": sample_input["extras_given"],
    "wickets_1st_inns": sample_input["wickets_1st_inns"],
    "chasing_strength": sample_input["chasing_strength"],
    "h2h_winrate": sample_input["h2h_winrate"],
    "venue_winrate": sample_input["venue_winrate"],
    "chasing_recent_winrate": sample_input["chasing_recent_winrate"]
}

# Predict
input_df = pd.DataFrame([encoded_sample])
prediction = clf.predict(input_df)[0]
prediction_prob = clf.predict_proba(input_df)[0][prediction]
predicted_team = sample_input["chasing_team"] if prediction == 1 else sample_input["bowling_team"]

prediction, prediction_prob, predicted_team


(0, 0.7, 'Chennai Super Kings')