In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [21]:
import pandas as pd
import numpy as np
from pathlib import Path

BASE_DIR = Path().resolve().parent
RAW_DIR = BASE_DIR / "data" / "raw"

matches = pd.read_csv(RAW_DIR / "matches.csv")
deliveries = pd.read_csv(RAW_DIR / "deliveries.csv")

matches["date"] = pd.to_datetime(matches["date"])

print("Matches shape:", matches.shape)
print("Deliveries shape:", deliveries.shape)

Matches shape: (1095, 20)
Deliveries shape: (260920, 17)


In [26]:
print("Deliveries columns:")
print(deliveries.columns.tolist())

print("\nMatches columns:")
print(matches.columns.tolist())

Deliveries columns:
['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs', 'total_runs', 'extras_type', 'is_wicket', 'player_dismissed', 'dismissal_kind', 'fielder', 'id_x', 'date_x', 'venue_x', 'team1_x', 'team2_x', 'id_y', 'date_y', 'venue_y', 'team1_y', 'team2_y']

Matches columns:
['id', 'season', 'city', 'date', 'match_type', 'player_of_match', 'venue', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'result', 'result_margin', 'target_runs', 'target_overs', 'super_over', 'method', 'umpire1', 'umpire2']


In [27]:
deliveries = deliveries.merge(
    matches[["id","date","venue","team1","team2"]],
    left_on="match_id",
    right_on="id",
    how="left"
)

print("After merge:", deliveries.shape)
print(deliveries.head())

After merge: (260920, 32)
   match_id  inning           batting_team                 bowling_team  over  \
0    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
1    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
2    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
3    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
4    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   

   ball       batter   bowler  non_striker  batsman_runs  ...    id_y  \
0     1   SC Ganguly  P Kumar  BB McCullum             0  ...  335982   
1     2  BB McCullum  P Kumar   SC Ganguly             0  ...  335982   
2     3  BB McCullum  P Kumar   SC Ganguly             0  ...  335982   
3     4  BB McCullum  P Kumar   SC Ganguly             0  ...  335982   
4     5  BB McCullum  P Kumar   SC Ganguly             0  ...  335982   

      date_y                venue_y             

In [28]:
batter_match = deliveries.groupby(
    ["match_id","batter","date","venue","team1","team2"],
    as_index=False
).agg(
    runs=("batsman_runs","sum"),
    balls=("ball","count")
)

batter_match["strike_rate"] = (
    batter_match["runs"] / batter_match["balls"] * 100
)

batter_match = batter_match.sort_values(["batter","date"])

print("Batter match shape:", batter_match.shape)
batter_match.head()

Batter match shape: (16515, 9)


Unnamed: 0,match_id,batter,date,venue,team1,team2,runs,balls,strike_rate
4299,548346,A Ashish Reddy,2012-04-29,Wankhede Stadium,Mumbai Indians,Deccan Chargers,10,10,100.0
4390,548352,A Ashish Reddy,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,3,3,100.0
4496,548359,A Ashish Reddy,2012-05-08,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,8,8,100.0
4699,548373,A Ashish Reddy,2012-05-18,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,10,4,250.0
4747,548376,A Ashish Reddy,2012-05-20,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,4,5,80.0


In [29]:
batter_match["runs_last_5_avg"] = (
    batter_match.groupby("batter")["runs"]
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(0, drop=True)
)

batter_match["runs_last_10_avg"] = (
    batter_match.groupby("batter")["runs"]
    .rolling(10, min_periods=1)
    .mean()
    .reset_index(0, drop=True)
)

print("After rolling:", batter_match.shape)
batter_match.head()

After rolling: (16515, 11)


Unnamed: 0,match_id,batter,date,venue,team1,team2,runs,balls,strike_rate,runs_last_5_avg,runs_last_10_avg
4299,548346,A Ashish Reddy,2012-04-29,Wankhede Stadium,Mumbai Indians,Deccan Chargers,10,10,100.0,10.0,10.0
4390,548352,A Ashish Reddy,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,3,3,100.0,6.5,6.5
4496,548359,A Ashish Reddy,2012-05-08,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,8,8,100.0,7.0,7.0
4699,548373,A Ashish Reddy,2012-05-18,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,10,4,250.0,7.75,7.75
4747,548376,A Ashish Reddy,2012-05-20,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,4,5,80.0,7.0,7.0


In [30]:
batter_match["career_runs_avg"] = (
    batter_match.groupby("batter")["runs"]
    .expanding()
    .mean()
    .reset_index(0, drop=True)
)

batter_match["venue_runs_avg"] = (
    batter_match.groupby(["batter","venue"])["runs"]
    .transform("mean")
)

batter_match["pvt_runs_avg"] = (
    batter_match.groupby(["batter","team2"])["runs"]
    .transform("mean")
)

print("After venue/opponent:", batter_match.shape)

After venue/opponent: (16515, 14)


In [31]:
batter_match["target_next_runs"] = (
    batter_match.groupby("batter")["runs"].shift(-1)
)

print("Before dropna:", batter_match.shape)

batter_match = batter_match.dropna()

print("After dropna:", batter_match.shape)

Before dropna: (16515, 15)
After dropna: (15842, 15)


In [32]:
PROCESSED_DIR = BASE_DIR / "data" / "processed"

batter_match.to_csv(
    PROCESSED_DIR / "dataset_new.csv",
    index=False
)

print("Saved dataset_new.csv")

Saved dataset_new.csv


In [33]:
import pandas as pd
import numpy as np
from pathlib import Path

In [34]:
BASE_DIR = Path().resolve().parent
RAW_DIR = BASE_DIR / "data" / "raw"
PROCESSED_DIR = BASE_DIR / "data" / "processed"

matches = pd.read_csv(RAW_DIR / "matches.csv")
deliveries = pd.read_csv(RAW_DIR / "deliveries.csv")

matches["date"] = pd.to_datetime(matches["date"])

print("Matches:", matches.shape)
print("Deliveries:", deliveries.shape)

Matches: (1095, 20)
Deliveries: (260920, 17)


In [35]:
# Merge match info into deliveries
deliveries = deliveries.merge(
    matches[["id","date","venue","team1","team2"]],
    left_on="match_id",
    right_on="id",
    how="left"
)

print("After merge:", deliveries.shape)
print("Date missing:", deliveries["date"].isna().sum())

After merge: (260920, 22)
Date missing: 0


In [36]:
# Ensure wicket column is clean
deliveries["is_wicket"] = deliveries["is_wicket"].fillna(0).astype(int)

bowler_match = deliveries.groupby(
    ["match_id","bowler","date","venue","team1","team2"],
    as_index=False
).agg(
    balls=("ball","count"),
    runs=("total_runs","sum"),
    wickets=("is_wicket","sum")
)

print("Bowler match shape:", bowler_match.shape)
bowler_match.head()

Bowler match shape: (12978, 9)


Unnamed: 0,match_id,bowler,date,venue,team1,team2,balls,runs,wickets
0,335982,AA Noffke,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,25,41,1
1,335982,AB Agarkar,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,28,25,3
2,335982,AB Dinda,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,20,9,2
3,335982,CL White,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,7,24,0
4,335982,I Sharma,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,19,13,1


In [37]:
bowler_match["overs"] = bowler_match["balls"] / 6

bowler_match["economy"] = (
    bowler_match["runs"] / bowler_match["overs"]
)

bowler_match["economy"] = bowler_match["economy"].replace(
    [np.inf, -np.inf], 0
)

bowler_match = bowler_match.sort_values(
    ["bowler","date"]
)

print("After overs/economy:", bowler_match.shape)

After overs/economy: (12978, 11)


In [38]:
bowler_match["wickets_last_5"] = (
    bowler_match.groupby("bowler")["wickets"]
    .rolling(5, min_periods=1)
    .mean()
    .reset_index(0, drop=True)
)

print("After rolling:", bowler_match.shape)

After rolling: (12978, 12)


In [39]:
bowler_match["career_wickets_avg"] = (
    bowler_match.groupby("bowler")["wickets"]
    .expanding()
    .mean()
    .reset_index(0, drop=True)
)

In [40]:
bowler_match["venue_wickets_avg"] = (
    bowler_match.groupby(["bowler","venue"])["wickets"]
    .transform("mean")
)

In [41]:
bowler_match["target_next_wickets"] = (
    bowler_match.groupby("bowler")["wickets"]
    .shift(-1)
)

print("Before dropna:", bowler_match.shape)

bowler_match = bowler_match.dropna()

print("After dropna:", bowler_match.shape)

Before dropna: (12978, 15)
After dropna: (12448, 15)


In [42]:
print("Final columns:")
print(bowler_match.columns.tolist())

Final columns:
['match_id', 'bowler', 'date', 'venue', 'team1', 'team2', 'balls', 'runs', 'wickets', 'overs', 'economy', 'wickets_last_5', 'career_wickets_avg', 'venue_wickets_avg', 'target_next_wickets']


In [43]:
bowler_match.to_csv(
    PROCESSED_DIR / "bowler_dataset.csv",
    index=False
)

print("✅ bowler_dataset.csv saved successfully")

✅ bowler_dataset.csv saved successfully
