In [1]:
import os
import sys
from pathlib import Path

PROJECT_DIR = Path(os.getcwd()).parent
sys.path.insert(0, str(PROJECT_DIR))

print("PROJECT_DIR:", PROJECT_DIR)


PROJECT_DIR: c:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction


In [2]:
import pandas as pd
import numpy as np

from src.data_cleaning import run_cleaning_pipeline
from src.feature_engineering import run_feature_engineering
from src.feature_pipeline import save_pipeline
from src.config import PROCESSED_DIR


In [3]:
matches_clean, deliveries_clean = run_cleaning_pipeline()

print("Matches clean:", matches_clean.shape)
print("Deliveries clean:", deliveries_clean.shape)


Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\raw\matches.csv
Shape: (1095, 20)
Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\raw\deliveries.csv
Shape: (260920, 17)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\matches_clean.csv | Shape: (1095, 7)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\deliveries_clean.csv | Shape: (260920, 17)
Matches clean: (1095, 7)
Deliveries clean: (260920, 17)


In [4]:
dataset = run_feature_engineering()
print("Shape:", dataset.shape)
print("Has target_next_runs?:", "target_next_runs" in dataset.columns)
dataset.head()


Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\dataset.csv | Shape: (15842, 17)
Shape: (15842, 17)
Has target_next_runs?: True


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg,pvt_runs_avg,pvp_runs_avg,target_next_runs
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0,0.0,0.0,8.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0,0.0,0.0,10.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0,0.0,0.0,4.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0,0.0,0.0,7.0


In [5]:
print(dataset.columns.tolist())


['match_id', 'batter', 'runs', 'balls_faced', 'date', 'season', 'venue', 'team1', 'team2', 'winner', 'runs_last_5_avg', 'runs_last_10_avg', 'career_runs_avg', 'venue_runs_avg', 'pvt_runs_avg', 'pvp_runs_avg', 'target_next_runs']


In [6]:
dataset.columns


Index(['match_id', 'batter', 'runs', 'balls_faced', 'date', 'season', 'venue',
       'team1', 'team2', 'winner', 'runs_last_5_avg', 'runs_last_10_avg',
       'career_runs_avg', 'venue_runs_avg', 'pvt_runs_avg', 'pvp_runs_avg',
       'target_next_runs'],
      dtype='object')

In [7]:
dataset.isnull().sum().sort_values(ascending=False).head(20)


winner              38
match_id             0
batter               0
balls_faced          0
runs                 0
season               0
venue                0
team1                0
date                 0
team2                0
runs_last_5_avg      0
runs_last_10_avg     0
career_runs_avg      0
venue_runs_avg       0
pvt_runs_avg         0
pvp_runs_avg         0
target_next_runs     0
dtype: int64

In [8]:
dataset["date"] = pd.to_datetime(dataset["date"], errors="coerce")
dataset = dataset.dropna(subset=["date"])
dataset = dataset.sort_values("date").reset_index(drop=True)

split_idx = int(len(dataset) * 0.8)

train_df = dataset.iloc[:split_idx].copy()
test_df = dataset.iloc[split_idx:].copy()

print("Train:", train_df.shape)
print("Test:", test_df.shape)
print("Train dates:", train_df["date"].min(), "->", train_df["date"].max())
print("Test dates:", test_df["date"].min(), "->", test_df["date"].max())


Train: (12673, 17)
Test: (3169, 17)
Train dates: 2008-04-18 00:00:00 -> 2021-10-15 00:00:00
Test dates: 2022-03-26 00:00:00 -> 2024-05-24 00:00:00


In [9]:
save_pipeline()


✅ Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\models\feature_pipeline.pkl


In [10]:
top_batters = dataset.groupby("batter")["runs"].sum().sort_values(ascending=False).head(10)
top_batters


batter
V Kohli           7981
S Dhawan          6755
DA Warner         6566
RG Sharma         6562
SK Raina          5533
MS Dhoni          5218
AB de Villiers    5170
CH Gayle          4996
RV Uthappa        4953
KD Karthik        4832
Name: runs, dtype: int64

In [11]:
sample_batter = dataset["batter"].value_counts().index[0]
print("Sample Batter:", sample_batter)

dataset[dataset["batter"] == sample_batter][
    ["date", "match_id", "batter", "runs", "runs_last_5_avg", "runs_last_10_avg", "career_runs_avg", "venue", "venue_runs_avg"]
].head(20)


Sample Batter: RG Sharma


Unnamed: 0,date,match_id,batter,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue,venue_runs_avg
47,2008-04-20,335986,RG Sharma,0,0.0,0.0,0.0,Eden Gardens,0.0
96,2008-04-22,335988,RG Sharma,66,0.0,0.0,0.0,"Rajiv Gandhi International Stadium, Uppal",0.0
131,2008-04-24,335990,RG Sharma,36,33.0,33.0,33.0,"Rajiv Gandhi International Stadium, Uppal",66.0
274,2008-05-01,335999,RG Sharma,76,34.0,34.0,34.0,"Rajiv Gandhi International Stadium, Uppal",51.0
324,2008-05-03,336034,RG Sharma,57,44.5,44.5,44.5,M Chinnaswamy Stadium,0.0
385,2008-05-06,336007,RG Sharma,23,47.0,47.0,47.0,"MA Chidambaram Stadium, Chepauk",0.0
439,2008-05-09,336011,RG Sharma,5,51.6,43.0,43.0,Sawai Mansingh Stadium,0.0
477,2008-05-11,336014,RG Sharma,33,39.4,37.571429,37.571429,"Rajiv Gandhi International Stadium, Uppal",59.333333
549,2008-05-15,336020,RG Sharma,35,38.8,37.0,37.0,Feroz Shah Kotla,0.0
608,2008-05-18,336024,RG Sharma,6,30.6,36.777778,36.777778,"Rajiv Gandhi International Stadium, Uppal",52.75


In [12]:
sample_df = dataset[dataset["batter"] == sample_batter].copy()
sample_df = sample_df.sort_values("date")

sample_df[["date", "runs", "runs_last_5_avg", "runs_last_10_avg", "career_runs_avg"]].head(15)


Unnamed: 0,date,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg
47,2008-04-20,0,0.0,0.0,0.0
96,2008-04-22,66,0.0,0.0,0.0
131,2008-04-24,36,33.0,33.0,33.0
274,2008-05-01,76,34.0,34.0,34.0
324,2008-05-03,57,44.5,44.5,44.5
385,2008-05-06,23,47.0,47.0,47.0
439,2008-05-09,5,51.6,43.0,43.0
477,2008-05-11,33,39.4,37.571429,37.571429
549,2008-05-15,35,38.8,37.0,37.0
608,2008-05-18,6,30.6,36.777778,36.777778


In [13]:
import os

print("Processed files:")
print(os.listdir(PROCESSED_DIR))

Processed files:
['bowler_match_features.csv', 'dataset.csv', 'deliveries_clean.csv', 'matches_clean.csv']


In [14]:
final_path = PROCESSED_DIR / "dataset.csv"
print("✅ Final dataset saved at:", final_path)

check_df = pd.read_csv(final_path)
print("Loaded back:", check_df.shape)
check_df.head()

✅ Final dataset saved at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\dataset.csv
Loaded back: (15842, 17)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg,pvt_runs_avg,pvp_runs_avg,target_next_runs
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0,0.0,0.0,8.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0,0.0,0.0,10.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0,0.0,0.0,4.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0,0.0,0.0,7.0


In [15]:
path = PROCESSED_DIR / "dataset.csv"
print("Saved dataset at:", path)

check = pd.read_csv(path)
print("Loaded back:", check.shape)
check.head()


Saved dataset at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\dataset.csv
Loaded back: (15842, 17)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg,pvt_runs_avg,pvp_runs_avg,target_next_runs
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0,0.0,0.0,8.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0,0.0,0.0,10.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0,0.0,0.0,4.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0,0.0,0.0,7.0


In [23]:
import pandas as pd
import numpy as np

In [24]:
# Load cleaned datasets
deliveries = pd.read_csv("../data/processed/deliveries_clean.csv")
matches = pd.read_csv("../data/processed/matches_clean.csv")

print("Deliveries:", deliveries.shape)
print("Matches:", matches.shape)

deliveries.head()

Deliveries: (260920, 17)
Matches: (1095, 7)


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


In [25]:
# Convert wicket flag to int
deliveries["wicket"] = deliveries["is_wicket"].astype(int)

deliveries[["bowler", "wicket"]].head()

Unnamed: 0,bowler,wicket
0,P Kumar,0
1,P Kumar,0
2,P Kumar,0
3,P Kumar,0
4,P Kumar,0


In [26]:
# Aggregate at bowler-match level
bowler_match = (
    deliveries
    .groupby(["match_id", "bowler"], as_index=False)
    .agg(
        total_wickets=("wicket", "sum"),
        runs_conceded=("total_runs", "sum"),
        balls_bowled=("ball", "count")
    )
)

bowler_match.head()

Unnamed: 0,match_id,bowler,total_wickets,runs_conceded,balls_bowled
0,335982,AA Noffke,1,41,25
1,335982,AB Agarkar,3,25,28
2,335982,AB Dinda,2,9,20
3,335982,CL White,0,24,7
4,335982,I Sharma,1,13,19


In [27]:
# Calculate overs
bowler_match["overs"] = bowler_match["balls_bowled"] / 6

# Economy rate
bowler_match["economy"] = (
    bowler_match["runs_conceded"] / bowler_match["overs"]
)

bowler_match.head()

Unnamed: 0,match_id,bowler,total_wickets,runs_conceded,balls_bowled,overs,economy
0,335982,AA Noffke,1,41,25,4.166667,9.84
1,335982,AB Agarkar,3,25,28,4.666667,5.357143
2,335982,AB Dinda,2,9,20,3.333333,2.7
3,335982,CL White,0,24,7,1.166667,20.571429
4,335982,I Sharma,1,13,19,3.166667,4.105263


In [29]:
print(matches.columns)

Index(['id', 'season', 'date', 'venue', 'team1', 'team2', 'winner'], dtype='object')


In [30]:
# Standardize match id
matches = matches.rename(columns={"id": "match_id"})

In [31]:
# Merge match date
bowler_match = bowler_match.merge(
    matches[["match_id", "date"]],
    on="match_id",
    how="left"
)

# Convert date to datetime
bowler_match["date"] = pd.to_datetime(bowler_match["date"])

# Sort chronologically
bowler_match = bowler_match.sort_values(["bowler", "date"])

bowler_match.head()

Unnamed: 0,match_id,bowler,total_wickets,runs_conceded,balls_bowled,overs,economy,date
3384,548341,A Ashish Reddy,2,32,24,4.0,8.0,2012-04-26
3442,548346,A Ashish Reddy,1,11,14,2.333333,4.714286,2012-04-29
3468,548348,A Ashish Reddy,1,32,19,3.166667,10.105263,2012-05-01
3518,548352,A Ashish Reddy,1,16,13,2.166667,7.384615,2012-05-04
3568,548356,A Ashish Reddy,1,36,25,4.166667,8.64,2012-05-06


In [32]:
# Rolling average wickets (last 5)
bowler_match["rolling_wkts_5"] = (
    bowler_match
    .groupby("bowler")["total_wickets"]
    .transform(lambda x: x.shift(1).rolling(5).mean())
)

# Rolling average wickets (last 10)
bowler_match["rolling_wkts_10"] = (
    bowler_match
    .groupby("bowler")["total_wickets"]
    .transform(lambda x: x.shift(1).rolling(10).mean())
)

bowler_match[[
    "bowler",
    "total_wickets",
    "rolling_wkts_5",
    "rolling_wkts_10"
]].head(10)

Unnamed: 0,bowler,total_wickets,rolling_wkts_5,rolling_wkts_10
3384,A Ashish Reddy,2,,
3442,A Ashish Reddy,1,,
3468,A Ashish Reddy,1,,
3518,A Ashish Reddy,1,,
3568,A Ashish Reddy,1,,
3604,A Ashish Reddy,2,1.2,
3267,A Ashish Reddy,0,1.2,
3765,A Ashish Reddy,0,1.0,
3799,A Ashish Reddy,3,0.8,
3893,A Ashish Reddy,1,1.2,


In [33]:
# Career averages
career_stats = (
    bowler_match
    .groupby("bowler")
    .agg(
        career_wkts_avg=("total_wickets", "mean"),
        career_economy=("economy", "mean")
    )
    .reset_index()
)

career_stats.head()

Unnamed: 0,bowler,career_wkts_avg,career_economy
0,A Ashish Reddy,0.95,9.37553
1,A Badoni,0.4,7.4
2,A Chandila,0.916667,6.548611
3,A Choudhary,1.0,7.884319
4,A Dananjaya,0.0,11.28


In [34]:
# Merge career stats
bowler_match = bowler_match.merge(
    career_stats,
    on="bowler",
    how="left"
)

bowler_match.head()

Unnamed: 0,match_id,bowler,total_wickets,runs_conceded,balls_bowled,overs,economy,date,rolling_wkts_5,rolling_wkts_10,career_wkts_avg,career_economy
0,548341,A Ashish Reddy,2,32,24,4.0,8.0,2012-04-26,,,0.95,9.37553
1,548346,A Ashish Reddy,1,11,14,2.333333,4.714286,2012-04-29,,,0.95,9.37553
2,548348,A Ashish Reddy,1,32,19,3.166667,10.105263,2012-05-01,,,0.95,9.37553
3,548352,A Ashish Reddy,1,16,13,2.166667,7.384615,2012-05-04,,,0.95,9.37553
4,548356,A Ashish Reddy,1,36,25,4.166667,8.64,2012-05-06,,,0.95,9.37553


In [35]:
# Target: wickets in next match
bowler_match["target_next_wickets"] = (
    bowler_match
    .groupby("bowler")["total_wickets"]
    .shift(-1)
)

bowler_match[[
    "bowler",
    "total_wickets",
    "target_next_wickets"
]].head(10)

Unnamed: 0,bowler,total_wickets,target_next_wickets
0,A Ashish Reddy,2,1.0
1,A Ashish Reddy,1,1.0
2,A Ashish Reddy,1,1.0
3,A Ashish Reddy,1,1.0
4,A Ashish Reddy,1,2.0
5,A Ashish Reddy,2,0.0
6,A Ashish Reddy,0,0.0
7,A Ashish Reddy,0,3.0
8,A Ashish Reddy,3,1.0
9,A Ashish Reddy,1,1.0


In [36]:
# Drop rows with missing features/target
bowler_final = bowler_match.dropna().reset_index(drop=True)

print("Final bowler dataset shape:", bowler_final.shape)

bowler_final.head()

Final bowler dataset shape: (8994, 13)


Unnamed: 0,match_id,bowler,total_wickets,runs_conceded,balls_bowled,overs,economy,date,rolling_wkts_5,rolling_wkts_10,career_wkts_avg,career_economy,target_next_wickets
0,598004,A Ashish Reddy,1,7,6,1.0,7.0,2013-04-07,1.2,1.2,0.95,9.37553,0.0
1,598048,A Ashish Reddy,0,13,6,1.0,13.0,2013-04-09,1.0,1.1,0.95,9.37553,1.0
2,598013,A Ashish Reddy,1,15,12,2.0,7.5,2013-04-14,1.0,1.0,0.95,9.37553,0.0
3,598030,A Ashish Reddy,0,15,5,0.833333,18.0,2013-04-25,1.2,1.0,0.95,9.37553,1.0
4,829719,A Ashish Reddy,1,11,6,1.0,11.0,2015-04-13,0.6,0.9,0.95,9.37553,0.0


In [37]:
# Save bowler dataset
final_path = "../data/processed/bowler_dataset.csv"

bowler_final.to_csv(final_path, index=False)

print("✅ Bowler dataset saved at:", final_path)

✅ Bowler dataset saved at: ../data/processed/bowler_dataset.csv
