In [4]:
import os
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root and
# put it at the front of the import search path.
# NOTE(review): presumably this is what lets the `src.*` imports in the next
# cell resolve; it assumes the kernel runs one level below the project root --
# confirm before moving the notebook.
PROJECT_DIR = Path.cwd().parent
sys.path.insert(0, str(PROJECT_DIR))

print("PROJECT_DIR:", PROJECT_DIR)


PROJECT_DIR: c:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction


In [5]:
import pandas as pd
import numpy as np

from src.data_cleaning import run_cleaning_pipeline
from src.feature_engineering import run_feature_engineering
from src.feature_pipeline import save_pipeline
from src.config import PROCESSED_DIR


In [6]:
# Run the project's cleaning pipeline and keep the two frames it returns.
# Judging by the recorded output of this cell, the call also loads the raw
# matches/deliveries CSVs and writes matches_clean.csv and
# deliveries_clean.csv under data/processed.
matches_clean, deliveries_clean = run_cleaning_pipeline()

# Quick sanity check of the row/column counts of both cleaned frames.
print("Matches clean:", matches_clean.shape)
print("Deliveries clean:", deliveries_clean.shape)


Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\raw\matches.csv
Shape: (1095, 20)
Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\raw\deliveries.csv
Shape: (260920, 17)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\matches_clean.csv | Shape: (1095, 7)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\deliveries_clean.csv | Shape: (260920, 17)
Matches clean: (1095, 7)
Deliveries clean: (260920, 17)


In [7]:
# Build the batter-level modelling table with the project's feature
# engineering step, then confirm its size and that the prediction target
# column is present.
dataset = run_feature_engineering()

print("Shape:", dataset.shape)
# Membership on a DataFrame tests its column labels.
print("Has target_next_runs?:", "target_next_runs" in dataset)
dataset.head(5)


Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\dataset.csv | Shape: (15842, 17)
Shape: (15842, 17)
Has target_next_runs?: True


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg,pvt_runs_avg,pvp_runs_avg,target_next_runs
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0,0.0,0.0,8.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0,0.0,0.0,10.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0,0.0,0.0,4.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0,0.0,0.0,7.0


In [8]:
# Print every column label as a plain Python list.
print(list(dataset.columns))


['match_id', 'batter', 'runs', 'balls_faced', 'date', 'season', 'venue', 'team1', 'team2', 'winner', 'runs_last_5_avg', 'runs_last_10_avg', 'career_runs_avg', 'venue_runs_avg', 'pvt_runs_avg', 'pvp_runs_avg', 'target_next_runs']


In [9]:
# Display the column Index itself (labels plus dtype).
# NOTE(review): overlaps with the previous cell, which already prints the same
# labels as a list -- one of the two could be removed.
dataset.columns


Index(['match_id', 'batter', 'runs', 'balls_faced', 'date', 'season', 'venue',
       'team1', 'team2', 'winner', 'runs_last_5_avg', 'runs_last_10_avg',
       'career_runs_avg', 'venue_runs_avg', 'pvt_runs_avg', 'pvp_runs_avg',
       'target_next_runs'],
      dtype='object')

In [10]:
# Missing values per column, columns with the most gaps first (top 20).
(
    dataset
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)


winner              38
match_id             0
batter               0
balls_faced          0
runs                 0
season               0
venue                0
team1                0
date                 0
team2                0
runs_last_5_avg      0
runs_last_10_avg     0
career_runs_avg      0
venue_runs_avg       0
pvt_runs_avg         0
pvp_runs_avg         0
target_next_runs     0
dtype: int64

In [11]:
# Chronological train/test split.
#
# Dates are parsed first; values that cannot be parsed become NaT and are
# dropped. Rows are then ordered by date with a stable sort (mergesort) so
# that rows sharing a date keep their incoming relative order and the result
# is reproducible.
dataset["date"] = pd.to_datetime(dataset["date"], errors="coerce")
dataset = dataset.dropna(subset=["date"])
dataset = dataset.sort_values("date", kind="mergesort").reset_index(drop=True)

# Share of the time-ordered rows that nominally belongs to the training set.
TRAIN_FRACTION = 0.8
nominal_idx = int(len(dataset) * TRAIN_FRACTION)

# Snap the cut to a date boundary: a purely positional cut can land in the
# middle of a match day and put rows from the same match on both sides of the
# split. Everything strictly before the date found at the nominal position is
# training data; because the frame is date-sorted those rows form a prefix, so
# `split_idx` is still a valid positional boundary (at most `nominal_idx`).
if len(dataset):
    cutoff_date = dataset["date"].iloc[nominal_idx]
    split_idx = int((dataset["date"] < cutoff_date).sum())
else:
    # Nothing to split; both parts end up empty.
    split_idx = 0

train_df = dataset.iloc[:split_idx].copy()
test_df = dataset.iloc[split_idx:].copy()

print("Train:", train_df.shape)
print("Test:", test_df.shape)
print("Train dates:", train_df["date"].min(), "->", train_df["date"].max())
print("Test dates:", test_df["date"].min(), "->", test_df["date"].max())


Train: (12673, 17)
Test: (3169, 17)
Train dates: 2008-04-18 00:00:00 -> 2021-10-15 00:00:00
Test dates: 2022-03-26 00:00:00 -> 2024-05-24 00:00:00


In [12]:
# Persist the feature pipeline through the project helper.
# Per the recorded output of this cell it writes models/feature_pipeline.pkl;
# what the pickle contains is defined in src.feature_pipeline and is not
# visible from this notebook.
save_pipeline()


✅ Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\models\feature_pipeline.pkl


In [13]:
# Ten batters with the highest total of the per-row `runs` column.
top_batters = (
    dataset
    .groupby("batter")["runs"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
top_batters


batter
V Kohli           7981
S Dhawan          6755
DA Warner         6566
RG Sharma         6562
SK Raina          5533
MS Dhoni          5218
AB de Villiers    5170
CH Gayle          4996
RV Uthappa        4953
KD Karthik        4832
Name: runs, dtype: int64

In [14]:
# Use the batter with the most rows in `dataset` as a worked example and
# eyeball the lagged run averages next to the actual runs.
sample_batter = dataset["batter"].value_counts().index[0]
print("Sample Batter:", sample_batter)

dataset.loc[
    dataset["batter"] == sample_batter,
    ["date", "match_id", "batter", "runs", "runs_last_5_avg", "runs_last_10_avg", "career_runs_avg", "venue", "venue_runs_avg"],
].head(20)


Sample Batter: RG Sharma


Unnamed: 0,date,match_id,batter,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue,venue_runs_avg
47,2008-04-20,335986,RG Sharma,0,0.0,0.0,0.0,Eden Gardens,0.0
96,2008-04-22,335988,RG Sharma,66,0.0,0.0,0.0,"Rajiv Gandhi International Stadium, Uppal",0.0
131,2008-04-24,335990,RG Sharma,36,33.0,33.0,33.0,"Rajiv Gandhi International Stadium, Uppal",66.0
274,2008-05-01,335999,RG Sharma,76,34.0,34.0,34.0,"Rajiv Gandhi International Stadium, Uppal",51.0
324,2008-05-03,336034,RG Sharma,57,44.5,44.5,44.5,M Chinnaswamy Stadium,0.0
385,2008-05-06,336007,RG Sharma,23,47.0,47.0,47.0,"MA Chidambaram Stadium, Chepauk",0.0
439,2008-05-09,336011,RG Sharma,5,51.6,43.0,43.0,Sawai Mansingh Stadium,0.0
477,2008-05-11,336014,RG Sharma,33,39.4,37.571429,37.571429,"Rajiv Gandhi International Stadium, Uppal",59.333333
549,2008-05-15,336020,RG Sharma,35,38.8,37.0,37.0,Feroz Shah Kotla,0.0
608,2008-05-18,336024,RG Sharma,6,30.6,36.777778,36.777778,"Rajiv Gandhi International Stadium, Uppal",52.75


In [15]:
# Independent, date-ordered copy of the sample batter's rows, reduced to the
# run columns for a compact look at how the averages evolve.
sample_df = (
    dataset.loc[dataset["batter"] == sample_batter]
    .copy()
    .sort_values("date")
)

sample_df.loc[:, ["date", "runs", "runs_last_5_avg", "runs_last_10_avg", "career_runs_avg"]].head(15)


Unnamed: 0,date,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg
47,2008-04-20,0,0.0,0.0,0.0
96,2008-04-22,66,0.0,0.0,0.0
131,2008-04-24,36,33.0,33.0,33.0
274,2008-05-01,76,34.0,34.0,34.0
324,2008-05-03,57,44.5,44.5,44.5
385,2008-05-06,23,47.0,47.0,47.0
439,2008-05-09,5,51.6,43.0,43.0
477,2008-05-11,33,39.4,37.571429,37.571429
549,2008-05-15,35,38.8,37.0,37.0
608,2008-05-18,6,30.6,36.777778,36.777778


In [16]:
import os

# List what currently exists in the processed-data directory
# (heading and listing are emitted on separate lines).
print("Processed files:", os.listdir(PROCESSED_DIR), sep="\n")

Processed files:
['bowler_match_features.csv', 'dataset.csv', 'deliveries_clean.csv', 'matches_clean.csv']


In [17]:
# Round-trip check: re-read dataset.csv from the processed directory and look
# at its shape and first rows. This cell only reads; the file itself was
# written earlier by run_feature_engineering (see that cell's "Saved:" output).
final_path = PROCESSED_DIR / "dataset.csv"
print("✅ Final dataset saved at:", final_path)

check_df = pd.read_csv(filepath_or_buffer=final_path)
print("Loaded back:", check_df.shape)
check_df.head(n=5)

✅ Final dataset saved at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\dataset.csv
Loaded back: (15842, 17)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg,pvt_runs_avg,pvp_runs_avg,target_next_runs
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0,0.0,0.0,8.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0,0.0,0.0,10.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0,0.0,0.0,4.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0,0.0,0.0,7.0


In [18]:
# Re-read dataset.csv from disk and show its shape and first rows.
# NOTE(review): this repeats the round-trip check of the previous cell under
# different variable names (`path` / `check`); consider keeping only one.
path = PROCESSED_DIR / "dataset.csv"
print("Saved dataset at:", path)

check = pd.read_csv(filepath_or_buffer=path)
print("Loaded back:", check.shape)
check.head(n=5)


Saved dataset at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\AI_Cricket_Player_Performance_Prediction\data\processed\dataset.csv
Loaded back: (15842, 17)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg,pvt_runs_avg,pvp_runs_avg,target_next_runs
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0,0.0,0.0,8.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0,0.0,0.0,10.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0,0.0,0.0,4.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0,0.0,0.0,7.0


In [19]:
# Load cleaned datasets (required for bowling features)

from src.config import PROCESSED_DIR

# Read both cleaned tables back from the processed directory
# (matches first, then deliveries).
matches, deliveries = (
    pd.read_csv(PROCESSED_DIR / csv_name)
    for csv_name in ("matches_clean.csv", "deliveries_clean.csv")
)

print("Matches shape:", matches.shape)
print("Deliveries shape:", deliveries.shape)

# The cell result is a 2-tuple of previews, which renders as plain text.
# NOTE(review): display(matches.head(), deliveries.head()) would render both
# as tables instead.
(matches.head(), deliveries.head())

Matches shape: (1095, 7)
Deliveries shape: (260920, 17)


(       id   season        date                                       venue  \
 0  335982  2007/08  2008-04-18                       M Chinnaswamy Stadium   
 1  335983  2007/08  2008-04-19  Punjab Cricket Association Stadium, Mohali   
 2  335984  2007/08  2008-04-19                            Feroz Shah Kotla   
 3  335985  2007/08  2008-04-20                            Wankhede Stadium   
 4  335986  2007/08  2008-04-20                                Eden Gardens   
 
                          team1                        team2  \
 0  Royal Challengers Bangalore        Kolkata Knight Riders   
 1              Kings XI Punjab          Chennai Super Kings   
 2             Delhi Daredevils             Rajasthan Royals   
 3               Mumbai Indians  Royal Challengers Bangalore   
 4        Kolkata Knight Riders              Deccan Chargers   
 
                         winner  
 0        Kolkata Knight Riders  
 1          Chennai Super Kings  
 2             Delhi Daredevils  
 3

In [20]:
# =========================
# BOWLER MATCH FEATURES
# =========================

# Integer 0/1 wicket flag per delivery, stored back on `deliveries`.
deliveries["wicket"] = deliveries["is_wicket"].astype(int)

# One row per (match, bowler): wicket flags summed, deliveries counted,
# total runs summed.
# NOTE(review): as written every delivery row counts towards `balls`, every
# `is_wicket` flag towards `wickets`, and all of `total_runs` towards
# `runs_conceded`. Whether extras or non-bowler dismissals should be excluded
# depends on columns not shown in this notebook -- confirm.
bowler_match = deliveries.groupby(["match_id", "bowler"], as_index=False).agg(
    wickets=pd.NamedAgg(column="wicket", aggfunc="sum"),
    balls=pd.NamedAgg(column="ball", aggfunc="count"),
    runs_conceded=pd.NamedAgg(column="total_runs", aggfunc="sum"),
)

# Attach each match's date and venue; the right-hand key `id` duplicates
# `match_id` after the join and is dropped.
bowler_match = pd.merge(
    bowler_match,
    matches[["id", "date", "venue"]],
    how="left",
    left_on="match_id",
    right_on="id",
).drop(columns=["id"])

# Order rows by bowler, then by date within each bowler.
bowler_match = bowler_match.sort_values(by=["bowler", "date"])

In [21]:
# Rolling bowling averages (NO leakage)
#
# Each feature is computed per bowler on shift(1)-ed wickets, so a row only
# sees the rows that precede it within that bowler's group (the preceding cell
# sorts the frame by bowler and date).
#
# min_periods=1 lets the 5- and 10-match windows average over however many
# earlier matches exist. With the default (min_periods == window size) a
# bowler's first 5 / 10 matches came out as NaN and were then zero-filled,
# which reads as "took no wickets" and contradicts career_wkts_avg on the
# very same rows.

bowler_match["wkts_last_5_avg"] = (
    bowler_match
    .groupby("bowler")["wickets"]
    .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())
)

bowler_match["wkts_last_10_avg"] = (
    bowler_match
    .groupby("bowler")["wickets"]
    .transform(lambda x: x.shift(1).rolling(10, min_periods=1).mean())
)

bowler_match["career_wkts_avg"] = (
    bowler_match
    .groupby("bowler")["wickets"]
    .transform(lambda x: x.shift(1).expanding().mean())
)

# A bowler's first row has no history at all; encode that as 0.
# Only the three new feature columns are filled, so missing values in other
# columns (e.g. date or venue after the left merge) are not silently turned
# into 0 as a frame-wide fillna would do.
bowling_avg_cols = ["wkts_last_5_avg", "wkts_last_10_avg", "career_wkts_avg"]
bowler_match[bowling_avg_cols] = bowler_match[bowling_avg_cols].fillna(0)

In [22]:
# Per (bowler, venue) expanding mean of wickets over earlier rows only:
# shift(1) keeps the current match out of its own average, and rows with no
# earlier match at that venue get 0.
bowler_match["venue_wkts_avg"] = (
    bowler_match
    .groupby(["bowler", "venue"])["wickets"]
    .transform(lambda past: past.shift(1).expanding().mean())
    .fillna(0)
)

In [23]:
# Target = next match wickets
#
# Within each bowler, pull the following row's wickets onto the current row,
# then keep only rows that have such a following row (a bowler's last row has
# no target and is removed).
bowler_match = (
    bowler_match
    .assign(
        target_next_wickets=lambda frame: frame.groupby("bowler")["wickets"].shift(-1)
    )
    .dropna(subset=["target_next_wickets"])
)

In [24]:
# Write the bowler feature table to the processed directory without the
# DataFrame index, then report the shape that was written.
bowler_match.to_csv(PROCESSED_DIR / "bowler_match_features.csv", index=False)

print("Saved bowler_match_features.csv:", bowler_match.shape)

Saved bowler_match_features.csv: (12448, 12)
