In [48]:
import os
import sys
from pathlib import Path

PROJECT_DIR = Path(os.getcwd()).parent
sys.path.insert(0, str(PROJECT_DIR))

print("PROJECT_DIR:", PROJECT_DIR)


PROJECT_DIR: c:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction


In [50]:
import pandas as pd
import numpy as np

from src.data_cleaning import run_cleaning_pipeline
from src.feature_engineering import run_feature_engineering
from src.feature_pipeline import save_pipeline
from src.config import PROCESSED_DIR


In [51]:
matches_clean, deliveries_clean = run_cleaning_pipeline()

print("Matches clean:", matches_clean.shape)
print("Deliveries clean:", deliveries_clean.shape)


Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\raw\matches.csv
Shape: (1095, 20)
Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\raw\deliveries.csv
Shape: (260920, 17)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\matches_clean.csv | Shape: (1095, 10)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\deliveries_clean.csv | Shape: (260920, 17)
Matches clean: (1095, 10)
Deliveries clean: (260920, 17)


In [None]:
dataset = run_feature_engineering()
print("Shape:", dataset.shape)
print("Has target_next_runs?:", "target_next_runs" in dataset.columns)
dataset.head()


Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\batsman_match_features.csv | Shape: (16515, 14)
Final dataset: (16515, 14)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0


In [53]:
print(dataset.columns.tolist())


['match_id', 'batter', 'runs', 'balls_faced', 'date', 'season', 'venue', 'team1', 'team2', 'winner', 'runs_last_5_avg', 'runs_last_10_avg', 'career_runs_avg', 'venue_runs_avg']


In [54]:
dataset.columns


Index(['match_id', 'batter', 'runs', 'balls_faced', 'date', 'season', 'venue',
       'team1', 'team2', 'winner', 'runs_last_5_avg', 'runs_last_10_avg',
       'career_runs_avg', 'venue_runs_avg'],
      dtype='object')

In [55]:
dataset.isnull().sum().sort_values(ascending=False).head(20)


winner              40
match_id             0
runs                 0
balls_faced          0
date                 0
batter               0
season               0
venue                0
team1                0
team2                0
runs_last_5_avg      0
runs_last_10_avg     0
career_runs_avg      0
venue_runs_avg       0
dtype: int64

In [39]:
dataset["date"] = pd.to_datetime(dataset["date"], errors="coerce")
dataset = dataset.dropna(subset=["date"])
dataset = dataset.sort_values("date").reset_index(drop=True)

split_idx = int(len(dataset) * 0.8)

train_df = dataset.iloc[:split_idx].copy()
test_df = dataset.iloc[split_idx:].copy()

print("Train:", train_df.shape)
print("Test:", test_df.shape)
print("Train dates:", train_df["date"].min(), "->", train_df["date"].max())
print("Test dates:", test_df["date"].min(), "->", test_df["date"].max())


Train: (13212, 14)
Test: (3303, 14)
Train dates: 2008-04-18 00:00:00 -> 2022-04-02 00:00:00
Test dates: 2022-04-02 00:00:00 -> 2024-05-26 00:00:00


In [40]:
save_pipeline()


✅ Saved feature_pipeline.pkl at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\models\feature_pipeline.pkl


In [41]:
top_batters = dataset.groupby("batter")["runs"].sum().sort_values(ascending=False).head(10)
top_batters


batter
V Kohli           8014
S Dhawan          6769
RG Sharma         6630
DA Warner         6567
SK Raina          5536
MS Dhoni          5243
AB de Villiers    5181
CH Gayle          4997
RV Uthappa        4954
KD Karthik        4843
Name: runs, dtype: int64

In [43]:
sample_batter = dataset["batter"].value_counts().index[0]
print("Sample Batter:", sample_batter)

dataset[dataset["batter"] == sample_batter][
    ["date", "match_id", "batter", "runs", "runs_last_5_avg", "runs_last_10_avg", "career_runs_avg", "venue", "venue_runs_avg"]
].head(20)


Sample Batter: RG Sharma


Unnamed: 0,date,match_id,batter,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue,venue_runs_avg
52,2008-04-20,335986,RG Sharma,0,0.0,0.0,0.0,Eden Gardens,0.0
97,2008-04-22,335988,RG Sharma,66,0.0,0.0,0.0,"Rajiv Gandhi International Stadium, Uppal",0.0
120,2008-04-24,335990,RG Sharma,36,33.0,33.0,33.0,"Rajiv Gandhi International Stadium, Uppal",66.0
282,2008-05-01,335999,RG Sharma,76,34.0,34.0,34.0,"Rajiv Gandhi International Stadium, Uppal",51.0
320,2008-05-03,336034,RG Sharma,57,44.5,44.5,44.5,M Chinnaswamy Stadium,0.0
398,2008-05-06,336007,RG Sharma,23,47.0,47.0,47.0,"MA Chidambaram Stadium, Chepauk",0.0
455,2008-05-09,336011,RG Sharma,5,51.6,43.0,43.0,Sawai Mansingh Stadium,0.0
486,2008-05-11,336014,RG Sharma,33,39.4,37.571429,37.571429,"Rajiv Gandhi International Stadium, Uppal",59.333333
549,2008-05-15,336020,RG Sharma,35,38.8,37.0,37.0,Feroz Shah Kotla,0.0
620,2008-05-18,336024,RG Sharma,6,30.6,36.777778,36.777778,"Rajiv Gandhi International Stadium, Uppal",52.75


In [44]:
sample_df = dataset[dataset["batter"] == sample_batter].copy()
sample_df = sample_df.sort_values("date")

sample_df[["date", "runs", "runs_last_5_avg", "runs_last_10_avg", "career_runs_avg"]].head(15)


Unnamed: 0,date,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg
52,2008-04-20,0,0.0,0.0,0.0
97,2008-04-22,66,0.0,0.0,0.0
120,2008-04-24,36,33.0,33.0,33.0
282,2008-05-01,76,34.0,34.0,34.0
320,2008-05-03,57,44.5,44.5,44.5
398,2008-05-06,23,47.0,47.0,47.0
455,2008-05-09,5,51.6,43.0,43.0
486,2008-05-11,33,39.4,37.571429,37.571429
549,2008-05-15,35,38.8,37.0,37.0
620,2008-05-18,6,30.6,36.777778,36.777778


In [45]:
final_path = PROCESSED_DIR / "batsman_match_features.csv"
print("✅ Final dataset saved at:", final_path)

# Load again to confirm file exists
check_df = pd.read_csv(final_path)
print("Loaded back:", check_df.shape)
check_df.head()


✅ Final dataset saved at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\batsman_match_features.csv
Loaded back: (16515, 14)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0


In [46]:
path = PROCESSED_DIR / "dataset.csv"
print("Saved dataset at:", path)

check = pd.read_csv(path)
print("Loaded back:", check.shape)
check.head()


Saved dataset at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\dataset.csv
Loaded back: (16515, 14)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0
