In [10]:
import os
import sys
from pathlib import Path

print("Current Working Directory:", os.getcwd())

# The notebook normally runs from <project>/notebooks, so the project root is
# one level up. If the kernel was instead started in the project root itself
# (cwd already contains src/), use cwd directly so the imports still resolve.
_cwd = Path(os.getcwd())
PROJECT_DIR = _cwd if (_cwd / "src").is_dir() else _cwd.parent

# Keep the project root first on sys.path, but drop any earlier copy so that
# re-running this cell does not stack duplicate entries.
_project_root = str(PROJECT_DIR)
sys.path[:] = [entry for entry in sys.path if entry != _project_root]
sys.path.insert(0, _project_root)

# Sanity checks: both folders must exist for the later `src` imports and data loads.
print("PROJECT_DIR:", PROJECT_DIR)
print("src exists:", (PROJECT_DIR / "src").exists())
print("data exists:", (PROJECT_DIR / "data").exists())


Current Working Directory: c:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\notebooks
PROJECT_DIR: c:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction
src exists: True
data exists: True


In [11]:
import pandas as pd
import numpy as np

from src.config import RAW_DIR, PROCESSED_DIR
from src.data_cleaning import run_cleaning_pipeline
from src.feature_engineering import run_feature_engineering


In [12]:
# Run the project's cleaning pipeline; it returns the cleaned matches and
# deliveries frames as a pair.
matches_clean, deliveries_clean = run_cleaning_pipeline()

# Report the (rows, columns) shape of each cleaned frame.
for _caption, _frame in (
    ("✅ Matches Clean Shape:", matches_clean),
    ("✅ Deliveries Clean Shape:", deliveries_clean),
):
    print(_caption, _frame.shape)

# Preview the first rows of the cleaned matches table.
matches_clean.head()


Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\raw\matches.csv
Shape: (1095, 20)
Loaded: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\raw\deliveries.csv
Shape: (260920, 17)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\matches_clean.csv | Shape: (1095, 10)
Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\deliveries_clean.csv | Shape: (260920, 17)
✅ Matches Clean Shape: (1095, 10)
✅ Deliveries Clean Shape: (260920, 17)


Unnamed: 0,id,season,city,date,venue,team1,team2,toss_winner,toss_decision,winner
0,335982,2007/08,Bangalore,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders
1,335983,2007/08,Chandigarh,2008-04-19,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings
2,335984,2007/08,Delhi,2008-04-19,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils
3,335985,2007/08,Mumbai,2008-04-20,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore
4,335986,2007/08,Kolkata,2008-04-20,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders


In [13]:
# List the column labels of both cleaned frames to confirm their schemas.
_matches_columns = matches_clean.columns
print("Matches Clean Columns:\n", _matches_columns)

_deliveries_columns = deliveries_clean.columns
print("\nDeliveries Clean Columns:\n", _deliveries_columns)


Matches Clean Columns:
 Index(['id', 'season', 'city', 'date', 'venue', 'team1', 'team2',
       'toss_winner', 'toss_decision', 'winner'],
      dtype='object')

Deliveries Clean Columns:
 Index(['id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')


In [14]:
# Build the per-batter, per-match feature table with the project's
# feature-engineering pipeline.
dataset = run_feature_engineering()

# Report its (rows, columns) shape, then preview the first rows.
_dataset_shape = dataset.shape
print("✅ Feature Engineered Dataset Shape:", _dataset_shape)
dataset.head(5)


Saved: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\batsman_match_features.csv | Shape: (16515, 14)
✅ Feature Engineered Dataset Shape: (16515, 14)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0


In [15]:
# Show the column labels of the feature-engineered dataset.
dataset.columns


Index(['match_id', 'batter', 'runs', 'balls_faced', 'date', 'season', 'venue',
       'team1', 'team2', 'winner', 'runs_last_5_avg', 'runs_last_10_avg',
       'career_runs_avg', 'venue_runs_avg'],
      dtype='object')

In [16]:
# Count missing values per column and list the columns with the most first
# (at most 20 shown).
_missing_per_column = dataset.isnull().sum()
_missing_per_column.sort_values(ascending=False).head(20)


winner              40
match_id             0
runs                 0
balls_faced          0
date                 0
batter               0
season               0
venue                0
team1                0
team2                0
runs_last_5_avg      0
runs_last_10_avg     0
career_runs_avg      0
venue_runs_avg       0
dtype: int64

In [17]:
# Summary statistics, transposed so each described column becomes one row.
_summary_stats = dataset.describe()
_summary_stats.transpose()


Unnamed: 0,count,mean,min,25%,50%,75%,max,std
match_id,16515.0,907862.594308,335982.0,548331.0,980971.0,1254071.0,1426312.0,369668.604422
runs,16515.0,19.98571,0.0,4.0,13.0,29.0,175.0,21.360857
balls_faced,16515.0,15.798971,1.0,5.0,11.0,23.0,77.0,13.934583
date,16515.0,2016-07-13 12:29:15.204359424,2008-04-18 00:00:00,2012-04-21 00:00:00,2016-05-07 00:00:00,2021-04-21 00:00:00,2024-05-26 00:00:00,
runs_last_5_avg,16515.0,19.683006,0.0,9.666667,18.2,27.8,158.0,13.118083
runs_last_10_avg,16515.0,19.69968,0.0,11.0,19.5,27.3,158.0,11.7588
career_runs_avg,16515.0,19.356292,0.0,12.571429,20.641256,26.615385,158.0,10.294709
venue_runs_avg,16515.0,13.57904,0.0,0.0,7.0,23.732143,158.0,16.723585


In [18]:
# Total runs per batter across all rows, then keep the ten highest totals.
_runs_per_batter = dataset.groupby("batter")["runs"].sum()
top_batters = _runs_per_batter.sort_values(ascending=False).head(10)
top_batters


batter
V Kohli           8014
S Dhawan          6769
RG Sharma         6630
DA Warner         6567
SK Raina          5536
MS Dhoni          5243
AB de Villiers    5181
CH Gayle          4997
RV Uthappa        4954
KD Karthik        4843
Name: runs, dtype: int64

In [19]:
# Use the batter with the most rows in the dataset as the worked example.
sample_batter = dataset["batter"].value_counts().index[0]
print("Sample Batter:", sample_batter)

# Show that batter's first rows with the rolling, career and venue features
# next to the actual runs.
_inspect_columns = [
    "date",
    "match_id",
    "batter",
    "runs",
    "runs_last_5_avg",
    "runs_last_10_avg",
    "career_runs_avg",
    "venue",
    "venue_runs_avg",
]
_is_sample_batter = dataset["batter"] == sample_batter
dataset.loc[_is_sample_batter, _inspect_columns].head(20)


Sample Batter: RG Sharma


Unnamed: 0,date,match_id,batter,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue,venue_runs_avg
11421,2008-04-20,335986,RG Sharma,0,0.0,0.0,0.0,Eden Gardens,0.0
11422,2008-04-22,335988,RG Sharma,66,0.0,0.0,0.0,"Rajiv Gandhi International Stadium, Uppal",0.0
11423,2008-04-24,335990,RG Sharma,36,33.0,33.0,33.0,"Rajiv Gandhi International Stadium, Uppal",66.0
11424,2008-05-01,335999,RG Sharma,76,34.0,34.0,34.0,"Rajiv Gandhi International Stadium, Uppal",51.0
11425,2008-05-03,336034,RG Sharma,57,44.5,44.5,44.5,M Chinnaswamy Stadium,0.0
11426,2008-05-06,336007,RG Sharma,23,47.0,47.0,47.0,"MA Chidambaram Stadium, Chepauk",0.0
11427,2008-05-09,336011,RG Sharma,5,51.6,43.0,43.0,Sawai Mansingh Stadium,0.0
11428,2008-05-11,336014,RG Sharma,33,39.4,37.571429,37.571429,"Rajiv Gandhi International Stadium, Uppal",59.333333
11429,2008-05-15,336020,RG Sharma,35,38.8,37.0,37.0,Feroz Shah Kotla,0.0
11430,2008-05-18,336024,RG Sharma,6,30.6,36.777778,36.777778,"Rajiv Gandhi International Stadium, Uppal",52.75


In [20]:
# Take an independent copy of the sample batter's rows, ordered by match date.
_sample_rows = dataset["batter"] == sample_batter
sample_df = dataset.loc[_sample_rows].copy().sort_values("date")

# Compare actual runs with the rolling and career averages over the first rows.
_trend_columns = ["date", "runs", "runs_last_5_avg", "runs_last_10_avg", "career_runs_avg"]
sample_df.loc[:, _trend_columns].head(15)


Unnamed: 0,date,runs,runs_last_5_avg,runs_last_10_avg,career_runs_avg
11421,2008-04-20,0,0.0,0.0,0.0
11422,2008-04-22,66,0.0,0.0,0.0
11423,2008-04-24,36,33.0,33.0,33.0
11424,2008-05-01,76,34.0,34.0,34.0
11425,2008-05-03,57,44.5,44.5,44.5
11426,2008-05-06,23,47.0,47.0,47.0
11427,2008-05-09,5,51.6,43.0,43.0
11428,2008-05-11,33,39.4,37.571429,37.571429
11429,2008-05-15,35,38.8,37.0,37.0
11430,2008-05-18,6,30.6,36.777778,36.777778


In [21]:
# Location of the feature table inside the processed-data directory.
final_path = PROCESSED_DIR / "batsman_match_features.csv"

# Check the file is really on disk before announcing it as saved; without
# this the message below would be printed even if nothing had been written.
if not Path(final_path).is_file():
    raise FileNotFoundError(f"Expected feature file not found: {final_path}")
print("✅ Final dataset saved at:", final_path)

# Load again to confirm the file is readable and round-trips to the same
# shape as the in-memory feature table.
check_df = pd.read_csv(final_path)
assert check_df.shape == dataset.shape, (
    f"Reloaded shape {check_df.shape} does not match dataset shape {dataset.shape}"
)
print("Loaded back:", check_df.shape)
check_df.head()


✅ Final dataset saved at: C:\Users\Abhishek Karyagol\OneDrive\Tài liệu\infosys\AI_Cricket_Player_Performance_Prediction\data\processed\batsman_match_features.csv
Loaded back: (16515, 14)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0
