In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [7]:
%run ../scripts/data_cleaning.py

Raw data loaded successfully
Cleaned data saved successfully at ../data/processed/ipl_cleaned.csv ✅


In [8]:
import pandas as pd
df = pd.read_csv("../data/processed/ipl_cleaned.csv")
df['date'] = pd.to_datetime(df['date'])
print("Cleaned data loaded successfully ✅")
df.head()

Cleaned data loaded successfully ✅


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder,date,venue,team1,team2
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,2017-05-04,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,2017-05-04,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,4,0,4,0,0,0,2017-05-04,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,2017-05-04,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,0,2,2,0,0,0,2017-05-04,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore


In [33]:
player_df = (
    df.groupby(
        ['match_id', 'date', 'batsman', 'venue',
         'batting_team', 'team1', 'team2']
    )
    .agg(
        runs=('batsman_runs', 'sum'),
        balls=('batsman_runs', 'count'),
        wickets=('player_dismissed', lambda x: (x != 0).sum())
    )
    .reset_index()
)

player_df.head()#aggregate ball by ball

Unnamed: 0,match_id,date,batsman,venue,batting_team,team1,team2,runs,balls,wickets
0,1,2017-05-04,A Choudhary,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,Sunrisers Hyderabad,Royal Challengers Bangalore,6,2,2
1,1,2017-05-04,BCJ Cutting,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Sunrisers Hyderabad,Royal Challengers Bangalore,16,6,6
2,1,2017-05-04,CH Gayle,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,Sunrisers Hyderabad,Royal Challengers Bangalore,32,23,23
3,1,2017-05-04,DA Warner,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Sunrisers Hyderabad,Royal Challengers Bangalore,14,9,9
4,1,2017-05-04,DJ Hooda,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Sunrisers Hyderabad,Royal Challengers Bangalore,16,12,12


In [61]:
# Sort data chronologically (VERY IMPORTANT)

player_match_df = player_match_df.sort_values(
    by=['batsman', 'date']
).reset_index(drop=True)

print("Data sorted chronologically ")
player_match_df.head()

Data sorted chronologically 


Unnamed: 0,match_id,date,venue,batsman,batting_team,total_runs,balls_faced,wickets
0,352,2012-04-05,"MA Chidambaram Stadium, Chepauk",A Ashish Reddy,Deccan Chargers,3,3,3
1,359,2012-08-05,"Rajiv Gandhi International Stadium, Uppal",A Ashish Reddy,Deccan Chargers,8,8,8
2,428,2013-04-05,"Rajiv Gandhi International Stadium, Uppal",A Ashish Reddy,Sunrisers Hyderabad,5,7,7
3,384,2013-05-04,"Rajiv Gandhi International Stadium, Uppal",A Ashish Reddy,Sunrisers Hyderabad,7,4,4
4,388,2013-07-04,"Rajiv Gandhi International Stadium, Uppal",A Ashish Reddy,Sunrisers Hyderabad,14,12,12


In [71]:
top_batters = (
    player_df
    .groupby('batsman')['runs']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

top_batters

batsman
RG Sharma         1898
SK Raina          1830
V Kohli           1765
S Dhawan          1721
RV Uthappa        1618
CH Gayle          1581
AB de Villiers    1581
MS Dhoni          1527
G Gambhir         1492
DA Warner         1468
Name: runs, dtype: int64

In [73]:
sample_batter = player_df['batsman'].value_counts().index[0]
print("Sample batter:", sample_batter)

Sample batter: RG Sharma


In [75]:
sample_df = player_df[player_df['batsman'] == sample_batter]

In [77]:
print(sample_df.columns.tolist())

['match_id', 'date', 'batsman', 'venue', 'batting_team', 'team1', 'team2', 'runs', 'balls', 'wickets', 'venue_avg_runs', 'opponent', 'avg_runs_vs_opponent', 'target_runs', 'target_wickets', 'career_runs_x', 'career_avg_runs_x', 'career_wickets_x', 'career_runs_y', 'career_avg_runs_y', 'career_wickets_y', 'career_runs', 'career_avg_runs', 'career_wickets']


In [78]:
sample_df[
    [
        'date',
        'match_id',
        'batsman',
        'runs',
        'venue',
        'venue_avg_runs',
        'avg_runs_vs_opponent'
    ]
].head(10)

Unnamed: 0,date,match_id,batsman,runs,venue,venue_avg_runs,avg_runs_vs_opponent
2648,2008-01-05,77,RG Sharma,76,"Rajiv Gandhi International Stadium, Uppal",37.166667,22.444444
2649,2008-03-05,111,RG Sharma,57,M Chinnaswamy Stadium,19.8,28.0
2650,2008-06-05,85,RG Sharma,23,"MA Chidambaram Stadium, Chepauk",12.25,18.846154
2651,2008-09-05,89,RG Sharma,5,Sawai Mansingh Stadium,5.0,28.833333
2652,2008-11-05,92,RG Sharma,33,"Rajiv Gandhi International Stadium, Uppal",37.166667,37.8
2653,2009-02-05,140,RG Sharma,38,St George's Park,38.0,28.833333
2654,2009-04-05,144,RG Sharma,21,Buffalo Park,21.0,18.846154
2655,2009-06-05,147,RG Sharma,38,SuperSport Park,38.0,24.5
2656,2009-09-05,151,RG Sharma,9,De Beers Diamond Oval,9.0,22.444444
2657,2009-11-05,155,RG Sharma,9,De Beers Diamond Oval,9.0,28.833333


In [83]:
# Filter one sample batter
sample_batter = 'RG Sharma'

sample_df = player_df[player_df['batsman'] == sample_batter].copy()

# Sort chronologically
sample_df = sample_df.sort_values('date')

# Columns we WANT to see
desired_cols = [
    'date',
    'runs',
    'venue_avg_runs',
    'avg_runs_vs_opp'
]

# Select only columns that actually exist
existing_cols = [c for c in desired_cols if c in sample_df.columns]

print("Showing columns:", existing_cols)

sample_df[existing_cols].head(15)

Showing columns: ['date', 'runs', 'venue_avg_runs']


Unnamed: 0,date,runs,venue_avg_runs
2648,2008-01-05,76,37.166667
2649,2008-03-05,57,19.8
2650,2008-06-05,23,12.25
2651,2008-09-05,5,5.0
2652,2008-11-05,33,37.166667
2653,2009-02-05,38,38.0
2654,2009-04-05,21,21.0
2655,2009-06-05,38,38.0
2656,2009-09-05,9,9.0
2657,2009-11-05,9,9.0


In [100]:
from pathlib import Path
import pandas as pd

# Define processed directory path
PROCESSED_DIR = Path("../data/processed")

# Create directory if it does not exist
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [103]:
# Save final feature-engineered dataset
final_path = PROCESSED_DIR / "batsman_match_features.csv"
print("Final dataset saved at:", final_path)
player_df.to_csv(final_path, index=False)
# Load again to confirm file exists
check_df = pd.read_csv(final_path)
print("Loaded back:", check_df.shape)
check_df.head()

Final dataset saved at: ..\data\processed\batsman_match_features.csv
Loaded back: (2684, 28)


Unnamed: 0,match_id,date,batsman,venue,batting_team,team1,team2,runs,balls,wickets,...,career_runs_y,career_avg_runs_y,career_wickets_y,career_runs,career_avg_runs,career_wickets,recent_runs_5,recent_wickets_5,venue_avg_runs_y,avg_runs_vs_opponent_y
0,388,2013-07-04,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,Sunrisers Hyderabad,14,12,12,...,62,7.75,53,62,7.75,53,7.4,6.8,8.0,8.5
1,432,2013-09-04,A Ashish Reddy,M Chinnaswamy Stadium,Sunrisers Hyderabad,Sunrisers Hyderabad,Royal Challengers Bangalore,3,4,4,...,62,7.75,53,62,7.75,53,7.4,7.0,3.0,8.5
2,394,2013-12-04,A Ashish Reddy,Feroz Shah Kotla,Sunrisers Hyderabad,Delhi Daredevils,Sunrisers Hyderabad,16,9,9,...,62,7.75,53,62,7.75,53,9.0,7.2,16.0,10.5
3,551,2015-02-05,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Sunrisers Hyderabad,Chennai Super Kings,6,6,6,...,62,7.75,53,62,7.75,53,9.2,7.0,8.0,4.5
4,209,2010-04-04,A Kumble,Feroz Shah Kotla,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,1,1,1,...,12,2.4,19,12,2.4,19,2.4,3.8,1.0,1.0


In [101]:
if not any(col.startswith("career_") for col in player_df.columns):
    career_stats = (
        player_df.groupby('batsman')
        .agg(
            career_runs=('runs', 'sum'),
            career_avg_runs=('runs', 'mean'),
            career_wickets=('wickets', 'sum')
        )
        .reset_index()
    )

    player_df = player_df.merge(career_stats, on='batsman', how='left')
    print("Career stats merged ")
else:
    print("Career stats already exist — merge skipped ")


Career stats already exist — merge skipped 


In [85]:
#rolling averages 
player_df = player_df.sort_values(['batsman', 'date'])

player_df['recent_runs_5'] = (
    player_df.groupby('batsman')['runs']
    .rolling(window=5)
    .mean()
    .reset_index(level=0, drop=True)
)

player_df['recent_wickets_5'] = (
    player_df.groupby('batsman')['wickets']
    .rolling(window=5)
    .mean()
    .reset_index(level=0, drop=True)
)

In [86]:
#venue wise 
venue_avg = (
    player_df.groupby(['batsman', 'venue'])['runs']
    .mean()
    .reset_index()
    .rename(columns={'runs': 'venue_avg_runs'})
)

player_df = player_df.merge(
    venue_avg,
    on=['batsman', 'venue'],
    how='left'
)


In [87]:
#opponent-specific 
player_df['opponent'] = player_df.apply(
    lambda x: x['team2'] if x['batting_team'] == x['team1'] else x['team1'],
    axis=1
)

opponent_avg = (
    player_df.groupby(['batsman', 'opponent'])['runs']
    .mean()
    .reset_index()
    .rename(columns={'runs': 'avg_runs_vs_opponent'})
)

player_df = player_df.merge(
    opponent_avg,
    on=['batsman', 'opponent'],
    how='left'
)


In [38]:
#creating traing labels
player_df = player_df.sort_values(['batsman', 'date'])

player_df['target_runs'] = (
    player_df.groupby('batsman')['runs'].shift(-1)
)

player_df['target_wickets'] = (
    player_df.groupby('batsman')['wickets'].shift(-1)
)


In [88]:
#final cleanup and save data
player_df = player_df.dropna().reset_index(drop=True)

player_df.to_csv("../data/processed/dataset.csv", index=False)

print("Final feature-engineered dataset saved ")


Final feature-engineered dataset saved 


In [89]:
print(player_df.columns.tolist())

['match_id', 'date', 'batsman', 'venue', 'batting_team', 'team1', 'team2', 'runs', 'balls', 'wickets', 'venue_avg_runs_x', 'opponent', 'avg_runs_vs_opponent_x', 'target_runs', 'target_wickets', 'career_runs_x', 'career_avg_runs_x', 'career_wickets_x', 'career_runs_y', 'career_avg_runs_y', 'career_wickets_y', 'career_runs', 'career_avg_runs', 'career_wickets', 'recent_runs_5', 'recent_wickets_5', 'venue_avg_runs_y', 'avg_runs_vs_opponent_y']


In [90]:
# Auto-detect engineered feature columns
feature_cols = [
    col for col in player_df.columns
    if col.startswith(('career', 'runs_last', 'runs_rolling', 'venue', 'vs_team', 'opponent'))
]

print("Using features:", feature_cols)

X = player_df[feature_cols]
y_runs = player_df['target_runs']
y_wickets = player_df['target_wickets']

Using features: ['venue', 'venue_avg_runs_x', 'opponent', 'career_runs_x', 'career_avg_runs_x', 'career_wickets_x', 'career_runs_y', 'career_avg_runs_y', 'career_wickets_y', 'career_runs', 'career_avg_runs', 'career_wickets', 'venue_avg_runs_y']


In [91]:
# Time-series split (80% train, 20% test)
split = int(len(player_df) * 0.8)

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_runs_train, y_runs_test = y_runs.iloc[:split], y_runs.iloc[split:]
y_wkts_train, y_wkts_test = y_wickets.iloc[:split], y_wickets.iloc[split:]

print("Time-series split completed ")


Time-series split completed 


In [92]:
print("X_train:", X_train.shape)
print("X_test :", X_test.shape)
print("y_runs_train:", y_runs_train.shape)
print("y_wkts_train:", y_wkts_train.shape)

X_train: (2147, 13)
X_test : (537, 13)
y_runs_train: (2147,)
y_wkts_train: (2147,)


In [93]:
print(player_df.columns.tolist())

['match_id', 'date', 'batsman', 'venue', 'batting_team', 'team1', 'team2', 'runs', 'balls', 'wickets', 'venue_avg_runs_x', 'opponent', 'avg_runs_vs_opponent_x', 'target_runs', 'target_wickets', 'career_runs_x', 'career_avg_runs_x', 'career_wickets_x', 'career_runs_y', 'career_avg_runs_y', 'career_wickets_y', 'career_runs', 'career_avg_runs', 'career_wickets', 'recent_runs_5', 'recent_wickets_5', 'venue_avg_runs_y', 'avg_runs_vs_opponent_y']


In [94]:
features = [
    'career_runs_avg',
    'runs_rolling_5',
    'venue_runs_avg',
    'opponent_runs_avg'
]

In [95]:
features = [
    'avg_career_runs',
    'last_5_runs',
    'venue_avg',
    'vs_team_avg'
]

In [96]:
# FINAL FEATURE LIST (freeze names here)
final_features = [
    'career_runs_avg',
    'runs_rolling_5',
    'venue_runs_avg',
    'opponent_runs_avg'
]

# Ensure features exist
final_features = [f for f in final_features if f in player_df.columns]

print("Final features used:", final_features)

X = player_df[final_features]
y_runs = player_df['target_runs']
y_wickets = player_df['target_wickets']
print("Final feature matrix ready ")

Final features used: []
Final feature matrix ready 


In [98]:
import os
import pickle
os.makedirs("../models", exist_ok=True)
with open("../models/feature_pipeline.pkl", "wb") as f:
    pickle.dump(features, f)

print("Feature pipeline saved successfully ")

Feature pipeline saved successfully 
