In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
import joblib


In [2]:
# Read the cleaned ball-by-ball and match-level tables from ../data/cleaned.
deliveries = pd.read_csv("../data/cleaned/cleaned_deliveries.csv")
matches = pd.read_csv("../data/cleaned/cleaned_matches.csv")

# Preview the first rows of both tables; the tuple is the cell's displayed value.
head_preview = (deliveries.head(), matches.head())
head_preview


(   match_id  inning           batting_team                 bowling_team  over  \
 0    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
 1    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
 2    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
 3    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
 4    335982       1  Kolkata Knight Riders  Royal Challengers Bangalore     0   
 
    ball       batter   bowler  non_striker  batsman_runs  extra_runs  \
 0     1   SC Ganguly  P Kumar  BB McCullum             0           1   
 1     2  BB McCullum  P Kumar   SC Ganguly             0           0   
 2     3  BB McCullum  P Kumar   SC Ganguly             0           1   
 3     4  BB McCullum  P Kumar   SC Ganguly             0           0   
 4     5  BB McCullum  P Kumar   SC Ganguly             0           0   
 
    total_runs extras_type  is_wicket player_dismissed dismissal_k

In [13]:
# Collapse ball-by-ball data to one row per (match, batter, bowler):
# runs scored, number of delivery rows, and wicket flags summed.
matchup_keys = ['match_id', 'batter', 'bowler']
matchup_totals = deliveries.groupby(matchup_keys, as_index=False).agg(
    runs=('batsman_runs', 'sum'),
    balls=('ball', 'count'),
    wickets=('is_wicket', 'sum'),
)

# Attach each match's date and venue; the duplicate join key `id` is dropped
# once the left join is done.
match_info = matches[['id', 'date', 'venue']]
player_match = pd.merge(
    matchup_totals,
    match_info,
    left_on='match_id',
    right_on='id',
    how='left',
).drop(columns='id')

# Strike rate = runs per 100 delivery rows in this matchup.
player_match['strike_rate'] = player_match['runs'].div(player_match['balls']).mul(100)
player_match.head()

Unnamed: 0,match_id,batter,bowler,runs,balls,wickets,date,venue,strike_rate
0,335982,AA Noffke,AB Agarkar,2,8,0,2008-04-18,M Chinnaswamy Stadium,25.0
1,335982,AA Noffke,SC Ganguly,7,4,1,2008-04-18,M Chinnaswamy Stadium,175.0
2,335982,B Akhil,AB Agarkar,0,2,1,2008-04-18,M Chinnaswamy Stadium,0.0
3,335982,BB McCullum,AA Noffke,24,13,0,2008-04-18,M Chinnaswamy Stadium,184.615385
4,335982,BB McCullum,CL White,16,4,0,2008-04-18,M Chinnaswamy Stadium,400.0


In [14]:
# Order each batter's rows by date so the rolling windows run forward in time.
player_match = player_match.sort_values(by=['batter', 'date'])

# Trailing mean over the batter's last 5 rows (current row included; fewer
# than 5 rows are averaged as-is because min_periods=1). Rows here are
# per-(match, bowler) matchups, so a window can span less than 5 matches.
for source_col, rolling_col in (
    ('runs', 'rolling_runs_5'),
    ('strike_rate', 'rolling_sr_5'),
):
    per_batter = player_match.groupby('batter')[source_col]
    trailing_mean = per_batter.rolling(window=5, min_periods=1).mean()
    # Drop the 'batter' index level added by groupby so the values align
    # with player_match's own index on assignment.
    player_match[rolling_col] = trailing_mean.reset_index(level=0, drop=True)
print(player_match.head())

       match_id          batter         bowler  runs  balls  wickets  \
14036    548346  A Ashish Reddy   JEC Franklin     3      3        0   
14037    548346  A Ashish Reddy       MM Patel     0      2        1   
14038    548346  A Ashish Reddy    RJ Peterson     7      5        0   
14333    548352  A Ashish Reddy  BW Hilfenhaus     2      2        1   
14334    548352  A Ashish Reddy       DJ Bravo     1      1        0   

             date                            venue  strike_rate  \
14036  2012-04-29                 Wankhede Stadium        100.0   
14037  2012-04-29                 Wankhede Stadium          0.0   
14038  2012-04-29                 Wankhede Stadium        140.0   
14333  2012-05-04  MA Chidambaram Stadium, Chepauk        100.0   
14334  2012-05-04  MA Chidambaram Stadium, Chepauk        100.0   

       rolling_runs_5  rolling_sr_5  
14036        3.000000         100.0  
14037        1.500000          50.0  
14038        3.333333          80.0  
14333       

In [15]:
# Average runs and strike rate of every batter at every venue, taken over all
# rows currently in player_match.
# NOTE(review): the averages include rows dated after the row they are merged
# onto, so they carry look-ahead information if used as predictive features.
venue_keys = ['venue', 'batter']
per_venue_batter = player_match.groupby(venue_keys, as_index=False)
venue_stats = per_venue_batter.agg(
    venue_avg_runs=('runs', 'mean'),
    venue_avg_sr=('strike_rate', 'mean'),
)

# Broadcast the venue-level averages back onto each matchup row, then discard
# the raw venue label, which is not used after this point in the cell.
player_match = pd.merge(
    player_match,
    venue_stats,
    on=venue_keys,
    how='left',
).drop(columns=['venue'])
print(player_match.head())


   match_id          batter         bowler  runs  balls  wickets        date  \
0    548346  A Ashish Reddy   JEC Franklin     3      3        0  2012-04-29   
1    548346  A Ashish Reddy       MM Patel     0      2        1  2012-04-29   
2    548346  A Ashish Reddy    RJ Peterson     7      5        0  2012-04-29   
3    548352  A Ashish Reddy  BW Hilfenhaus     2      2        1  2012-05-04   
4    548352  A Ashish Reddy       DJ Bravo     1      1        0  2012-05-04   

   strike_rate  rolling_runs_5  rolling_sr_5  venue_avg_runs  venue_avg_sr  
0        100.0        3.000000         100.0        3.333333          80.0  
1          0.0        1.500000          50.0        3.333333          80.0  
2        140.0        3.333333          80.0        3.333333          80.0  
3        100.0        3.000000          85.0        7.800000         198.0  
4        100.0        2.600000          88.0        7.800000         198.0  


In [16]:
# Head-to-head record for every (batter, bowler) pair across all deliveries:
# runs scored, delivery rows, wicket flags, and the resulting strike rate.
# NOTE(review): totals span the whole deliveries table, so each matchup row
# also sees deliveries from later matches (look-ahead if used for prediction).
pair_keys = ['batter', 'bowler']
pvp = deliveries.groupby(pair_keys, as_index=False).agg(
    pvp_runs=('batsman_runs', 'sum'),
    pvp_balls=('ball', 'count'),
    pvp_wickets=('is_wicket', 'sum'),
)
pvp['pvp_sr'] = pvp['pvp_runs'].div(pvp['pvp_balls']).mul(100)

# Attach the pair-level record to each matchup row.
player_match = pd.merge(player_match, pvp, on=pair_keys, how='left')

# Remove every row that has a missing value in ANY column, then renumber.
player_match = player_match.dropna().reset_index(drop=True)
print(player_match.head())


   match_id          batter         bowler  runs  balls  wickets        date  \
0    548346  A Ashish Reddy   JEC Franklin     3      3        0  2012-04-29   
1    548346  A Ashish Reddy       MM Patel     0      2        1  2012-04-29   
2    548346  A Ashish Reddy    RJ Peterson     7      5        0  2012-04-29   
3    548352  A Ashish Reddy  BW Hilfenhaus     2      2        1  2012-05-04   
4    548352  A Ashish Reddy       DJ Bravo     1      1        0  2012-05-04   

   strike_rate  rolling_runs_5  rolling_sr_5  venue_avg_runs  venue_avg_sr  \
0        100.0        3.000000         100.0        3.333333          80.0   
1          0.0        1.500000          50.0        3.333333          80.0   
2        140.0        3.333333          80.0        3.333333          80.0   
3        100.0        3.000000          85.0        7.800000         198.0   
4        100.0        2.600000          88.0        7.800000         198.0   

   pvp_runs  pvp_balls  pvp_wickets      pvp_sr  


In [17]:
# Career-level summary per batter, computed from the matchup rows:
# total runs, distinct matches played, and mean per-row runs / strike rate.
# NOTE(review): computed over every row for the batter, including rows dated
# after the row the summary is merged onto (look-ahead if used for prediction).
per_batter = player_match.groupby('batter', as_index=False)
career = per_batter.agg(
    career_runs=('runs', 'sum'),
    career_matches=('match_id', 'nunique'),
    career_avg_runs=('runs', 'mean'),
    career_avg_sr=('strike_rate', 'mean'),
)

# Attach the career summary to each matchup row.
player_match = pd.merge(player_match, career, on='batter', how='left')

# Remove every row that has a missing value in ANY column, then renumber.
player_match = player_match.dropna().reset_index(drop=True)
print(player_match.head())


   match_id          batter         bowler  runs  balls  wickets        date  \
0    548346  A Ashish Reddy   JEC Franklin     3      3        0  2012-04-29   
1    548346  A Ashish Reddy       MM Patel     0      2        1  2012-04-29   
2    548346  A Ashish Reddy    RJ Peterson     7      5        0  2012-04-29   
3    548352  A Ashish Reddy  BW Hilfenhaus     2      2        1  2012-05-04   
4    548352  A Ashish Reddy       DJ Bravo     1      1        0  2012-05-04   

   strike_rate  rolling_runs_5  rolling_sr_5  venue_avg_runs  venue_avg_sr  \
0        100.0        3.000000         100.0        3.333333          80.0   
1          0.0        1.500000          50.0        3.333333          80.0   
2        140.0        3.333333          80.0        3.333333          80.0   
3        100.0        3.000000          85.0        7.800000         198.0   
4        100.0        2.600000          88.0        7.800000         198.0   

   pvp_runs  pvp_balls  pvp_wickets      pvp_sr  c

In [18]:
# Build the supervised target: the batter's total runs in their NEXT match.
#
# player_match holds one row per (match, batter, bowler). Shifting the row-level
# `runs` by one within a batter therefore returns the runs scored against the
# next bowler -- usually in the SAME match -- rather than the next match's runs.
# The target is instead computed on a per-(batter, match) table and merged back,
# so every matchup row of a match shares the same next-match target.
player_match = player_match.sort_values(['batter', 'date', 'match_id'])

# One row per (batter, match): runs summed over all bowlers faced in that match.
batter_matches = (
    player_match
    .groupby(['batter', 'match_id'], as_index=False, sort=False)
    .agg(date=('date', 'first'), match_runs=('runs', 'sum'))
    .sort_values(['batter', 'date', 'match_id'])
)
batter_matches['next_match_runs'] = (
    batter_matches.groupby('batter')['match_runs'].shift(-1)
)

# Drop any target left over from a previous run of this cell so the merge does
# not create suffixed duplicate columns. The merge also renumbers the index.
player_match = (
    player_match
    .drop(columns='next_match_runs', errors='ignore')
    .merge(
        batter_matches[['batter', 'match_id', 'next_match_runs']],
        on=['batter', 'match_id'],
        how='left',
        validate='many_to_one',
    )
)

# Rows from a batter's last recorded match have no following match: no target.
player_match = player_match.dropna(subset=['next_match_runs'])

feature_cols = [
    'runs', 'balls', 'wickets', 'strike_rate',
    'rolling_runs_5', 'rolling_sr_5',
    'venue_avg_runs', 'venue_avg_sr',
    'pvp_runs', 'pvp_balls', 'pvp_wickets', 'pvp_sr',
    'career_runs', 'career_matches', 'career_avg_runs', 'career_avg_sr'
]
X = player_match[feature_cols]
y = player_match['next_match_runs']

# Preview the feature matrix and target (nothing is written to disk here).
X.head(), y.head()



(   runs  balls  wickets  strike_rate  rolling_runs_5  rolling_sr_5  \
 0     3      3        0        100.0        3.000000         100.0   
 1     0      2        1          0.0        1.500000          50.0   
 2     7      5        0        140.0        3.333333          80.0   
 3     2      2        1        100.0        3.000000          85.0   
 4     1      1        0        100.0        2.600000          88.0   
 
    venue_avg_runs  venue_avg_sr  pvp_runs  pvp_balls  pvp_wickets      pvp_sr  \
 0        3.333333          80.0         3          3            0  100.000000   
 1        3.333333          80.0         0          2            1    0.000000   
 2        3.333333          80.0         7          5            0  140.000000   
 3        7.800000         198.0         2          2            1  100.000000   
 4        7.800000         198.0        24         13            1  184.615385   
 
    career_runs  career_matches  career_avg_runs  career_avg_sr  
 0          

In [20]:
# Columns fed to the model, grouped by the kind of signal they carry.
FEATURES = (
    ['rolling_runs_5', 'rolling_sr_5']         # recent form
    + ['venue_avg_runs', 'venue_avg_sr']       # record at the venue
    + ['pvp_runs', 'pvp_wickets']              # batter-vs-bowler record
    + ['career_avg_runs', 'career_avg_sr']     # career aggregates
)

y = player_match['next_match_runs']
X = player_match.loc[:, FEATURES]

# Standardise every feature column. Note the scaler statistics here are
# learned from ALL rows of X, not from a training subset.
pipeline = Pipeline(steps=[('scaler', StandardScaler())])
X_scaled = pipeline.fit(X).transform(X)



In [10]:
# TimeSeriesSplit cuts folds purely by row position: each fold trains on the
# leading rows and tests on the rows that follow. X and y come from
# player_match, which is ordered by ['batter', 'date'] -- alphabetically by
# batter first -- so splitting them as-is would separate folds by batter name
# rather than by time. Reorder both chronologically before splitting; the
# stable sort keeps the existing relative order of rows that share a date.
match_dates = pd.to_datetime(player_match.loc[X.index, 'date'])
chrono_order = np.argsort(match_dates.to_numpy(), kind='stable')

# X and y are rebound (not copied under new names) so that the positional
# train_idx / test_idx below stay valid for anything that indexes X or y later.
X = X.iloc[chrono_order]
y = y.iloc[chrono_order]

tscv = TimeSeriesSplit(n_splits=5)

# Each iteration overwrites the previous one, so after the loop the four
# variables hold the final fold (the largest training window).
for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


In [11]:
# Standardise features using statistics learned from the training fold only,
# then apply that same fitted transform to the held-out fold.
scaling_steps = [('scaler', StandardScaler())]
feature_pipeline = Pipeline(steps=scaling_steps)

feature_pipeline.fit(X_train)
X_train_scaled = feature_pipeline.transform(X_train)
X_test_scaled = feature_pipeline.transform(X_test)


In [12]:
# Persist the feature-engineered table as CSV (row index omitted).
dataset_csv = "../data/cleaned/dataset.csv"
player_match.to_csv(dataset_csv, index=False)

# Serialise the fitted scaling pipeline. joblib.dump returns the list of file
# names it wrote, which becomes the cell's displayed output.
pipeline_pkl = "../data/cleaned/feature_pipeline.pkl"
joblib.dump(feature_pipeline, pipeline_pkl)


['../data/cleaned/feature_pipeline.pkl']