In [3]:
import os

import numpy as np
import pandas as pd

In [4]:
# Load the raw ball-by-ball and match-level CSVs (paths are relative to the
# notebook's working directory).
# NOTE(review): `matches` is not referenced by any cell visible in this
# notebook section — confirm it is needed.
deliveries = pd.read_csv(filepath_or_buffer="../data/raw/deliveries.csv")
matches = pd.read_csv(filepath_or_buffer="../data/raw/matches.csv")

In [5]:
# Columns of the ball-by-ball data that are carried forward for feature building.
batting_columns = ['match_id', 'batter', 'bowler', 'batsman_runs', 'is_wicket']
selected_data = deliveries[batting_columns]

In [6]:
# Collapse the ball-by-ball rows into one row per (match_id, batter).
# NOTE(review): 'count' tallies every non-null delivery row recorded against
# the batter; if the data contains wides or other non-legal deliveries they
# are included in balls_faced — confirm that is the intended definition.
# NOTE(review): wickets_lost sums is_wicket over the rows where this player is
# listed as `batter`; presumably that equals the batter's own dismissals —
# verify for dismissals of the other batter (e.g. run-outs).
player_match_stats = (
    selected_data
    .groupby(['match_id', 'batter'])
    .agg(
        runs_scored=pd.NamedAgg(column='batsman_runs', aggfunc='sum'),
        balls_faced=pd.NamedAgg(column='batsman_runs', aggfunc='count'),
        wickets_lost=pd.NamedAgg(column='is_wicket', aggfunc='sum'),
    )
    .reset_index()
)

In [7]:
# Preview the first five per-match batting rows.
player_match_stats.head(n=5)

Unnamed: 0,match_id,batter,runs_scored,balls_faced,wickets_lost
0,335982,AA Noffke,9,12,1
1,335982,B Akhil,0,2,1
2,335982,BB McCullum,158,77,0
3,335982,CL White,6,10,1
4,335982,DJ Hussey,12,12,1


In [8]:
# Order rows by batter and then by match_id so that each batter's rows are
# contiguous and ascending in match_id; the per-batter rolling and shift
# computations later in the notebook depend on this row order.
# NOTE(review): this treats match_id order as chronological order — confirm.
player_match_stats = player_match_stats.sort_values(by=['batter', 'match_id'])

In [10]:
# Create the output directory for processed files if it does not exist yet
# (`os` is imported in the notebook's top import cell).
# NOTE(review): raw inputs are read from "../data/raw", whereas this path is
# relative to the current working directory ("data/processed") — confirm the
# processed files are meant to land next to the notebook rather than in
# "../data/processed".
os.makedirs("data/processed", exist_ok=True)

In [11]:
# Persist the per-match batting aggregates without the DataFrame index.
# NOTE(review): at this point in the notebook the rolling-average and
# next-match columns have not been added yet, so the saved file contains only
# the base aggregates — confirm that is intended for "player_match_features".
player_match_stats.to_csv(path_or_buf="data/processed/player_match_features.csv", index=False)

In [12]:
# NOTE(review): this writes the same frame to the same path with the same
# arguments as the preceding cell; one of the two writes is likely redundant.
player_match_stats.to_csv(
    path_or_buf="data/processed/player_match_features.csv",
    index=False,
)

In [13]:
# Rolling batting-form features: for every batter, the mean of a statistic
# over a window of up to 5 rows ending at the current row. min_periods=1 lets
# a batter's first rows use a shorter window instead of producing NaN.
# Windows follow the frame's current row order within each batter.
for stat_column, form_column in (
    ('runs_scored', 'avg_runs_last_5'),
    ('balls_faced', 'avg_balls_last_5'),
):
    per_batter_mean = (
        player_match_stats
        .groupby('batter')[stat_column]
        .rolling(window=5, min_periods=1)
        .mean()
    )
    # Drop the 'batter' index level added by groupby so the values align with
    # the frame's own index on assignment.
    player_match_stats[form_column] = per_batter_mean.reset_index(level=0, drop=True)

In [14]:
# Preview the first five rows, now including the rolling-average columns.
player_match_stats.head(n=5)

Unnamed: 0,match_id,batter,runs_scored,balls_faced,wickets_lost,avg_runs_last_5,avg_balls_last_5
4299,548346,A Ashish Reddy,10,10,1,10.0,10.0
4390,548352,A Ashish Reddy,3,3,1,6.5,6.5
4496,548359,A Ashish Reddy,8,8,1,7.0,7.0
4699,548373,A Ashish Reddy,10,4,0,7.75,6.25
4747,548376,A Ashish Reddy,4,5,1,7.0,6.0


In [15]:
# Prediction target: the same batter's runs_scored from the following row.
# The last row of each batter has no following row and therefore gets NaN.
player_match_stats['next_match_runs'] = (
    player_match_stats.groupby('batter')['runs_scored'].shift(periods=-1)
)

In [16]:
# Spot-check that next_match_runs on each row equals runs_scored on the next row.
player_match_stats.head(n=10)[['batter', 'runs_scored', 'next_match_runs']]

Unnamed: 0,batter,runs_scored,next_match_runs
4299,A Ashish Reddy,10,3.0
4390,A Ashish Reddy,3,8.0
4496,A Ashish Reddy,8,10.0
4699,A Ashish Reddy,10,4.0
4747,A Ashish Reddy,4,7.0
4866,A Ashish Reddy,7,14.0
4933,A Ashish Reddy,14,16.0
5027,A Ashish Reddy,16,4.0
5076,A Ashish Reddy,4,19.0
5160,A Ashish Reddy,19,7.0


In [17]:
# Keep only rows that have a target: each batter's last row has no following
# match, so its next_match_runs is NaN. Restricting dropna to that column
# avoids silently discarding rows because of a NaN in an unrelated column.
#
# The frame is ordered by batter first, so it is re-sorted by match_id before
# splitting; otherwise the positional cut below would separate batters
# alphabetically instead of separating earlier matches from later ones.
# NOTE(review): assumes match_id increases with match date — confirm (e.g.
# against matches.csv if it carries match dates).
final_data = (
    player_match_stats
    .dropna(subset=['next_match_runs'])
    .sort_values(['match_id', 'batter'])
    .reset_index(drop=True)
)

# Time-series split: the first 80% of rows (lowest match_ids) are used for
# training and the remaining 20% for testing.
split_index = int(len(final_data) * 0.8)

train_data = final_data.iloc[:split_index]
test_data = final_data.iloc[split_index:]

In [19]:
# Report the (rows, columns) shape of each split.
for label, split_frame in ((" Train size:", train_data), ("Test size:", test_data)):
    print(label, split_frame.shape)

 Train size: (12673, 8)
Test size: (3169, 8)


In [20]:
# Write both splits to the processed-data directory without the DataFrame index.
for split_frame, csv_path in (
    (train_data, "data/processed/train_features.csv"),
    (test_data, "data/processed/test_features.csv"),
):
    split_frame.to_csv(csv_path, index=False)