In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# INLINE Week 1-2 cleaning 
ball = pd.read_csv("deliveries.csv")
matches = pd.read_csv("matches.csv")

ball.columns = ball.columns.str.lower().str.replace(" ", "_")
matches.columns = matches.columns.str.lower().str.replace(" ", "_")

ball['runs_of_bat'] = ball['runs_of_bat'].fillna(0)
ball['wicket_type'] = ball['wicket_type'].fillna('Unknown')

ipl = ball.merge(matches[['match_id', 'date', 'venue', 'season', 'team1', 'team2']], 
                 on='match_id', how='left')

print(" INLINE Cleaning Complete!")
print(f"Shape: {ipl.shape}")
ipl.head(2)


 INLINE Cleaning Complete!
Shape: (17103, 26)


Unnamed: 0,match_id,season_x,match_no,date_x,venue_x,batting_team,bowling_team,innings,over,striker,...,byes,noballs,wicket_type,player_dismissed,fielder,date_y,venue_y,season_y,team1,team2
0,202401,2024,1,2024-03-22,"MA Chidambaram Stadium, Chepauk, Chennai, Chennai",RCB,CSK,1,0.1,V Kohli,...,0,0,Unknown,,,2024-03-22,"MA Chidambaram Stadium, Chepauk, Chennai",2024,RCB,CSK
1,202401,2024,1,2024-03-22,"MA Chidambaram Stadium, Chepauk, Chennai, Chennai",RCB,CSK,1,0.1,V Kohli,...,0,0,Unknown,,,2024-03-22,"MA Chidambaram Stadium, Chepauk, Chennai",2024,RCB,CSK


In [3]:
print("Your ipl columns:")
print([col for col in ipl.columns if 'date' in col.lower() or 'venue' in col.lower()])
print("\nstriker exists?", 'striker' in ipl.columns)
print("match_id exists?", 'match_id' in ipl.columns)
ipl.head(1)


Your ipl columns:
['date_x', 'venue_x', 'date_y', 'venue_y']

striker exists? True
match_id exists? True


Unnamed: 0,match_id,season_x,match_no,date_x,venue_x,batting_team,bowling_team,innings,over,striker,...,byes,noballs,wicket_type,player_dismissed,fielder,date_y,venue_y,season_y,team1,team2
0,202401,2024,1,2024-03-22,"MA Chidambaram Stadium, Chepauk, Chennai, Chennai",RCB,CSK,1,0.1,V Kohli,...,0,0,Unknown,,,2024-03-22,"MA Chidambaram Stadium, Chepauk, Chennai",2024,RCB,CSK


In [4]:
# 1️⃣ AGGREGATE: Use YOUR column names
player_match = ipl.groupby(['striker', 'match_id', 'date_x', 'venue_x', 'batting_team']).agg({
    'runs_of_bat': ['sum', 'count'],           # total runs, balls faced
    'wicket_type': lambda x: (x != 'Unknown').sum()  # dismissed (0/1)
}).reset_index()

# Flatten columns EXACTLY (5 groupby + 3 agg = 8 cols)
player_match.columns = ['player', 'match_id', 'date', 'venue', 'team', 
                       'runs', 'balls_faced', 'dismissed']

# Add strike rate
player_match['strike_rate'] = (player_match['runs'] / player_match['balls_faced']) * 100

# Filter minimum balls + sort
player_match = player_match[player_match['balls_faced'] >= 5].sort_values(['player', 'date'])
player_match['date'] = pd.to_datetime(player_match['date'])

print("✅ Player-Match Aggregation: ", len(player_match), " rows")
player_match.head()


✅ Player-Match Aggregation:  866  rows


Unnamed: 0,player,match_id,date,venue,team,runs,balls_faced,dismissed,strike_rate
0,A Badoni,202404,2024-03-24,"Sawai Mansingh Stadium, Jaipur, Jaipur",LSG,1,5,1,20.0
1,A Badoni,202411,2024-03-30,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,LSG,8,10,1,80.0
3,A Badoni,202421,2024-04-07,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,LSG,20,12,1,166.666667
4,A Badoni,202426,2024-04-12,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,LSG,55,38,0,144.736842
5,A Badoni,202428,2024-04-14,"Eden Gardens, Kolkata, Kolkata",LSG,29,27,1,107.407407


In [5]:
# 2️⃣ Create TARGET: Next match runs
player_match['next_runs'] = player_match.groupby('player')['runs'].shift(-1)
player_match.dropna(subset=['next_runs'], inplace=True)

print("✅ Labels Created:", len(player_match), "rows")
player_match[['player', 'runs', 'next_runs', 'date']].head()


✅ Labels Created: 720 rows


Unnamed: 0,player,runs,next_runs,date
0,A Badoni,1,8.0,2024-03-24
1,A Badoni,8,20.0,2024-03-30
3,A Badoni,20,55.0,2024-04-07
4,A Badoni,55,29.0,2024-04-12
5,A Badoni,29,18.0,2024-04-14


In [6]:
# 3️⃣ FORM: Rolling averages
player_match['form_runs_5'] = player_match.groupby('player')['runs'].rolling(5, min_periods=1).mean().reset_index(0, drop=True)
player_match['form_sr_5'] = player_match.groupby('player')['strike_rate'].rolling(5, min_periods=1).mean().reset_index(0, drop=True)

print("✅ Form Features Added")
player_match[['form_runs_5', 'form_sr_5']].head()


✅ Form Features Added


Unnamed: 0,form_runs_5,form_sr_5
0,1.0,20.0
1,4.5,50.0
3,9.666667,88.888889
4,21.0,102.850877
5,22.6,103.762183


In [7]:
# 3️⃣ VENUE: Player avg at this venue
player_match['venue_runs_avg'] = player_match.groupby(['player', 'venue'])['runs'].transform('mean')
print("✅ Venue Features Added")
player_match[['venue', 'venue_runs_avg']].head()


✅ Venue Features Added


Unnamed: 0,venue,venue_runs_avg
0,"Sawai Mansingh Stadium, Jaipur, Jaipur",1.0
1,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,20.333333
3,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,20.333333
4,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,20.333333
5,"Eden Gardens, Kolkata, Kolkata",29.0


In [8]:
# 3️⃣ PvT: Get opponent team for each match
opponent_map = ipl.groupby('match_id')['bowling_team'].first().reset_index()
opponent_map.columns = ['match_id', 'opponent_team']

# MERGE (safe, no reindex error)
player_match = player_match.merge(opponent_map, on='match_id', how='left')

# PvT average vs this opponent
player_match['PvT_runs_avg'] = player_match.groupby(['player', 'opponent_team'])['runs'].transform('mean')

print("✅ PvT Features Added")
player_match[['player', 'opponent_team', 'PvT_runs_avg']].head()


✅ PvT Features Added


Unnamed: 0,player,opponent_team,PvT_runs_avg
0,A Badoni,LSG,7.0
1,A Badoni,PBKS,8.0
2,A Badoni,GT,20.0
3,A Badoni,DC,55.0
4,A Badoni,KKR,29.0


In [9]:
# 3️⃣ PvP: Main bowler per match
bowler_map = ipl.groupby('match_id')['bowler'].agg(lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown').reset_index()
bowler_map.columns = ['match_id', 'main_bowler']

# MERGE (safe)
player_match = player_match.merge(bowler_map, on='match_id', how='left')

# PvP average vs this bowler
player_match['PvP_runs_avg'] = player_match.groupby(['player', 'main_bowler'])['runs'].transform('mean')

print("✅ PvP Features Added")
player_match[['player', 'main_bowler', 'PvP_runs_avg']].head()


✅ PvP Features Added


Unnamed: 0,player,main_bowler,PvP_runs_avg
0,A Badoni,Naveen-ul-Haq,1.0
1,A Badoni,HV Patel,8.0
2,A Badoni,M Siddharth,20.0
3,A Badoni,I Sharma,30.5
4,A Badoni,S Joseph,29.0


In [10]:
# 3️⃣ CAREER: Reset index + simple calculations
player_match = player_match.reset_index(drop=True)  # FIX MultiIndex

# Calculate career stats SAFELY
career_runs = player_match.groupby('player')['runs'].expanding().mean().reset_index(0, drop=True)
player_match['career_runs_avg'] = career_runs

player_match['matches_played'] = player_match.groupby('player').cumcount() + 1

print("✅ Career Features Added")
player_match[['player', 'career_runs_avg', 'matches_played']].head()


✅ Career Features Added


Unnamed: 0,player,career_runs_avg,matches_played
0,A Badoni,1.0,1
1,A Badoni,4.5,2
2,A Badoni,9.666667,3
3,A Badoni,21.0,4
4,A Badoni,22.6,5


In [11]:
# 4️⃣ SPLIT: Chronological 80/20
player_match = player_match.sort_values('date')
split_idx = int(len(player_match) * 0.8)

train = player_match.iloc[:split_idx].copy()
test = player_match.iloc[split_idx:].copy()

print("✅ Split: Train", len(train), "| Test", len(test))


✅ Split: Train 576 | Test 144


In [12]:
# 5️⃣ FEATURES (fill NaN first)
features = ['form_runs_5', 'form_sr_5', 'venue_runs_avg', 'PvT_runs_avg', 
           'PvP_runs_avg', 'career_runs_avg', 'matches_played']

# Fill ALL NaN with 0 (safety)
for feature in features:
    if feature in player_match.columns:
        player_match[feature] = player_match[feature].fillna(0)

X_train = train[features].copy()
y_train = train['next_runs'].copy()
X_test = test[features].copy()
y_test = test['next_runs'].copy()

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipeline = Pipeline([('scaler', StandardScaler())])
X_train_scaled = pipeline.fit_transform(X_train)
X_test_scaled = pipeline.transform(X_test)

print("✅ Pipeline Ready!")
print(f"Features: {len(features)}")
print(f"X_train shape: {X_train_scaled.shape}")


✅ Pipeline Ready!
Features: 7
X_train shape: (576, 7)


In [16]:
# FINAL dataset.csv (train data only)
final_dataset = pd.DataFrame(X_train_scaled, columns=features)
final_dataset['target'] = y_train.values
final_dataset.to_csv("dataset.csv", index=False)
print("✅ dataset.csv saved! Shape:", final_dataset.shape)

# Save pipeline
import joblib
joblib.dump(pipeline, "feature_pipeline.pkl")
print("✅ feature_pipeline.pkl saved!")
print("• Aggregate ball-by-ball")
print("• Rolling for") 
print("• Venue/PvT/PvP/Career")
print("• Next-match labels")
print("• Time-series split")


✅ dataset.csv saved! Shape: (576, 8)
✅ feature_pipeline.pkl saved!
• Aggregate ball-by-ball
• Rolling for
• Venue/PvT/PvP/Career
• Next-match labels
• Time-series split
