In [None]:
#  import necessary libraries
import pandas as pd
import numpy as np

# Read the cleaned matches and deliveries tables from the working directory.
matches, deliveries = (
    pd.read_csv(csv_name)
    for csv_name in ("matches_cleaned.csv", "deliveries_cleaned.csv")
)

# Report (rows, columns) for each table, matches first.
for table in (matches, deliveries):
    print(table.shape)

(1095, 20)
(260920, 17)


In [None]:
# Create player-match level batting data: runs, balls faced and strike rate.
#
# Balls faced must exclude wides: a wide is not a delivery faced by the
# batter, so counting every row of `deliveries` overstates `balls` and
# understates `strike_rate`. No-balls are kept because the batter does face them.
# NOTE(review): assumes wides are labelled in an `extras_type` column with a
# value containing "wide" -- confirm against deliveries_cleaned.csv. When the
# column is absent every delivery is counted, which is the previous behaviour.
if 'extras_type' in deliveries.columns:
    is_wide = deliveries['extras_type'].astype(str).str.lower().str.contains('wide')
else:
    is_wide = pd.Series(False, index=deliveries.index)

# Summing the boolean flag (instead of filtering rows) keeps a batter who only
# received wides in the result, with balls == 0.
player_match = (
    deliveries.assign(_faced_ball=~is_wide)
    .groupby(['match_id', 'batter'])
    .agg(runs=('batsman_runs', 'sum'), balls=('_faced_ball', 'sum'))
    .reset_index()
    .rename(columns={'batter': 'player'})
)

# Strike rate = runs per 100 balls faced. With balls == 0 the ratio is 0/0,
# so mask the denominator and report 0.0 rather than propagating NaN.
balls_faced = player_match['balls'].where(player_match['balls'] > 0)
player_match['strike_rate'] = ((player_match['runs'] / balls_faced) * 100).fillna(0.0)

player_match.head()


Unnamed: 0,match_id,player,runs,balls,strike_rate
0,335982,AA Noffke,9,12,75.0
1,335982,B Akhil,0,2,0.0
2,335982,BB McCullum,158,77,205.194805
3,335982,CL White,6,10,60.0
4,335982,DJ Hussey,12,12,100.0


In [None]:
# Attach match-level context (season, venue, teams, winner) to every
# player-match row.
match_columns = ['id', 'season', 'venue', 'team1', 'team2', 'winner']
match_info = matches[match_columns]

# Renaming the key to 'match_id' for the join means no separate 'id' column
# is carried into the result. Inner join: rows without a matching match are dropped.
player_match = player_match.merge(
    match_info.rename(columns={'id': 'match_id'}),
    on='match_id',
    how='inner',
)

player_match.head()


Unnamed: 0,match_id,player,runs,balls,strike_rate,season,venue,team1,team2,winner
0,335982,AA Noffke,9,12,75.0,2007/08,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders
1,335982,B Akhil,0,2,0.0,2007/08,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders
2,335982,BB McCullum,158,77,205.194805,2007/08,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders
3,335982,CL White,6,10,60.0,2007/08,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders
4,335982,DJ Hussey,12,12,100.0,2007/08,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders


In [None]:
# Order each player's rows by match so the later rolling-window and shift
# steps operate on a per-player sequence.
# NOTE(review): match_id is the ordering key -- presumably ids increase
# chronologically; confirm against the match dates.
sort_keys = ['player', 'match_id']
player_match = player_match.sort_values(by=sort_keys, ascending=True)


In [None]:
# Recent-form features: per-player mean over a window of up to 5 rows ending
# at (and including) the current match. min_periods=1 means the early rows of
# a player use however many matches are available instead of yielding NaN.
form_window = 5
form_features = {'avg_last_5': 'runs', 'sr_last_5': 'strike_rate'}

for feature_name, source_column in form_features.items():
    per_player = (
        player_match.groupby('player')[source_column]
        .rolling(form_window, min_periods=1)
        .mean()
    )
    # Drop the 'player' index level so the values align with the frame's own
    # index when assigned back.
    player_match[feature_name] = per_player.reset_index(level=0, drop=True)


In [None]:
# Average runs per venue, using only the matches played so far.
#
# A plain per-(player, venue) mean over the whole frame includes matches that
# come after the current row -- among them the match whose runs become the
# `next_match_runs` target -- so the feature would leak future information
# into training. An expanding mean restricts each row to the player's matches
# at that venue up to and including the current one.

# Expanding windows follow row order, so make the per-player ordering explicit
# here (match_id is the ordering key this notebook sorts by).
player_match = player_match.sort_values(by=['player', 'match_id']).reset_index(drop=True)

# groupby(...).expanding() prepends the group keys to the index; dropping
# those two levels restores the frame's own index so assignment aligns by row.
player_match['venue_avg_runs'] = (
    player_match.groupby(['player', 'venue'])['runs']
    .expanding()
    .mean()
    .reset_index(level=[0, 1], drop=True)
)


In [None]:
# Target variable: the runs the same player scores in the following row.
# shift(-1) within each player group pulls the next row's runs up by one, so
# it relies on the rows already being ordered per player; the final row of
# every player gets NaN because nothing follows it.
next_runs = player_match.groupby('player')['runs'].shift(-1)
player_match = player_match.assign(next_match_runs=next_runs)

# Remove every row that has a missing value in ANY column -- this drops the
# target-less final rows, and also any row with a gap in another column.
player_match = player_match.dropna(axis=0, how='any')


In [None]:
# Write the ML dataset to CSV, omitting the row index.
output_file = "player_ml_dataset.csv"
player_match.to_csv(output_file, index=False)

print("ML dataset created successfully!")

ML dataset created successfully!


In [None]:
# Preview of the final dataset.
# A notebook cell renders only its last expression automatically, so the head
# has to be displayed explicitly; as a bare statement its result is discarded.
# `display` is the built-in provided by the IPython/Jupyter kernel.
display(player_match.head())

# (rows, columns) of the final dataset, rendered as the cell's output.
player_match.shape

(15842, 14)