In [1]:
from pathlib import Path

import pandas as pd


In [38]:
# Source data location. Kept in a single constant so that pointing the notebook
# at another machine or checkout only requires editing DATA_DIR.
DATA_DIR = Path(r'C:\Users\amadh\OneDrive\Desktop\Project Series\Cricket-Prediction\historical_data\cricsheetSource')

# Load the three source tables; all of them carry a `match_id` column.
deliveries = pd.read_parquet(DATA_DIR / 'deliveries.parquet')
match_players = pd.read_parquet(DATA_DIR / 'match_players.parquet')
matches = pd.read_parquet(DATA_DIR / 'matches.parquet')

In [39]:
# List the columns available in the ball-by-ball table.
print(deliveries.columns)

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')


In [40]:
# List the line-up columns. NOTE(review): the recorded output shows inconsistent
# naming -- team1 uses 'team1_playerN' while team2 uses 'team2_playersN'.
print(match_players.columns)

Index(['team1_player1', 'team1_player2', 'team1_player3', 'team1_player4',
       'team1_player5', 'team1_player6', 'team1_player7', 'team1_player8',
       'team1_player9', 'team1_player10', 'team1_player11', 'team2_players1',
       'team2_players2', 'team2_players3', 'team2_players4', 'team2_players5',
       'team2_players6', 'team2_players7', 'team2_players8', 'team2_players9',
       'team2_players10', 'team2_players11', 'match_id'],
      dtype='object')


In [41]:
# List the match-level columns. 'season' and 'venue' also appear in deliveries,
# so those names collide when the two tables are merged on match_id.
print(matches.columns)

Index(['team1', 'team2', 'gender', 'season', 'date', 'venue', 'city',
       'toss_winner', 'toss_decision', 'winner', 'match_id'],
      dtype='object')


In [49]:
# Attach the match-level columns to every delivery row. Column names present in
# both tables are disambiguated with the '_delivery' / '_match' suffixes.
merged_data = deliveries.merge(
    matches,
    on='match_id',
    suffixes=('_delivery', '_match'),
)


In [50]:
# Running innings total: bat runs plus extras, each accumulated within a
# (match_id, innings) group in the current row order.
innings_groups = merged_data.groupby(['match_id', 'innings'])
bat_runs_so_far = innings_groups['runs_off_bat'].cumsum()
extras_so_far = innings_groups['extras'].cumsum()
merged_data['cumulative_runs'] = bat_runs_so_far + extras_so_far


In [51]:
# Flag dismissals as an integer: 1 when player_dismissed holds a value,
# 0 when it is null.
wicket_fell = merged_data['player_dismissed'].notna()
merged_data['is_wicket'] = wicket_fell.astype(int)


In [52]:
# Running count of wickets lost within each (match_id, innings) group.
merged_data['cumulative_wickets'] = (
    merged_data
    .groupby(['match_id', 'innings'])['is_wicket']
    .cumsum()
)


In [53]:
# Calculate current run rate: runs scored per over bowled so far.
# `ball` is assumed to use Cricsheet's over.ball encoding (0.1 = first ball of
# the first over, 19.6 = sixth ball of the twentieth) -- TODO confirm against
# the source data. Dividing that value by 6 does not yield overs, so decode it
# into completed overs plus the fraction of the over in progress.
completed_overs = merged_data['ball'] // 1
# Round to strip floating-point noise from the decimal part; clip because wides
# and no-balls can push the delivery number within an over past 6.
balls_this_over = ((merged_data['ball'] - completed_overs) * 10).round().clip(upper=6)
overs_bowled = completed_overs + balls_this_over / 6
merged_data['current_run_rate'] = merged_data['cumulative_runs'] / overs_bowled


In [54]:
# Calculate the chase target from the first innings.
# The side batting second needs one run MORE than the first-innings total to
# win, so the target is that total plus one.
first_innings_runs = (
    merged_data.loc[merged_data['innings'] == 1]
    .groupby('match_id')['cumulative_runs']
    .max()
    .add(1)
    .rename('target_runs')
    .reset_index()
)
# Drop any target_runs left over from a previous run of this cell so that
# re-running it does not produce target_runs_x / target_runs_y duplicates.
# Left join: a match with no first-innings rows gets NaN as its target.
merged_data = pd.merge(
    merged_data.drop(columns=['target_runs'], errors='ignore'),
    first_innings_runs,
    on='match_id',
    how='left',
)


In [55]:
# Calculate required run rate: runs still needed per over remaining.
# NOTE(review): this is computed for every row, including first-innings rows,
# where a "required" rate has no cricketing meaning -- confirm downstream use.
INNINGS_OVERS = 20  # innings length in overs that this calculation assumes

# `ball` is assumed to use Cricsheet's over.ball encoding (0.1 ... 19.6) --
# TODO confirm against the source data. Decode it into overs bowled: completed
# overs plus the fraction of the over in progress. The decimal part is rounded
# to strip float noise and clipped at 6 because wides and no-balls can push the
# delivery number within an over past 6.
completed_overs = merged_data['ball'] // 1
balls_this_over = ((merged_data['ball'] - completed_overs) * 10).round().clip(upper=6)
overs_remaining = INNINGS_OVERS - (completed_overs + balls_this_over / 6)

runs_needed = merged_data['target_runs'] - merged_data['cumulative_runs']
# With no overs left a rate is undefined: yield NaN instead of dividing by zero.
merged_data['required_run_rate'] = runs_needed / overs_remaining.where(overs_remaining > 0)


In [56]:
# Calculate player statistics
# Batting statistics: total runs, balls faced and strike rate per striker,
# aggregated over every delivery in the dataset.
player_runs = deliveries.groupby('striker')['runs_off_bat'].sum().reset_index()
player_runs.columns = ['player_id', 'total_runs']
# A wide does not count as a ball faced. Treat a missing `wides` value as
# "not a wide" so the filter still works when ordinary deliveries store null
# instead of 0 in that column.
not_a_wide = deliveries['wides'].fillna(0) == 0
player_balls_faced = deliveries[not_a_wide].groupby('striker').size().reset_index(name='balls_faced')
# Inner join: strikers who only ever received wides have no balls_faced row
# and are dropped, which also keeps balls_faced >= 1 for the division below.
player_stats = pd.merge(player_runs, player_balls_faced, left_on='player_id', right_on='striker')
# Strike rate = runs per 100 balls faced.
player_stats['strike_rate'] = player_stats['total_runs'] / player_stats['balls_faced'] * 100


In [57]:
# Bowling statistics: runs conceded, legal balls bowled and economy rate per bowler.
# Missing extras values are treated as 0 so the arithmetic and the filters
# below still work when ordinary deliveries store null in those columns.
wides_filled = deliveries['wides'].fillna(0)
noballs_filled = deliveries['noballs'].fillna(0)
# Wides and no-balls are charged to the bowler along with runs off the bat;
# byes, leg-byes and penalty runs are not.
runs_charged = deliveries['runs_off_bat'] + wides_filled + noballs_filled
bowler_runs_conceded = runs_charged.groupby(deliveries['bowler']).sum().reset_index()
bowler_runs_conceded.columns = ['player_id', 'runs_conceded']
# Only legal deliveries (neither a wide nor a no-ball) count towards the over.
legal_delivery = (wides_filled == 0) & (noballs_filled == 0)
balls_bowled = deliveries[legal_delivery].groupby('bowler').size().reset_index(name='balls_bowled')
# Inner join: bowlers with no legal delivery are dropped, so balls_bowled >= 1.
bowler_stats = pd.merge(bowler_runs_conceded, balls_bowled, left_on='player_id', right_on='bowler')
# Economy rate = runs conceded per over (six legal balls).
bowler_stats['economy_rate'] = bowler_stats['runs_conceded'] / (bowler_stats['balls_bowled'] / 6)


In [58]:
# Join the batting and bowling tables on player_id. The outer join keeps
# players who appear in only one of the two tables.
player_stats = player_stats.merge(
    bowler_stats,
    how='outer',
    on='player_id',
    suffixes=('_batting', '_bowling'),
)


In [59]:
# Tag each stat column with its discipline so the names stay unambiguous once
# the table is merged onto the ball-by-ball data.
batting_columns = ('total_runs', 'balls_faced', 'strike_rate')
bowling_columns = ('runs_conceded', 'balls_bowled', 'economy_rate')
stat_column_names = {name: f'{name}_batting' for name in batting_columns}
stat_column_names.update({name: f'{name}_bowling' for name in bowling_columns})
player_stats_renamed = player_stats.rename(columns=stat_column_names)

In [60]:
# For every line-up slot: bring that slot's player name onto each delivery row,
# then look the player up in the stats table. Stat columns that already exist
# on the left keep their name and the incoming ones get a `_<slot>` suffix, so
# the first slot processed ends up with the unsuffixed stat columns.
player_slots = [name for name in match_players.columns if name != 'match_id']
for player_slot in player_slots:
    merged_data = (
        merged_data
        .merge(match_players[['match_id', player_slot]], on='match_id', how='left')
        .merge(
            player_stats_renamed,
            left_on=player_slot,
            right_on='player_id',
            how='left',
            suffixes=('', f'_{player_slot}'),
        )
        # The lookup key is redundant with the slot column and would clash
        # on the next iteration.
        .drop(columns=['player_id'])
    )


In [61]:
# Add match outcome as the target variable: 1 when team1 won, otherwise 0.
# NOTE(review): a match with no recorded winner is also labelled 0, i.e. it is
# indistinguishable from a team1 defeat -- confirm that this is intended.
# Vectorised comparison instead of a row-wise apply: same labels, and it also
# works on an empty frame. The object cast keeps the comparison valid even if
# the two columns were stored as categoricals with different category sets.
team1_won = matches['winner'].astype(object) == matches['team1'].astype(object)
matches['result'] = team1_won.astype(int)
# Drop any `result` left over from a previous run of this cell so that
# re-running it does not produce result_x / result_y duplicates.
merged_data = pd.merge(
    merged_data.drop(columns=['result'], errors='ignore'),
    matches[['match_id', 'result']],
    on='match_id',
)


In [62]:
# Columns fed to the model: match-state features followed by the unsuffixed
# player-stat columns.
features = [
    'cumulative_runs',
    'cumulative_wickets',
    'current_run_rate',
    'required_run_rate',
    'total_runs_batting',
    'strike_rate_batting',
    'runs_conceded_bowling',
    'economy_rate_bowling',
]
# Name of the label column.
target = 'result'

In [63]:
# Keep only the model inputs followed by the label column.
model_columns = [*features, target]
final_dataset = merged_data[model_columns]

In [65]:
# Write the final dataset to CSV in the working directory, without the index.
csv_output_path = 'final_dataset_with_player_stats.csv'
final_dataset.to_csv(csv_output_path, index=False)

In [66]:
# Write the same dataset as a Parquet file in the working directory, without the index.
parquet_output_path = 'Parquet_final_dataset_with_player_stats.parquet'
final_dataset.to_parquet(parquet_output_path, index=False)
