# Data Extraction

In [3]:
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

In [5]:
# Build the list of paths for every entry in the 'data' directory.
# os.listdir() returns names in arbitrary order; sorting makes the list (and
# therefore the match_id values assigned in iteration order further down)
# reproducible across runs and machines.
filenames = sorted(os.path.join('data', name) for name in os.listdir('data'))

In [6]:
# Flatten every match file into one record per delivery, then build final_df.
#
# Each file is parsed with safe_load. For every innings entry, the first key is
# used as the innings name and its value supplies 'team' and 'deliveries'.
# Every delivery record is tagged with over.ball, innings, batting_team,
# match_id and venue before being collected.
all_balls = []
counter = 0  # match_id; advances only when a file has been processed in full

for file in tqdm(filenames):
    try:
        with open(file, 'r') as f:
            data = safe_load(f)

        # Match-level context. Note: the 'venue' column is filled from the
        # 'city' field of the match info ('Unknown' when it is absent).
        match_info = data.get('info', {})
        venue = match_info.get('city', 'Unknown')

        # Stage this file's rows locally. If anything below raises, nothing is
        # added to all_balls, so a half-read match cannot leave rows behind
        # under a match_id that the next file would then reuse.
        match_balls = []

        for inning in data.get('innings', []):
            inning_name = list(inning)[0]
            inning_data = inning[inning_name]

            team = inning_data.get('team')

            # Each delivery is a mapping of over.ball -> ball details.
            for delivery_dict in inning_data.get('deliveries', []):
                for over_ball, ball_data in delivery_dict.items():
                    ball_data['over.ball'] = over_ball
                    ball_data['innings'] = inning_name
                    ball_data['batting_team'] = team
                    ball_data['match_id'] = counter
                    ball_data['venue'] = venue
                    match_balls.append(ball_data)

    except Exception as e:
        # Best effort: report the file and move on without consuming a match_id.
        print(f"Error processing {file}: {e}")
        continue

    # The whole file was processed: commit its rows and move to the next id.
    all_balls.extend(match_balls)
    counter += 1

# Nested keys (e.g. runs, wicket, extras) become dotted column names.
final_df = pd.json_normalize(all_balls)
final_df

100%|███████████████████████████████████████| 4801/4801 [10:12<00:00,  7.83it/s]


Unnamed: 0,bowler,non_striker,batsman,over.ball,innings,batting_team,match_id,venue,runs.extras,runs.total,...,wicket.kind,extras.wides,extras.byes,extras.legbyes,extras.noballs,replacements.role,replacements.match,runs.non_boundary,wicket,extras.penalty
0,C Wright,NS Dhaliwal,A Johnson,0.1,1st innings,Canada,0,Hamilton,0,0,...,,,,,,,,,,
1,C Wright,NS Dhaliwal,A Johnson,0.2,1st innings,Canada,0,Hamilton,0,1,...,,,,,,,,,,
2,C Wright,A Johnson,NS Dhaliwal,0.3,1st innings,Canada,0,Hamilton,0,1,...,,,,,,,,,,
3,C Wright,NS Dhaliwal,A Johnson,0.4,1st innings,Canada,0,Hamilton,0,2,...,,,,,,,,,,
4,C Wright,NS Dhaliwal,A Johnson,0.5,1st innings,Canada,0,Hamilton,0,0,...,caught,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085954,Panna Ghosh,HHC Gunaratne,HAM Samuddika,19.2,2nd innings,Sri Lanka,4800,Unknown,0,0,...,,,,,,,,,,
1085955,Panna Ghosh,HHC Gunaratne,HAM Samuddika,19.3,2nd innings,Sri Lanka,4800,Unknown,0,2,...,,,,,,,,,,
1085956,Panna Ghosh,HHC Gunaratne,HAM Samuddika,19.4,2nd innings,Sri Lanka,4800,Unknown,0,2,...,,,,,,,,,,
1085957,Panna Ghosh,HHC Gunaratne,HAM Samuddika,19.5,2nd innings,Sri Lanka,4800,Unknown,0,4,...,,,,,,,,,,


In [7]:
# Drop the extras / replacement / penalty detail columns that are not used in
# the rest of the notebook.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution, or a dataset in which one of these
# columns never appeared, no longer raises KeyError.
final_df = final_df.drop(
    columns=[
        'runs.extras',
        'extras.wides',
        'extras.byes',
        'extras.legbyes',
        'extras.noballs',
        'replacements.role',
        'replacements.match',
        'runs.non_boundary',
        'extras.penalty',
    ],
    errors='ignore',
)

In [12]:
# Preview the first 20 rows of the per-delivery table.
final_df.iloc[:20]

Unnamed: 0,bowler,non_striker,batsman,over.ball,innings,batting_team,match_id,venue,runs.total,runs.batsman,wicket.player_out,wicket.fielders,wicket.kind,wicket
0,C Wright,NS Dhaliwal,A Johnson,0.1,1st innings,Canada,0,Hamilton,0,0,,,,
1,C Wright,NS Dhaliwal,A Johnson,0.2,1st innings,Canada,0,Hamilton,1,1,,,,
2,C Wright,A Johnson,NS Dhaliwal,0.3,1st innings,Canada,0,Hamilton,1,1,,,,
3,C Wright,NS Dhaliwal,A Johnson,0.4,1st innings,Canada,0,Hamilton,2,2,,,,
4,C Wright,NS Dhaliwal,A Johnson,0.5,1st innings,Canada,0,Hamilton,0,0,A Johnson,[P Heron],caught,
5,C Wright,NS Dhaliwal,Pargat Singh,0.6,1st innings,Canada,0,Hamilton,1,1,,,,
6,M Manivannan,NS Dhaliwal,Pargat Singh,1.1,1st innings,Canada,0,Hamilton,1,0,,,,
7,M Manivannan,NS Dhaliwal,Pargat Singh,1.2,1st innings,Canada,0,Hamilton,0,0,,,,
8,M Manivannan,NS Dhaliwal,Pargat Singh,1.3,1st innings,Canada,0,Hamilton,1,1,,,,
9,M Manivannan,Pargat Singh,NS Dhaliwal,1.4,1st innings,Canada,0,Hamilton,1,1,,,,


In [16]:
# Keep only first-innings deliveries (as an independent copy).
# The source frame is final_df: no cell in this notebook defines a variable
# named `df`, so reading from it fails with NameError on a fresh kernel.
df_1st = final_df[final_df['innings'] == '1st innings'].copy()
df_1st.head()

Unnamed: 0,bowler,non_striker,batsman,over.ball,innings,batting_team,match_id,venue,runs.total,runs.batsman,wicket.player_out,wicket.fielders,wicket.kind,wicket
0,C Wright,NS Dhaliwal,A Johnson,0.1,1st innings,Canada,0,Hamilton,0,0,,,,
1,C Wright,NS Dhaliwal,A Johnson,0.2,1st innings,Canada,0,Hamilton,1,1,,,,
2,C Wright,A Johnson,NS Dhaliwal,0.3,1st innings,Canada,0,Hamilton,1,1,,,,
3,C Wright,NS Dhaliwal,A Johnson,0.4,1st innings,Canada,0,Hamilton,2,2,,,,
4,C Wright,NS Dhaliwal,A Johnson,0.5,1st innings,Canada,0,Hamilton,0,0,A Johnson,[P Heron],caught,


In [17]:
# Sanity check: row count, and confirmation that only one innings label is left.
print(df_1st.shape[0])
print(df_1st['innings'].unique())

585519
['1st innings']


In [19]:
# Number of delivery rows recorded per match in the first innings.
balls = (
    df_1st
    .groupby(by='match_id')
    .size()
)
balls.head(5)

match_id
0    125
1    125
2    127
3    130
4    131
dtype: int64

In [23]:
# Match ids whose first innings has between 100 and 150 delivery rows
# (both bounds inclusive).
valid_matches = balls[balls.between(100, 150)].index

In [25]:
# Restrict the first-innings deliveries to the matches selected above.
in_valid_match = df_1st['match_id'].isin(valid_matches)
df_filtered = df_1st.loc[in_valid_match].copy()