In [1]:
import pandas as pd
from joblib import Parallel, delayed
import warnings, json, ast, duckdb

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',500)

In [2]:
df = pd.read_csv('../output/json_extracted_data.csv')

In [3]:
# Select only male player matches
df = df[df['info.gender'] == 'male'][['match_id','innings','info.outcome.winner','info.teams','info.venue']].head(2)

In [4]:
df

Unnamed: 0,match_id,innings,info.outcome.winner,info.teams,info.venue
0,1,"[{'team': 'Australia', 'overs': [{'over': 0, '...",Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
1,2,"[{'team': 'Australia', 'overs': [{'over': 0, '...",Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground


In [71]:
df['innings'] = Parallel(n_jobs=-1)(delayed(ast.literal_eval)(x) for x in df['innings'])

In [72]:
test_df = df.copy()
test_df

Unnamed: 0,match_id,innings,info.outcome.winner,info.teams,info.venue
0,1,"[{'team': 'Australia', 'overs': [{'over': 0, '...",Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
1,2,"[{'team': 'Australia', 'overs': [{'over': 0, '...",Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground


In [74]:
# pd.json_normalize(
#     test_df.explode('innings').to_dict(orient='records'),
#     record_path=['innings', 'overs'],
#     meta=['innings.team', 'info.venue'],
#     errors='ignore'
# )

In [75]:
test_df2 = test_df.explode('innings')
test_df2

Unnamed: 0,match_id,innings,info.outcome.winner,info.teams,info.venue
0,1,"{'team': 'Australia', 'overs': [{'over': 0, 'd...",Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
0,1,"{'team': 'Pakistan', 'overs': [{'over': 0, 'de...",Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
1,2,"{'team': 'Australia', 'overs': [{'over': 0, 'd...",Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground
1,2,"{'team': 'Pakistan', 'overs': [{'over': 0, 'de...",Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground


In [84]:
# working logic

pd.json_normalize(
    test_df2.to_dict(orient='records'),
    record_path=['innings','overs','deliveries'],  # Adjust if deliveries are nested
    meta=['match_id','innings.team', 'info.outcome.winner', 'info.teams', 'info.venue'],
    errors='ignore'
) #[['match_id','innings.team','batter','bowler','runs.total','info.teams','info.venue']].head()

Unnamed: 0,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.wides,wickets,runs.non_boundary,review.by,review.umpire,review.batter,review.decision,extras.legbyes,extras.noballs,extras.byes,replacements.role,match_id,innings.team,info.outcome.winner,info.teams,info.venue
0,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
1,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
2,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
3,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
4,DA Warner,Mohammad Amir,TM Head,0,1,1,1.0,,,,,,,,,,,1,,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156,Umar Akmal,TM Head,Shoaib Malik,0,0,0,,,,,,,,,,,,2,,Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground
1157,Umar Akmal,TM Head,Shoaib Malik,4,0,4,,,,,,,,,,,,2,,Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground
1158,Umar Akmal,TM Head,Shoaib Malik,0,0,0,,,,,,,,,,,,2,,Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground
1159,Umar Akmal,TM Head,Shoaib Malik,4,0,4,,,,,,,,,,,,2,,Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground


In [None]:
###################################################################################

In [81]:
test_df

Unnamed: 0,match_id,innings,info.outcome.winner,info.teams,info.venue
0,1,"[{'team': 'Australia', 'overs': [{'over': 0, '...",Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
1,2,"[{'team': 'Australia', 'overs': [{'over': 0, '...",Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground


In [85]:
# Explode the 'innings' column
df_exploded = test_df.explode('innings')

# Extract 'team' separately
df_exploded['innings_team'] = df_exploded['innings'].apply(lambda x: x.get('team') if isinstance(x, dict) else None)

# Normalize the rest of the data
normalized_df = pd.json_normalize(
    df_exploded.to_dict(orient='records'),
    record_path=['innings', 'overs', 'deliveries'],
    meta=['match_id', 'innings_team', 'info.outcome.winner', 'info.teams', 'info.venue'],
    errors='ignore'
)

# Display the result
normalized_df.head()

Unnamed: 0,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.wides,wickets,runs.non_boundary,review.by,review.umpire,review.batter,review.decision,extras.legbyes,extras.noballs,extras.byes,replacements.role,match_id,innings_team,info.outcome.winner,info.teams,info.venue
0,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,Australia,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
1,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,Australia,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
2,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,Australia,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
3,DA Warner,Mohammad Amir,TM Head,0,0,0,,,,,,,,,,,,1,Australia,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"
4,DA Warner,Mohammad Amir,TM Head,0,1,1,1.0,,,,,,,,,,,1,Australia,Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba"


In [93]:
abc = test_df.explode('innings')
abc['innings_team'] = df_exploded['innings'].apply(lambda x: x.get('team') if isinstance(x, dict) else None)
abc

Unnamed: 0,match_id,innings,info.outcome.winner,info.teams,info.venue,innings_team
0,1,"{'team': 'Australia', 'overs': [{'over': 0, 'd...",Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba",Australia
0,1,"{'team': 'Pakistan', 'overs': [{'over': 0, 'de...",Australia,"['Australia', 'Pakistan']","Brisbane Cricket Ground, Woolloongabba",Pakistan
1,2,"{'team': 'Australia', 'overs': [{'over': 0, 'd...",Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground,Australia
1,2,"{'team': 'Pakistan', 'overs': [{'over': 0, 'de...",Pakistan,"['Australia', 'Pakistan']",Melbourne Cricket Ground,Pakistan


In [None]:
#########################################################################################################

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../output/deliveries_final.csv')

In [7]:
df.venue.unique()

array(['Brisbane Cricket Ground, Woolloongabba',
       'Melbourne Cricket Ground',
       'Western Australia Cricket Association Ground',
       'Sydney Cricket Ground', 'Adelaide Oval', 'Manuka Oval',
       'Hagley Oval', 'Saxton Oval', 'Eden Park', 'Seddon Park',
       'Westpac Stadium', 'Kennington Oval', 'Edgbaston',
       'Sophia Gardens', 'Sir Vivian Richards Stadium, North Sound',
       'Kensington Oval, Bridgetown', 'Shere Bangla National Stadium',
       'Zahur Ahmed Chowdhury Stadium', 'Feroz Shah Kotla',
       'Punjab Cricket Association Stadium, Mohali',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Headingley', 'The Rose Bowl', "Lord's", 'Old Trafford',
       'County Ground', 'Clontarf Cricket Club Ground',
       'Maharashtra Cricket Association Stadium', 'Barabati Stadium',
       'Eden Gardens', 'Sharjah Cricket Stadium', 'Sheikh Zayed Stadium',
       'Harare Sports Club', 'Queens Sports Club', 'Providence Stadium',
       'Rangiri Dambul