In [44]:
import numpy as np
import pandas as pd

In [45]:
from pathlib import Path
from ruamel.yaml import YAML
import json
import datetime as dt

In [46]:
base_dir = Path('datasets/ipl')

In [47]:
yaml_file_list = list(Path(base_dir).glob('*.yaml'))

In [48]:
len(yaml_file_list)

845

In [49]:
def myconverter(o):
    if isinstance(o, dt.date):
        return o.__str__()

In [50]:
def create_data_frame_from_innings_dict(datafile_json, inning_number, innings_dict):
  '''
  This function takes a json data file, the inning_number and the innings_dict and converts it into a dataframe
  INPUT - datafile_json, inning_number, innings_dict 
  OUTPUT - DataFrame
  '''
  df_innings=pd.DataFrame({'col':datafile_json['innings'][inning_number][innings_dict]['deliveries']})
  df_T = df_innings['col'].apply(pd.Series).T
  df_T['main'] = df_T[df_T.columns].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
  df_next = df_T.filter(['main'])
  df_next['main'] = df_next['main'].apply(lambda x : eval(x)) 
  df_next_2 = df_next['main'].apply(pd.Series)
  df_next_try = df_next_2.filter(['batsman','bowler','non_striker','runs','wicket','extras'])

  df_runs = df_next_try[['runs']]
  df_run_details = df_runs['runs'].apply(pd.Series)
  
  if 'wicket' in df_next_try.columns:
    df_wickets = df_next_try[['wicket']]
    df_wicket_details = df_wickets['wicket'].apply(pd.Series)
  else:
    df_wicket_details = pd.DataFrame(columns=['fielders', 'kind', 'player_out'])   
         
  if 'extras' in df_next_try.columns:
    df_extras = df_next_try[['extras']]
    df_extras_details = df_extras['extras'].apply(pd.Series)
  else:
    df_extras_details = pd.DataFrame(columns=['0', 'wides', 'legbyes'])
    
  drop_column_list = ['runs']

  if 'wicket' in df_next_try.columns:
    drop_column_list.append('wicket')

  if 'extras' in df_next_try.columns:
    drop_column_list.append('extras')
  
  df = df_next_try.join(df_run_details,rsuffix='_runs').join(df_wicket_details,rsuffix='_wickets').join(df_extras_details,rsuffix='_extras').drop(columns=drop_column_list)
  df['batting_team'] = datafile_json['innings'][inning_number][innings_dict]['team']
  return df

In [51]:
def create_data_frame_from_info_dict(datafile_json):
  '''
  This function creates a match_summary dataframe given an input json datafile
  '''
  info = pd.DataFrame({'info':datafile_json['info']}).T
  outcome = info['outcome'].apply(pd.Series)
  toss = info['toss'].apply(pd.Series)
  match_summary = info.join(outcome,rsuffix='_outcome').join(toss, rsuffix='_toss').drop(columns=['outcome','toss'])
  return match_summary

In [52]:
counter = 0
info_list = []
ball_by_ball_details = []
for each_file in range(len(yaml_file_list)):
  in_file = yaml_file_list[each_file]
  out_file = 'output.json'


  tyaml = YAML(typ='safe')
  with open(in_file) as fpi:
      data_yaml = tyaml.load(fpi)
  with open(out_file, 'w') as fpo:
      json.dump(data_yaml, fpo, default=myconverter)
    
  with open(out_file, 'r') as f:
      json_file = json.load(f)
      
      
  if len(json_file['innings']) == 2:
    d1 = create_data_frame_from_innings_dict(datafile_json=json_file, inning_number=0, innings_dict='1st innings')
    d2 = create_data_frame_from_innings_dict(datafile_json=json_file, inning_number=1, innings_dict='2nd innings')
    df_match = pd.concat([d1, d2], keys=['inning_1', 'inning_2']).reset_index()
  elif len(json_file['innings']) == 1:
    d1 = create_data_frame_from_innings_dict(datafile_json=json_file, inning_number=0, innings_dict='1st innings')
    d2 = pd.DataFrame()
    df_match = pd.concat([d1, d2], keys=['inning_1', 'inning_2']).reset_index()
  elif len(json_file['innings']) == 4:
    d1 = create_data_frame_from_innings_dict(datafile_json=json_file, inning_number=0, innings_dict='1st innings')
    d2 = create_data_frame_from_innings_dict(datafile_json=json_file, inning_number=1, innings_dict='2nd innings')    
    df_match = pd.concat([d1, d2], keys=['inning_1', 'inning_2']).reset_index()
  else:
    df_match = pd.DataFrame()
  
  df_match['match_id'] = yaml_file_list[each_file].stem
  match_info_summary = create_data_frame_from_info_dict(datafile_json=json_file)
  
  match_info_summary['match_id'] = yaml_file_list[each_file].stem

  
  ball_by_ball_details.append(df_match)
  info_list.append(match_info_summary)
  counter += 1
  if counter % 50 == 0:
    print(counter,' files completed')

50  files completed
100  files completed
150  files completed
200  files completed
250  files completed
300  files completed
350  files completed
400  files completed
450  files completed
500  files completed
550  files completed
600  files completed
650  files completed
700  files completed
750  files completed
800  files completed


In [53]:
df_match_summary = pd.concat(info_list)
df_match_summary = df_match_summary.set_index('match_id')

In [54]:
df_match_summary.head()

Unnamed: 0_level_0,city,competition,dates,gender,match_type,overs,player_of_match,teams,umpires,venue,by,winner,decision,winner_toss,neutral_venue,method,result,eliminator
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
548334,Mumbai,IPL,[2012-04-22],male,T20,20,[SE Marsh],"[Mumbai Indians, Kings XI Punjab]","[S Ravi, RJ Tucker]",Wankhede Stadium,{'wickets': 6},Kings XI Punjab,bat,Mumbai Indians,,,,
548327,Bangalore,IPL,[2012-04-17],male,T20,20,[CH Gayle],"[Royal Challengers Bangalore, Pune Warriors]","[S Asnani, S Das]",M Chinnaswamy Stadium,{'wickets': 6},Royal Challengers Bangalore,bat,Pune Warriors,,,,
1254082,Ahmedabad,IPL,[2021-04-29],male,T20,20,[PP Shaw],"[Kolkata Knight Riders, Delhi Capitals]","[AK Chaudhary, YC Barde]","Narendra Modi Stadium, Ahmedabad",{'wickets': 7},Delhi Capitals,field,Delhi Capitals,,,,
598052,Chandigarh,IPL,[2013-05-09],male,T20,20,[KK Cooper],"[Kings XI Punjab, Rajasthan Royals]","[HDPK Dharmasena, S Ravi]","Punjab Cricket Association Stadium, Mohali",{'wickets': 8},Rajasthan Royals,field,Rajasthan Royals,,,,
392236,Centurion,IPL,[2009-05-21],male,T20,20,[MK Pandey],"[Royal Challengers Bangalore, Deccan Chargers]","[IL Howell, S Ravi]",SuperSport Park,{'runs': 12},Royal Challengers Bangalore,bat,Royal Challengers Bangalore,1.0,,,


In [55]:
df_details = pd.concat(ball_by_ball_details)
df_details.head()

Unnamed: 0,level_0,level_1,batsman,bowler,non_striker,0,batsman_runs,extras_runs,total,0_wickets,...,legbyes,wides,batting_team,0_extras,match_id,byes,noballs,non_boundary,extras,penalty
0,inning_1,0.1,JEC Franklin,P Kumar,SR Tendulkar,,0.0,0.0,0.0,,...,,,Mumbai Indians,,548334,,,,,
1,inning_1,0.2,JEC Franklin,P Kumar,SR Tendulkar,,0.0,0.0,0.0,,...,,,Mumbai Indians,,548334,,,,,
2,inning_1,0.3,JEC Franklin,P Kumar,SR Tendulkar,,0.0,0.0,0.0,,...,,,Mumbai Indians,,548334,,,,,
3,inning_1,0.4,JEC Franklin,P Kumar,SR Tendulkar,,0.0,0.0,0.0,,...,,,Mumbai Indians,,548334,,,,,
4,inning_1,0.5,JEC Franklin,P Kumar,SR Tendulkar,,0.0,0.0,0.0,,...,,,Mumbai Indians,,548334,,,,,


In [56]:
df_details['wides'] = df_details['wides'].astype('float')
df_details['noballs'] = df_details['noballs'].astype('float')
df_details['legbyes'] = df_details['legbyes'].astype('float')


In [57]:
df_details.iloc[:,:].isna().mean()

level_0         0.000000
level_1         0.000000
batsman         0.000055
bowler          0.000055
non_striker     0.000055
0               1.000000
batsman_runs    0.000055
extras_runs     0.003256
total           0.000055
0_wickets       1.000000
fielders        0.965073
kind            0.950931
player_out      0.950931
0               1.000000
legbyes         0.984060
wides           0.969707
batting_team    0.000000
0_extras        1.000000
match_id        0.000000
byes            0.997373
noballs         0.995975
non_boundary    0.999910
extras          0.996799
penalty         0.999990
dtype: float64

In [58]:
df_details.drop(columns=['0','0_extras','0_wickets'], inplace=True, axis=1)
df_details.drop(columns=[0], inplace=True, axis=1)

In [59]:
df_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200247 entries, 0 to 245
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   level_0       200247 non-null  object 
 1   level_1       200247 non-null  object 
 2   batsman       200236 non-null  object 
 3   bowler        200236 non-null  object 
 4   non_striker   200236 non-null  object 
 5   batsman_runs  200236 non-null  float64
 6   extras_runs   199595 non-null  float64
 7   total         200236 non-null  float64
 8   fielders      6994 non-null    object 
 9   kind          9826 non-null    object 
 10  player_out    9826 non-null    object 
 11  legbyes       3192 non-null    float64
 12  wides         6066 non-null    float64
 13  batting_team  200247 non-null  object 
 14  match_id      200247 non-null  object 
 15  byes          526 non-null     float64
 16  noballs       806 non-null     float64
 17  non_boundary  18 non-null      float64
 18  extras 

In [60]:
yaml_match_id_list = [i.stem for i in yaml_file_list]

In [61]:
unmatched_yaml = list(set(yaml_match_id_list) - set(df_details.match_id.unique()))

In [62]:
unmatched_yaml = [i+'.yaml' for i in unmatched_yaml]

In [63]:
unmatched_yaml

['1216517.yaml']

In [64]:
len(json_file['innings'])

2

In [65]:
df_outcome = df_match_summary[['by']]
df_outcome_details = df_outcome['by'].apply(pd.Series).drop(columns=[0])

In [66]:
df_ipl_summary = df_match_summary.join(df_outcome_details)

In [67]:
df_ipl_summary.drop(columns=['by'],axis=1,inplace=True)

In [68]:
df_ipl_summary=df_ipl_summary.rename(index=str, columns={"runs": "outcome_by_runs", "wickets": "outcome_by_wickets", "overs": "#_of_overs"})
df_ipl_summary['match_date'] = df_ipl_summary['dates'].str[0]
df_ipl_summary['match_date'] = pd.to_datetime(df_ipl_summary['match_date'])

df_ipl_summary['season'] = df_ipl_summary['match_date'].dt.year
df_ipl_summary.drop(columns=['dates'],axis=1,inplace=True)
df_ipl_summary.head()

Unnamed: 0_level_0,city,competition,gender,match_type,#_of_overs,player_of_match,teams,umpires,venue,winner,decision,winner_toss,neutral_venue,method,result,eliminator,outcome_by_runs,outcome_by_wickets,match_date,season
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
548334,Mumbai,IPL,male,T20,20,[SE Marsh],"[Mumbai Indians, Kings XI Punjab]","[S Ravi, RJ Tucker]",Wankhede Stadium,Kings XI Punjab,bat,Mumbai Indians,,,,,,6.0,2012-04-22,2012
548327,Bangalore,IPL,male,T20,20,[CH Gayle],"[Royal Challengers Bangalore, Pune Warriors]","[S Asnani, S Das]",M Chinnaswamy Stadium,Royal Challengers Bangalore,bat,Pune Warriors,,,,,,6.0,2012-04-17,2012
1254082,Ahmedabad,IPL,male,T20,20,[PP Shaw],"[Kolkata Knight Riders, Delhi Capitals]","[AK Chaudhary, YC Barde]","Narendra Modi Stadium, Ahmedabad",Delhi Capitals,field,Delhi Capitals,,,,,,7.0,2021-04-29,2021
598052,Chandigarh,IPL,male,T20,20,[KK Cooper],"[Kings XI Punjab, Rajasthan Royals]","[HDPK Dharmasena, S Ravi]","Punjab Cricket Association Stadium, Mohali",Rajasthan Royals,field,Rajasthan Royals,,,,,,8.0,2013-05-09,2013
392236,Centurion,IPL,male,T20,20,[MK Pandey],"[Royal Challengers Bangalore, Deccan Chargers]","[IL Howell, S Ravi]",SuperSport Park,Royal Challengers Bangalore,bat,Royal Challengers Bangalore,1.0,,,,12.0,,2009-05-21,2009


In [69]:
df_details=df_details.rename(index=str, columns={"level_0": "inning", "level_1": "delivery"})
df_details.head()

Unnamed: 0,inning,delivery,batsman,bowler,non_striker,batsman_runs,extras_runs,total,fielders,kind,player_out,legbyes,wides,batting_team,match_id,byes,noballs,non_boundary,extras,penalty
0,inning_1,0.1,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,
1,inning_1,0.2,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,
2,inning_1,0.3,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,
3,inning_1,0.4,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,
4,inning_1,0.5,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,


In [70]:
df_details.drop(columns=['extras'],axis=1,inplace=True)

In [71]:
df_ipl_summary[['team1','team2']] = pd.DataFrame(df_ipl_summary.teams.tolist(), index= df_ipl_summary.index)

In [72]:
df_ipl_summary.head()

Unnamed: 0_level_0,city,competition,gender,match_type,#_of_overs,player_of_match,teams,umpires,venue,winner,...,neutral_venue,method,result,eliminator,outcome_by_runs,outcome_by_wickets,match_date,season,team1,team2
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
548334,Mumbai,IPL,male,T20,20,[SE Marsh],"[Mumbai Indians, Kings XI Punjab]","[S Ravi, RJ Tucker]",Wankhede Stadium,Kings XI Punjab,...,,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab
548327,Bangalore,IPL,male,T20,20,[CH Gayle],"[Royal Challengers Bangalore, Pune Warriors]","[S Asnani, S Das]",M Chinnaswamy Stadium,Royal Challengers Bangalore,...,,,,,,6.0,2012-04-17,2012,Royal Challengers Bangalore,Pune Warriors
1254082,Ahmedabad,IPL,male,T20,20,[PP Shaw],"[Kolkata Knight Riders, Delhi Capitals]","[AK Chaudhary, YC Barde]","Narendra Modi Stadium, Ahmedabad",Delhi Capitals,...,,,,,,7.0,2021-04-29,2021,Kolkata Knight Riders,Delhi Capitals
598052,Chandigarh,IPL,male,T20,20,[KK Cooper],"[Kings XI Punjab, Rajasthan Royals]","[HDPK Dharmasena, S Ravi]","Punjab Cricket Association Stadium, Mohali",Rajasthan Royals,...,,,,,,8.0,2013-05-09,2013,Kings XI Punjab,Rajasthan Royals
392236,Centurion,IPL,male,T20,20,[MK Pandey],"[Royal Challengers Bangalore, Deccan Chargers]","[IL Howell, S Ravi]",SuperSport Park,Royal Challengers Bangalore,...,1.0,,,,12.0,,2009-05-21,2009,Royal Challengers Bangalore,Deccan Chargers


In [73]:
df_ipl_summary.drop(columns=['teams'],axis=1,inplace=True)

In [74]:
df_ipl_summary.head()

Unnamed: 0_level_0,city,competition,gender,match_type,#_of_overs,player_of_match,umpires,venue,winner,decision,...,neutral_venue,method,result,eliminator,outcome_by_runs,outcome_by_wickets,match_date,season,team1,team2
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
548334,Mumbai,IPL,male,T20,20,[SE Marsh],"[S Ravi, RJ Tucker]",Wankhede Stadium,Kings XI Punjab,bat,...,,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab
548327,Bangalore,IPL,male,T20,20,[CH Gayle],"[S Asnani, S Das]",M Chinnaswamy Stadium,Royal Challengers Bangalore,bat,...,,,,,,6.0,2012-04-17,2012,Royal Challengers Bangalore,Pune Warriors
1254082,Ahmedabad,IPL,male,T20,20,[PP Shaw],"[AK Chaudhary, YC Barde]","Narendra Modi Stadium, Ahmedabad",Delhi Capitals,field,...,,,,,,7.0,2021-04-29,2021,Kolkata Knight Riders,Delhi Capitals
598052,Chandigarh,IPL,male,T20,20,[KK Cooper],"[HDPK Dharmasena, S Ravi]","Punjab Cricket Association Stadium, Mohali",Rajasthan Royals,field,...,,,,,,8.0,2013-05-09,2013,Kings XI Punjab,Rajasthan Royals
392236,Centurion,IPL,male,T20,20,[MK Pandey],"[IL Howell, S Ravi]",SuperSport Park,Royal Challengers Bangalore,bat,...,1.0,,,,12.0,,2009-05-21,2009,Royal Challengers Bangalore,Deccan Chargers


In [75]:
df_combined = df_details.join(df_ipl_summary,on='match_id',rsuffix='_ipl_summary')
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200247 entries, 0 to 245
Data columns (total 40 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   inning              200247 non-null  object        
 1   delivery            200247 non-null  object        
 2   batsman             200236 non-null  object        
 3   bowler              200236 non-null  object        
 4   non_striker         200236 non-null  object        
 5   batsman_runs        200236 non-null  float64       
 6   extras_runs         199595 non-null  float64       
 7   total               200236 non-null  float64       
 8   fielders            6994 non-null    object        
 9   kind                9826 non-null    object        
 10  player_out          9826 non-null    object        
 11  legbyes             3192 non-null    float64       
 12  wides               6066 non-null    float64       
 13  batting_team        200247 non-null  

In [76]:
df_combined['batting_team'].head()

0    Mumbai Indians
1    Mumbai Indians
2    Mumbai Indians
3    Mumbai Indians
4    Mumbai Indians
Name: batting_team, dtype: object

In [77]:
df_combined['bowling_team'] = np.where(df_combined['team1']!= df_combined['batting_team'], df_combined['team1'], df_combined['team2'])

In [78]:
df_combined.head()

Unnamed: 0,inning,delivery,batsman,bowler,non_striker,batsman_runs,extras_runs,total,fielders,kind,...,method,result,eliminator,outcome_by_runs,outcome_by_wickets,match_date,season,team1,team2,bowling_team
0,inning_1,0.1,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,...,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab,Kings XI Punjab
1,inning_1,0.2,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,...,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab,Kings XI Punjab
2,inning_1,0.3,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,...,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab,Kings XI Punjab
3,inning_1,0.4,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,...,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab,Kings XI Punjab
4,inning_1,0.5,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,...,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab,Kings XI Punjab


In [43]:
df_combined.to_csv('datasets/details.csv')

In [79]:
df_ipl_summary.to_csv('datasets/ipl-matches.csv')