# MINOR PROJECT -- Cricket Score Predictor (using Machine Learning)
## 1) Data Extraction 

In [92]:
import numpy as np
import pandas as pd
from yaml import safe_load  ## Use to convert data from yaml format to pandas
import os  ## Use to extract the name and path of each yaml file in data file
from tqdm import tqdm
import pickle

# (i) For T20

In [26]:
filenames = []
for file in os.listdir('t20s'):
    filenames.append(os.path.join('t20s',file))

In [27]:
filenames[0:5]

['t20s/1202243.yaml',
 't20s/1127300.yaml',
 't20s/543884.yaml',
 't20s/1120291.yaml',
 't20s/682955.yaml']

In [28]:
final_df = pd.DataFrame()
counter = 1

for file in tqdm(filenames):   ## Loop to iterate through each file
    with open(file, 'r') as f:
        try:
            df = pd.json_normalize(safe_load(f)) ## json_normalizing each file and handelling yaml fils with safe_load()
            df['match_id'] = counter
            final_df = pd.concat([final_df, df], ignore_index=True)
            counter += 1
        except NotImplementedError: ##Exception handelingcfor the file in which Normalization is not implemented 
            print(f"Normalization not implemented for file: {file}")

final_df

 80%|██████████████████████████████████▌        | 1151/1433 [02:11<00:26, 10.71it/s]

Normalization not implemented for file: t20s/README.txt


100%|███████████████████████████████████████████| 1433/1433 [02:44<00:00,  8.72it/s]


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.city,info.dates,info.gender,info.match_type,info.match_type_number,info.outcome.winner,...,match_id,info.outcome.by.runs,info.neutral_venue,info.outcome.result,info.outcome.eliminator,info.outcome.method,info.bowl_out,info.outcome.bowl_out,info.supersubs.New Zealand,info.supersubs.South Africa
0,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.9,2020-01-09,1,Indore,[2020-01-07],male,T20,1026.0,India,...,1,,,,,,,,,
1,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.9,2020-07-06,1,Edinburgh,[2018-06-12],male,T20,671.0,Pakistan,...,2,48.0,,,,,,,,
2,"[{'1st innings': {'team': 'Kenya', 'deliveries...",0.9,2012-04-16,2,,[2012-02-23],male,T20,,Ireland,...,3,,,,,,,,,
3,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.9,2020-08-08,1,Abu Dhabi,[2017-10-26],male,T20,625.0,Pakistan,...,4,,,,,,,,,
4,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.9,2014-03-31,1,Chittagong,[2014-03-31],male,T20,,Sri Lanka,...,5,59.0,1.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Thailand', 'deliver...",0.9,2020-01-06,1,Bangkok,[2019-02-25],female,T20,593.0,Thailand,...,1428,87.0,,,,,,,,
1428,"[{'1st innings': {'team': 'Ireland', 'deliveri...",0.9,2020-03-30,1,Al Amarat,[2019-02-13],male,T20,740.0,Ireland,...,1429,15.0,,,,,,,,
1429,"[{'1st innings': {'team': 'West Indies', 'deli...",0.9,2020-06-27,1,Trinidad,[2018-10-06],female,T20,508.0,South Africa,...,1430,,,,,,,,,
1430,"[{'1st innings': {'team': 'India', 'deliveries...",0.9,2019-10-11,1,Surat,[2019-10-01],female,T20,772.0,India,...,1431,51.0,,,,,,,,


In [29]:
backup = final_df.copy() ##Creating a backup of converted data

In [30]:
print(final_df.columns)


Index(['innings', 'meta.data_version', 'meta.created', 'meta.revision',
       'info.city', 'info.dates', 'info.gender', 'info.match_type',
       'info.match_type_number', 'info.outcome.winner',
       'info.outcome.by.wickets', 'info.overs', 'info.player_of_match',
       'info.teams', 'info.toss.decision', 'info.toss.winner', 'info.umpires',
       'info.venue', 'match_id', 'info.outcome.by.runs', 'info.neutral_venue',
       'info.outcome.result', 'info.outcome.eliminator', 'info.outcome.method',
       'info.bowl_out', 'info.outcome.bowl_out', 'info.supersubs.New Zealand',
       'info.supersubs.South Africa'],
      dtype='object')


In [31]:
## removing unnecessary columns from the data frame

final_df.drop(columns=[
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.outcome.bowl_out',
    'info.bowl_out',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
    'info.outcome.by.runs',
    'info.outcome.by.wickets'
],inplace=True)

In [32]:
final_df


Unnamed: 0,innings,info.city,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Indore,[2020-01-07],male,T20,India,20,[NA Saini],"[India, Sri Lanka]",field,India,"[C Shamshuddin, AK Chaudhary]",Holkar Cricket Stadium,1
1,"[{'1st innings': {'team': 'Pakistan', 'deliver...",Edinburgh,[2018-06-12],male,T20,Pakistan,20,[Sarfraz Ahmed],"[Pakistan, Scotland]",bat,Pakistan,"[AJT Dowdalls, DA Haggo]","Grange Cricket Club, Raeburn Place",2
2,"[{'1st innings': {'team': 'Kenya', 'deliveries...",,[2012-02-23],male,T20,Ireland,20,[PR Stirling],"[Kenya, Ireland]",bat,Kenya,"[S George, D Odhiambo]",Mombasa Sports Club Ground,3
3,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Abu Dhabi,[2017-10-26],male,T20,Pakistan,20,[Usman Shinwari],"[Sri Lanka, Pakistan]",field,Pakistan,"[Shozab Raza, Ahsan Raza]",Sheikh Zayed Stadium,4
4,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Chittagong,[2014-03-31],male,T20,Sri Lanka,20,[HMRKB Herath],"[New Zealand, Sri Lanka]",field,New Zealand,"[Aleem Dar, RJ Tucker]",Zahur Ahmed Chowdhury Stadium,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Thailand', 'deliver...",Bangkok,[2019-02-25],female,T20,Thailand,20,[N Chaiwai],"[Thailand, Malaysia]",field,Malaysia,"[Sharfuddoula, DN Subedi]",Terdthai Cricket Ground,1428
1428,"[{'1st innings': {'team': 'Ireland', 'deliveri...",Al Amarat,[2019-02-13],male,T20,Ireland,20,[PR Stirling],"[Oman, Ireland]",field,Oman,"[Rahul Asher, Ahsan Raza]",Al Amerat Cricket Ground Oman Cricket (Ministr...,1429
1429,"[{'1st innings': {'team': 'West Indies', 'deli...",Trinidad,[2018-10-06],female,T20,South Africa,20,[L Lee],"[West Indies, South Africa]",field,South Africa,"[JS Wilson, JM Williams]","Brian Lara Stadium, Tarouba",1430
1430,"[{'1st innings': {'team': 'India', 'deliveries...",Surat,[2019-10-01],female,T20,India,20,[Poonam Yadav],"[India, South Africa]",field,South Africa,"[Chirra Ravikanthreddy, YC Barde]",Lalabhai Contractor Stadium,1431


In [42]:
print(final_df.columns)

Index(['innings', 'info.city', 'info.dates', 'info.gender', 'info.match_type',
       'info.outcome.winner', 'info.overs', 'info.player_of_match',
       'info.teams', 'info.toss.decision', 'info.toss.winner', 'info.umpires',
       'info.venue', 'match_id'],
      dtype='object')


In [63]:
## Findings number of matches played by women and man
final_df['info.gender'].value_counts()

info.gender
male      966
female    466
Name: count, dtype: int64

In [64]:
## Removing unnecessary data (women matches)
final_df = final_df[final_df['info.gender'] == 'male']
final_df.drop(columns=['info.gender'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.gender'],inplace=True)


Unnamed: 0,innings,info.city,info.dates,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Indore,[2020-01-07],T20,India,20,[NA Saini],"[India, Sri Lanka]",field,India,"[C Shamshuddin, AK Chaudhary]",Holkar Cricket Stadium,1
1,"[{'1st innings': {'team': 'Pakistan', 'deliver...",Edinburgh,[2018-06-12],T20,Pakistan,20,[Sarfraz Ahmed],"[Pakistan, Scotland]",bat,Pakistan,"[AJT Dowdalls, DA Haggo]","Grange Cricket Club, Raeburn Place",2
2,"[{'1st innings': {'team': 'Kenya', 'deliveries...",,[2012-02-23],T20,Ireland,20,[PR Stirling],"[Kenya, Ireland]",bat,Kenya,"[S George, D Odhiambo]",Mombasa Sports Club Ground,3
3,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Abu Dhabi,[2017-10-26],T20,Pakistan,20,[Usman Shinwari],"[Sri Lanka, Pakistan]",field,Pakistan,"[Shozab Raza, Ahsan Raza]",Sheikh Zayed Stadium,4
4,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Chittagong,[2014-03-31],T20,Sri Lanka,20,[HMRKB Herath],"[New Zealand, Sri Lanka]",field,New Zealand,"[Aleem Dar, RJ Tucker]",Zahur Ahmed Chowdhury Stadium,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,"[{'1st innings': {'team': 'Scotland', 'deliver...",Hong Kong,[2016-01-31],T20,Scotland,20,[KJ Coetzer],"[Hong Kong, Scotland]",field,Hong Kong,"[Sarika Prasad, Tabarak Dar]","Mission Road Ground, Mong Kok",1420
1421,[{'1st innings': {'team': 'United Arab Emirate...,Abu Dhabi,[2016-02-16],T20,United Arab Emirates,20,,"[United Arab Emirates, Ireland]",bat,United Arab Emirates,"[Akbar Ali, Iftikhar Ali]",Sheikh Zayed Stadium,1422
1422,"[{'1st innings': {'team': 'Singapore', 'delive...",Singapore,[2019-07-22],T20,Singapore,20,[Janak Prakash],"[Singapore, Qatar]",bat,Singapore,"[V Kalidas, BB Pradhan]",Indian Association Ground,1423
1428,"[{'1st innings': {'team': 'Ireland', 'deliveri...",Al Amarat,[2019-02-13],T20,Ireland,20,[PR Stirling],"[Oman, Ireland]",field,Oman,"[Rahul Asher, Ahsan Raza]",Al Amerat Cricket Ground Oman Cricket (Ministr...,1429


In [65]:
## further checking data for non t20 matches
final_df['info.match_type'].value_counts()

info.match_type
T20    966
Name: count, dtype: int64

In [66]:
## second level of checking by checking number of 20 over matches
final_df['info.overs'].value_counts()

info.overs
20    963
50      3
Name: count, dtype: int64

In [67]:
## Removing 50 over matches
final_df = final_df[final_df['info.overs'] == 20]
final_df.drop(columns=['info.overs','info.match_type'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.overs','info.match_type'],inplace=True)


Unnamed: 0,innings,info.city,info.dates,info.outcome.winner,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Indore,[2020-01-07],India,[NA Saini],"[India, Sri Lanka]",field,India,"[C Shamshuddin, AK Chaudhary]",Holkar Cricket Stadium,1
1,"[{'1st innings': {'team': 'Pakistan', 'deliver...",Edinburgh,[2018-06-12],Pakistan,[Sarfraz Ahmed],"[Pakistan, Scotland]",bat,Pakistan,"[AJT Dowdalls, DA Haggo]","Grange Cricket Club, Raeburn Place",2
2,"[{'1st innings': {'team': 'Kenya', 'deliveries...",,[2012-02-23],Ireland,[PR Stirling],"[Kenya, Ireland]",bat,Kenya,"[S George, D Odhiambo]",Mombasa Sports Club Ground,3
3,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Abu Dhabi,[2017-10-26],Pakistan,[Usman Shinwari],"[Sri Lanka, Pakistan]",field,Pakistan,"[Shozab Raza, Ahsan Raza]",Sheikh Zayed Stadium,4
4,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",Chittagong,[2014-03-31],Sri Lanka,[HMRKB Herath],"[New Zealand, Sri Lanka]",field,New Zealand,"[Aleem Dar, RJ Tucker]",Zahur Ahmed Chowdhury Stadium,5
...,...,...,...,...,...,...,...,...,...,...,...
1419,"[{'1st innings': {'team': 'Scotland', 'deliver...",Hong Kong,[2016-01-31],Scotland,[KJ Coetzer],"[Hong Kong, Scotland]",field,Hong Kong,"[Sarika Prasad, Tabarak Dar]","Mission Road Ground, Mong Kok",1420
1421,[{'1st innings': {'team': 'United Arab Emirate...,Abu Dhabi,[2016-02-16],United Arab Emirates,,"[United Arab Emirates, Ireland]",bat,United Arab Emirates,"[Akbar Ali, Iftikhar Ali]",Sheikh Zayed Stadium,1422
1422,"[{'1st innings': {'team': 'Singapore', 'delive...",Singapore,[2019-07-22],Singapore,[Janak Prakash],"[Singapore, Qatar]",bat,Singapore,"[V Kalidas, BB Pradhan]",Indian Association Ground,1423
1428,"[{'1st innings': {'team': 'Ireland', 'deliveri...",Al Amarat,[2019-02-13],Ireland,[PR Stirling],"[Oman, Ireland]",field,Oman,"[Rahul Asher, Ahsan Raza]",Al Amerat Cricket Ground Oman Cricket (Ministr...,1429


In [68]:
## To prevent data loss pickel the data in a pkl file
pickle.dump(final_df,open('dataset_level1.pkl','wb'))

In [69]:
matches = pickle.load(open('dataset_level1.pkl','rb'))
matches.iloc[0]['innings'][0]['1st innings']['deliveries']


[{0.1: {'non_striker': 'WIA Fernando',
   'bowler': 'JJ Bumrah',
   'runs': {'extras': 2, 'total': 2, 'batsman': 0},
   'extras': {'wides': 2},
   'batsman': 'MD Gunathilaka'}},
 {0.2: {'non_striker': 'MD Gunathilaka',
   'bowler': 'JJ Bumrah',
   'runs': {'extras': 0, 'total': 4, 'batsman': 4},
   'batsman': 'WIA Fernando'}},
 {0.3: {'non_striker': 'MD Gunathilaka',
   'bowler': 'JJ Bumrah',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'WIA Fernando'}},
 {0.4: {'non_striker': 'MD Gunathilaka',
   'bowler': 'JJ Bumrah',
   'runs': {'extras': 0, 'total': 1, 'batsman': 1},
   'batsman': 'WIA Fernando'}},
 {0.5: {'non_striker': 'WIA Fernando',
   'bowler': 'JJ Bumrah',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'MD Gunathilaka'}},
 {0.6: {'non_striker': 'WIA Fernando',
   'bowler': 'JJ Bumrah',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'MD Gunathilaka'}},
 {0.7: {'non_striker': 'WIA Fernando',
   'bowler': 'JJ Bumrah',
  

In [70]:
## Extracting ball by ball data for first innings for each match
delivery_data = []  # List to store data for appending

for index, row in matches.iterrows():
    match_id = index  # Using the index as match_id
    first_innings = row['innings'][0]['1st innings']
    
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    city = []
    venue = []
    
    for delivery in first_innings['deliveries']:
        for ball, ball_info in delivery.items():
            ball_of_match.append(ball)
            batsman.append(ball_info['batsman'])
            bowler.append(ball_info['bowler'])
            runs.append(ball_info['runs']['total'])
            player_of_dismissed.append(ball_info['wicket']['player_out'] if 'wicket' in ball_info else None)
            teams.append(row['info.teams'])
            batting_team.append(first_innings['team'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
    
    loop_df = pd.DataFrame({
        'match_id': match_id,
        'teams': teams,
        'batting_team': batting_team,
        'ball': ball_of_match,
        'batsman': batsman,
        'bowler': bowler,
        'runs': runs,
        'player_dismissed': player_of_dismissed,
        'city': city,
        'venue': venue
    })
    
    delivery_data.append(loop_df)

# Create DataFrame from the list of data
delivery_df = pd.concat(delivery_data, ignore_index=True)


In [71]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,0,"[India, Sri Lanka]",Sri Lanka,0.1,MD Gunathilaka,JJ Bumrah,2,,Indore,Holkar Cricket Stadium
1,0,"[India, Sri Lanka]",Sri Lanka,0.2,WIA Fernando,JJ Bumrah,4,,Indore,Holkar Cricket Stadium
2,0,"[India, Sri Lanka]",Sri Lanka,0.3,WIA Fernando,JJ Bumrah,0,,Indore,Holkar Cricket Stadium
3,0,"[India, Sri Lanka]",Sri Lanka,0.4,WIA Fernando,JJ Bumrah,1,,Indore,Holkar Cricket Stadium
4,0,"[India, Sri Lanka]",Sri Lanka,0.5,MD Gunathilaka,JJ Bumrah,0,,Indore,Holkar Cricket Stadium
...,...,...,...,...,...,...,...,...,...,...
116542,1431,"[New Zealand, Pakistan]",New Zealand,17.5,IG Butler,Umar Gul,1,,London,Kennington Oval
116543,1431,"[New Zealand, Pakistan]",New Zealand,17.6,DL Vettori,Umar Gul,1,,London,Kennington Oval
116544,1431,"[New Zealand, Pakistan]",New Zealand,18.1,DL Vettori,Abdul Razzaq,4,,London,Kennington Oval
116545,1431,"[New Zealand, Pakistan]",New Zealand,18.2,DL Vettori,Abdul Razzaq,0,,London,Kennington Oval


In [72]:
'''
The required columns for our model are -: 
batting_team, bowling_team, current_score,wickets_left, crr, city, balls_left and last_five.
'''

# Extracting bowling_team
def bowl(row):
    for team in row['teams']:
        if team != row['batting_team']:
            return team



In [73]:
delivery_df['bowling_team'] = delivery_df.apply(bowl,axis=1)

In [74]:
delivery_df


Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,0,"[India, Sri Lanka]",Sri Lanka,0.1,MD Gunathilaka,JJ Bumrah,2,,Indore,Holkar Cricket Stadium,India
1,0,"[India, Sri Lanka]",Sri Lanka,0.2,WIA Fernando,JJ Bumrah,4,,Indore,Holkar Cricket Stadium,India
2,0,"[India, Sri Lanka]",Sri Lanka,0.3,WIA Fernando,JJ Bumrah,0,,Indore,Holkar Cricket Stadium,India
3,0,"[India, Sri Lanka]",Sri Lanka,0.4,WIA Fernando,JJ Bumrah,1,,Indore,Holkar Cricket Stadium,India
4,0,"[India, Sri Lanka]",Sri Lanka,0.5,MD Gunathilaka,JJ Bumrah,0,,Indore,Holkar Cricket Stadium,India
...,...,...,...,...,...,...,...,...,...,...,...
116542,1431,"[New Zealand, Pakistan]",New Zealand,17.5,IG Butler,Umar Gul,1,,London,Kennington Oval,Pakistan
116543,1431,"[New Zealand, Pakistan]",New Zealand,17.6,DL Vettori,Umar Gul,1,,London,Kennington Oval,Pakistan
116544,1431,"[New Zealand, Pakistan]",New Zealand,18.1,DL Vettori,Abdul Razzaq,4,,London,Kennington Oval,Pakistan
116545,1431,"[New Zealand, Pakistan]",New Zealand,18.2,DL Vettori,Abdul Razzaq,0,,London,Kennington Oval,Pakistan


In [75]:
## Dropping teams column as now its unnecessary
delivery_df.drop(columns=['teams'],inplace=True)

In [76]:
## Finding number of unique teams.
delivery_df['batting_team'].unique()

array(['Sri Lanka', 'Pakistan', 'Kenya', 'Zimbabwe', 'Canada',
       'Bangladesh', 'West Indies', 'Ireland', 'Afghanistan', 'Australia',
       'Papua New Guinea', 'South Africa', 'Hong Kong', 'England',
       'New Zealand', 'India', 'Cayman Islands', 'United Arab Emirates',
       'Nepal', 'Singapore', 'Netherlands', 'Vanuatu', 'Belgium', 'Italy',
       'Malaysia', 'Uganda', 'Jersey', 'Oman', 'Kuwait', 'Nigeria',
       'Guernsey', 'Scotland', 'Thailand', 'Botswana', 'Denmark',
       'United States of America', 'Bermuda', 'Gibraltar', 'Qatar',
       'Namibia', 'Maldives', 'Norway', 'Isle of Man', 'Germany',
       'Bulgaria', 'Ghana', 'Spain', 'Portugal', 'Bhutan', 'Iran',
       'Romania', 'Philippines'], dtype=object)

In [77]:
## As we know that most of the above teams have very less data which makes model training difficult.
## So we are taking only top 10 teams
teams = [
    'Australia',
    'India',
    'Bangladesh',
    'New Zealand',
    'South Africa',
    'England',
    'West Indies',
    'Afghanistan',
    'Pakistan',
    'Sri Lanka'    
]


In [78]:
delivery_df = delivery_df[delivery_df['batting_team'].isin(teams)]
delivery_df = delivery_df[delivery_df['bowling_team'].isin(teams)]


In [79]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,0,Sri Lanka,0.1,MD Gunathilaka,JJ Bumrah,2,,Indore,Holkar Cricket Stadium,India
1,0,Sri Lanka,0.2,WIA Fernando,JJ Bumrah,4,,Indore,Holkar Cricket Stadium,India
2,0,Sri Lanka,0.3,WIA Fernando,JJ Bumrah,0,,Indore,Holkar Cricket Stadium,India
3,0,Sri Lanka,0.4,WIA Fernando,JJ Bumrah,1,,Indore,Holkar Cricket Stadium,India
4,0,Sri Lanka,0.5,MD Gunathilaka,JJ Bumrah,0,,Indore,Holkar Cricket Stadium,India
...,...,...,...,...,...,...,...,...,...,...
116542,1431,New Zealand,17.5,IG Butler,Umar Gul,1,,London,Kennington Oval,Pakistan
116543,1431,New Zealand,17.6,DL Vettori,Umar Gul,1,,London,Kennington Oval,Pakistan
116544,1431,New Zealand,18.1,DL Vettori,Abdul Razzaq,4,,London,Kennington Oval,Pakistan
116545,1431,New Zealand,18.2,DL Vettori,Abdul Razzaq,0,,London,Kennington Oval,Pakistan


In [80]:
## Creating a new data frame to use for modeling 
output = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [81]:
output

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,0,Sri Lanka,India,0.1,2,,Indore,Holkar Cricket Stadium
1,0,Sri Lanka,India,0.2,4,,Indore,Holkar Cricket Stadium
2,0,Sri Lanka,India,0.3,0,,Indore,Holkar Cricket Stadium
3,0,Sri Lanka,India,0.4,1,,Indore,Holkar Cricket Stadium
4,0,Sri Lanka,India,0.5,0,,Indore,Holkar Cricket Stadium
...,...,...,...,...,...,...,...,...
116542,1431,New Zealand,Pakistan,17.5,1,,London,Kennington Oval
116543,1431,New Zealand,Pakistan,17.6,1,,London,Kennington Oval
116544,1431,New Zealand,Pakistan,18.1,4,,London,Kennington Oval
116545,1431,New Zealand,Pakistan,18.2,0,,London,Kennington Oval


In [82]:
##Saving this dataframe in pickle ie level 2 
pickle.dump(output,open('dataset_level2.pkl','wb'))


# (i) For ODI

In [66]:
filenames_odis = []
for file in os.listdir('odis'):
    filenames_odis.append(os.path.join('odis', file))

In [67]:
filenames_odis[0:5]

['odis/300441.yaml',
 'odis/643669.yaml',
 'odis/536931.yaml',
 'odis/750665.yaml',
 'odis/1073427.yaml']

In [68]:
final_df_odi = pd.DataFrame()
counter = 1

for file in tqdm(filenames_odis):   ## Loop to iterate through each file
    with open(file, 'r') as f:
        try:
            df_odi = pd.json_normalize(safe_load(f)) ## json_normalizing each file and handelling yaml fils with safe_load()
            df_odi['match_id'] = counter
            final_df_odi = pd.concat([final_df_odi, df_odi], ignore_index=True)
            counter += 1
        except NotImplementedError: ##Exception handelingcfor the file in which Normalization is not implemented 
            print(f"Normalization not implemented for file: {file}")

final_df_odi

 81%|██████████████████████████████████▊        | 1646/2034 [07:12<01:15,  5.11it/s]

Normalization not implemented for file: odis/README.txt


100%|███████████████████████████████████████████| 2034/2034 [08:53<00:00,  3.81it/s]


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.city,info.dates,info.gender,info.match_type,info.outcome.by.runs,info.outcome.method,...,info.supersubs.Australia,info.supersubs.New Zealand,info.supersubs.Zimbabwe,info.outcome.eliminator,info.supersubs.West Indies,info.supersubs.Africa XI,info.supersubs.Asia XI,info.supersubs.Pakistan,info.supersubs.England,info.supersubs.ICC World XI
0,"[{'1st innings': {'team': 'England', 'deliveri...",0.9,2015-08-31,1,Christchurch,[2008-02-23],male,ODI,34.0,D/L,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.9,2013-07-29,1,,[2013-07-28],male,ODI,,,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'West Indies', 'deli...",0.9,2011-12-08,2,Ahmedabad,[2011-12-05],male,ODI,16.0,,...,,,,,,,,,,
3,"[{'1st innings': {'team': 'England', 'deliveri...",0.9,2014-11-29,1,Colombo,[2014-11-29],male,ODI,,,...,,,,,,,,,,
4,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.9,2017-02-19,1,Colombo,[2017-02-19],female,ODI,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2028,"[{'1st innings': {'team': 'New Zealand', 'deli...",0.9,2015-06-22,1,Chester-le-Street,[2015-06-20],male,ODI,,D/L,...,,,,,,,,,,
2029,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2016-10-12,1,Port Elizabeth,[2016-10-09],male,ODI,,,...,,,,,,,,,,
2030,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2015-03-15,1,,[2015-03-08],male,ODI,64.0,,...,,,,,,,,,,
2031,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",0.9,2013-03-25,1,Hambantota,[2013-03-23],male,ODI,,D/L,...,,,,,,,,,,


In [69]:
backup_odi = final_df_odi.copy() ##Creating a backup of converted data

In [70]:
## removing unnecessary columns from the data frame

final_df_odi.drop(columns=[
    'info.outcome.by.runs', 
    'info.outcome.by.wickets', 
    'info.supersubs.India', 
    'info.supersubs.Sri Lanka', 
    'info.supersubs.Bangladesh', 
    'info.supersubs.Australia', 
    'info.supersubs.Zimbabwe', 
    'info.supersubs.West Indies', 
    'info.supersubs.Africa XI', 
    'info.supersubs.Asia XI', 
    'info.supersubs.Pakistan', 
    'info.supersubs.England', 
    'info.supersubs.ICC World XI'
],inplace=True)

In [71]:
## removing unnecessary columns from the data frame

final_df_odi.drop(columns=[
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
],inplace=True)

In [72]:
final_df_odi

Unnamed: 0,innings,info.city,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'England', 'deliveri...",Christchurch,[2008-02-23],male,ODI,New Zealand,50,[BB McCullum],"[New Zealand, England]",field,New Zealand,"[Asad Rauf, BF Bowden]",AMI Stadium,1
1,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",,[2013-07-28],male,ODI,India,50,[A Mishra],"[Zimbabwe, India]",field,India,"[BNJ Oxenford, RB Tiffin]",Harare Sports Club,2
2,"[{'1st innings': {'team': 'West Indies', 'deli...",Ahmedabad,[2011-12-05],male,ODI,West Indies,50,[R Rampaul],"[India, West Indies]",field,India,"[S Asnani, AL Hill]","Sardar Patel Stadium, Motera",3
3,"[{'1st innings': {'team': 'England', 'deliveri...",Colombo,[2014-11-29],male,ODI,Sri Lanka,50,[DPMD Jayawardene],"[Sri Lanka, England]",bat,England,"[SJ Davis, RR Wimalasiri]",R Premadasa Stadium,4
4,"[{'1st innings': {'team': 'Pakistan', 'deliver...",Colombo,[2017-02-19],female,ODI,India,50,[E Bisht],"[India, Pakistan]",field,India,"[CA Polosak, RR Wimalasiri]",P Sara Oval,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2028,"[{'1st innings': {'team': 'New Zealand', 'deli...",Chester-le-Street,[2015-06-20],male,ODI,England,50,[JM Bairstow],"[England, New Zealand]",field,England,"[MA Gough, BNJ Oxenford]",Riverside Ground,2029
2029,"[{'1st innings': {'team': 'Australia', 'delive...",Port Elizabeth,[2016-10-09],male,ODI,South Africa,50,[KJ Abbott],"[South Africa, Australia]",bat,Australia,"[AT Holdstock, NJ Llong]",St George's Park,2030
2030,"[{'1st innings': {'team': 'Australia', 'delive...",,[2015-03-08],male,ODI,Australia,50,[GJ Maxwell],"[Australia, Sri Lanka]",bat,Australia,"[Aleem Dar, IJ Gould]",Sydney Cricket Ground,2031
2031,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",Hambantota,[2013-03-23],male,ODI,Sri Lanka,50,[TM Dilshan],"[Sri Lanka, Bangladesh]",field,Sri Lanka,"[NJ Llong, RSA Palliyaguruge]",Mahinda Rajapaksa International Cricket Stadiu...,2032


In [78]:
## Findings number of matches played by women and man
final_df_odi['info.gender'].value_counts()

info.gender
male      1799
female     234
Name: count, dtype: int64

In [81]:
## Removing unnecessary data (women matches)
# Step 1: Filter the DataFrame based on the 'info.gender' column
final_df_odi = final_df_odi[final_df_odi['info.gender'] == 'male']

# Step 2: Reset the index after filtering
final_df_odi.reset_index(drop=True, inplace=True)

# Step 3: Display the resulting DataFrame
final_df_odi


Unnamed: 0,innings,info.city,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id
0,"[{'1st innings': {'team': 'England', 'deliveri...",Christchurch,[2008-02-23],male,ODI,New Zealand,50,[BB McCullum],"[New Zealand, England]",field,New Zealand,"[Asad Rauf, BF Bowden]",AMI Stadium,1
1,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",,[2013-07-28],male,ODI,India,50,[A Mishra],"[Zimbabwe, India]",field,India,"[BNJ Oxenford, RB Tiffin]",Harare Sports Club,2
2,"[{'1st innings': {'team': 'West Indies', 'deli...",Ahmedabad,[2011-12-05],male,ODI,West Indies,50,[R Rampaul],"[India, West Indies]",field,India,"[S Asnani, AL Hill]","Sardar Patel Stadium, Motera",3
3,"[{'1st innings': {'team': 'England', 'deliveri...",Colombo,[2014-11-29],male,ODI,Sri Lanka,50,[DPMD Jayawardene],"[Sri Lanka, England]",bat,England,"[SJ Davis, RR Wimalasiri]",R Premadasa Stadium,4
4,"[{'1st innings': {'team': 'Kenya', 'deliveries...",St Lucia,[2007-03-24],male,ODI,England,50,[EC Joyce],"[England, Kenya]",bat,Kenya,"[RE Koertzen, PD Parker]","Beausejour Stadium, Gros Islet",6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1794,"[{'1st innings': {'team': 'New Zealand', 'deli...",Chester-le-Street,[2015-06-20],male,ODI,England,50,[JM Bairstow],"[England, New Zealand]",field,England,"[MA Gough, BNJ Oxenford]",Riverside Ground,2029
1795,"[{'1st innings': {'team': 'Australia', 'delive...",Port Elizabeth,[2016-10-09],male,ODI,South Africa,50,[KJ Abbott],"[South Africa, Australia]",bat,Australia,"[AT Holdstock, NJ Llong]",St George's Park,2030
1796,"[{'1st innings': {'team': 'Australia', 'delive...",,[2015-03-08],male,ODI,Australia,50,[GJ Maxwell],"[Australia, Sri Lanka]",bat,Australia,"[Aleem Dar, IJ Gould]",Sydney Cricket Ground,2031
1797,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",Hambantota,[2013-03-23],male,ODI,Sri Lanka,50,[TM Dilshan],"[Sri Lanka, Bangladesh]",field,Sri Lanka,"[NJ Llong, RSA Palliyaguruge]",Mahinda Rajapaksa International Cricket Stadiu...,2032


In [83]:
## further checking data for non odi matches
final_df_odi['info.match_type'].value_counts()

info.match_type
ODI    1799
Name: count, dtype: int64

In [85]:
## second level of checking by checking number of 50 over matches
final_df_odi['info.overs'].value_counts()

info.overs
50    1799
Name: count, dtype: int64

In [93]:
## To prevent data loss pickel the data in a pkl file
pickle.dump(final_df_odi,open('dataset_level1_odi.pkl','wb'))

In [94]:
matches_odi = pickle.load(open('dataset_level1_odi.pkl','rb'))
matches_odi.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'batsman': 'AN Cook',
   'bowler': 'KD Mills',
   'non_striker': 'P Mustard',
   'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
 {0.2: {'batsman': 'P Mustard',
   'bowler': 'KD Mills',
   'non_striker': 'AN Cook',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.3: {'batsman': 'P Mustard',
   'bowler': 'KD Mills',
   'non_striker': 'AN Cook',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.4: {'batsman': 'P Mustard',
   'bowler': 'KD Mills',
   'non_striker': 'AN Cook',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.5: {'batsman': 'P Mustard',
   'bowler': 'KD Mills',
   'non_striker': 'AN Cook',
   'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
 {0.6: {'batsman': 'AN Cook',
   'bowler': 'KD Mills',
   'non_striker': 'P Mustard',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {1.1: {'batsman': 'P Mustard',
   'bowler': 'CS Martin',
   'non_striker': 'AN Cook',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {1.2: {'batsman': 

In [97]:
## Extracting ball by ball data for first innings for each match
delivery_data_odi = []  # List to store data for appending

for index, row in matches_odi.iterrows():
    match_id = index  # Using the index as match_id
    first_innings = row['innings'][0]['1st innings']
    
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    city = []
    venue = []
    
    for delivery in first_innings['deliveries']:
        for ball, ball_info in delivery.items():
            ball_of_match.append(ball)
            batsman.append(ball_info['batsman'])
            bowler.append(ball_info['bowler'])
            runs.append(ball_info['runs']['total'])
            player_of_dismissed.append(ball_info['wicket']['player_out'] if 'wicket' in ball_info else None)
            teams.append(row['info.teams'])
            batting_team.append(first_innings['team'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
    
    loop_df = pd.DataFrame({
        'match_id': match_id,
        'teams': teams,
        'batting_team': batting_team,
        'ball': ball_of_match,
        'batsman': batsman,
        'bowler': bowler,
        'runs': runs,
        'player_dismissed': player_of_dismissed,
        'city': city,
        'venue': venue
    })
    
    delivery_data_odi.append(loop_df)

# Create DataFrame from the list of data
delivery_df_odi = pd.concat(delivery_data_odi, ignore_index=True)


In [98]:
delivery_df_odi

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,0,"[New Zealand, England]",England,0.1,AN Cook,KD Mills,1,,Christchurch,AMI Stadium
1,0,"[New Zealand, England]",England,0.2,P Mustard,KD Mills,0,,Christchurch,AMI Stadium
2,0,"[New Zealand, England]",England,0.3,P Mustard,KD Mills,0,,Christchurch,AMI Stadium
3,0,"[New Zealand, England]",England,0.4,P Mustard,KD Mills,0,,Christchurch,AMI Stadium
4,0,"[New Zealand, England]",England,0.5,P Mustard,KD Mills,1,,Christchurch,AMI Stadium
...,...,...,...,...,...,...,...,...,...,...
519641,1798,"[India, Pakistan]",Pakistan,49.2,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium
519642,1798,"[India, Pakistan]",Pakistan,49.3,Mohammad Yousuf,RP Singh,1,,Gwalior,Captain Roop Singh Stadium
519643,1798,"[India, Pakistan]",Pakistan,49.4,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium
519644,1798,"[India, Pakistan]",Pakistan,49.5,Mohammad Yousuf,RP Singh,6,,Gwalior,Captain Roop Singh Stadium


In [99]:
'''
The required columns for our model are -: 
batting_team, bowling_team, current_score,wickets_left, crr, city, balls_left and last_five.
'''

# Extracting bowling_team
def bowl(row):
    for team in row['teams']:
        if team != row['batting_team']:
            return team


In [100]:
delivery_df_odi['bowling_team'] = delivery_df_odi.apply(bowl,axis=1)

In [101]:
delivery_df_odi

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,0,"[New Zealand, England]",England,0.1,AN Cook,KD Mills,1,,Christchurch,AMI Stadium,New Zealand
1,0,"[New Zealand, England]",England,0.2,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
2,0,"[New Zealand, England]",England,0.3,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
3,0,"[New Zealand, England]",England,0.4,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
4,0,"[New Zealand, England]",England,0.5,P Mustard,KD Mills,1,,Christchurch,AMI Stadium,New Zealand
...,...,...,...,...,...,...,...,...,...,...,...
519641,1798,"[India, Pakistan]",Pakistan,49.2,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519642,1798,"[India, Pakistan]",Pakistan,49.3,Mohammad Yousuf,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519643,1798,"[India, Pakistan]",Pakistan,49.4,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519644,1798,"[India, Pakistan]",Pakistan,49.5,Mohammad Yousuf,RP Singh,6,,Gwalior,Captain Roop Singh Stadium,India


In [102]:
## Dropping teams column as now its unnecessary
delivery_df_odi.drop(columns=['teams'],inplace=True)

In [103]:
delivery_df_odi

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,0,England,0.1,AN Cook,KD Mills,1,,Christchurch,AMI Stadium,New Zealand
1,0,England,0.2,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
2,0,England,0.3,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
3,0,England,0.4,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
4,0,England,0.5,P Mustard,KD Mills,1,,Christchurch,AMI Stadium,New Zealand
...,...,...,...,...,...,...,...,...,...,...
519641,1798,Pakistan,49.2,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519642,1798,Pakistan,49.3,Mohammad Yousuf,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519643,1798,Pakistan,49.4,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519644,1798,Pakistan,49.5,Mohammad Yousuf,RP Singh,6,,Gwalior,Captain Roop Singh Stadium,India


In [104]:
## Finding number of unique teams.
delivery_df_odi['batting_team'].unique()

array(['England', 'Zimbabwe', 'West Indies', 'Kenya', 'Sri Lanka',
       'Pakistan', 'Bangladesh', 'India', 'New Zealand', 'South Africa',
       'Australia', 'Netherlands', 'Ireland', 'United Arab Emirates',
       'Scotland', 'Afghanistan', 'United States of America',
       'Papua New Guinea', 'Bermuda', 'Canada', 'Hong Kong', 'Asia XI',
       'Africa XI', 'Namibia', 'Nepal', 'Oman'], dtype=object)

In [105]:
## As we know that most of the above teams have very less data which makes model training difficult.
## So we are taking only top 10 teams
teams = [
    'Australia',
    'India',
    'Bangladesh',
    'New Zealand',
    'South Africa',
    'England',
    'West Indies',
    'Afghanistan',
    'Pakistan',
    'Sri Lanka'    
]


In [107]:
delivery_df_odi = delivery_df_odi[delivery_df_odi['batting_team'].isin(teams)]
delivery_df_odi = delivery_df_odi[delivery_df_odi['bowling_team'].isin(teams)]

In [108]:
delivery_df_odi

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,0,England,0.1,AN Cook,KD Mills,1,,Christchurch,AMI Stadium,New Zealand
1,0,England,0.2,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
2,0,England,0.3,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
3,0,England,0.4,P Mustard,KD Mills,0,,Christchurch,AMI Stadium,New Zealand
4,0,England,0.5,P Mustard,KD Mills,1,,Christchurch,AMI Stadium,New Zealand
...,...,...,...,...,...,...,...,...,...,...
519641,1798,Pakistan,49.2,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519642,1798,Pakistan,49.3,Mohammad Yousuf,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519643,1798,Pakistan,49.4,Kamran Akmal,RP Singh,1,,Gwalior,Captain Roop Singh Stadium,India
519644,1798,Pakistan,49.5,Mohammad Yousuf,RP Singh,6,,Gwalior,Captain Roop Singh Stadium,India


In [110]:
## Creating a new data frame to use for modeling 
output_odi = delivery_df_odi[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [111]:
output_odi

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,0,England,New Zealand,0.1,1,,Christchurch,AMI Stadium
1,0,England,New Zealand,0.2,0,,Christchurch,AMI Stadium
2,0,England,New Zealand,0.3,0,,Christchurch,AMI Stadium
3,0,England,New Zealand,0.4,0,,Christchurch,AMI Stadium
4,0,England,New Zealand,0.5,1,,Christchurch,AMI Stadium
...,...,...,...,...,...,...,...,...
519641,1798,Pakistan,India,49.2,1,,Gwalior,Captain Roop Singh Stadium
519642,1798,Pakistan,India,49.3,1,,Gwalior,Captain Roop Singh Stadium
519643,1798,Pakistan,India,49.4,1,,Gwalior,Captain Roop Singh Stadium
519644,1798,Pakistan,India,49.5,6,,Gwalior,Captain Roop Singh Stadium


In [114]:
##Saving this dataframe in pickle ie level 2 
pickle.dump(output_odi,open('dataset_level2_odi.pkl','wb'))
