In [6]:
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

In [7]:
filenames = []
for file in os.listdir("data"):
    filenames.append(os.path.join('data',file))

In [8]:
filenames[0:5]

['data\\1001349.yaml',
 'data\\1001351.yaml',
 'data\\1001353.yaml',
 'data\\1004729.yaml',
 'data\\1007655.yaml']

In [11]:
import os
import pandas as pd
from tqdm import tqdm
import yaml

# Function to safely load a YAML file
def safe_load_yaml(file):
    try:
        with open(file, 'r') as f:
            return yaml.safe_load(f)  # Loads YAML data safely
    except Exception as e:
        print(f"Error loading {file}: {e}")
        return None

# Get the list of filenames in the 'data' directory
filenames = [os.path.join('data', file) for file in os.listdir('data')]

# List to hold DataFrames
final_df_list = []  # We will collect DataFrames in this list
counter = 1

# Loop over all files in the 'filenames' list
for file in tqdm(filenames, desc="Processing YAML files", total=len(filenames)):
    data = safe_load_yaml(file)
    if data:  # Proceed if the file loaded successfully
        # Normalize the nested YAML data into a DataFrame
        df = pd.json_normalize(data)
        
        # Add a 'match_id' column to track each match
        df['match_id'] = counter
        
        # Append the DataFrame to the list
        final_df_list.append(df)
        counter += 1

# Concatenate all DataFrames in the list into a single DataFrame
final_df = pd.concat(final_df_list, ignore_index=True)

# Show the first few rows of the final DataFrame
final_df.head()


Processing YAML files: 100%|███████████████████████████████████████████████████████| 1432/1432 [03:35<00:00,  6.65it/s]


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,info.overs,...,info.outcome.by.runs,info.match_type_number,info.neutral_venue,info.outcome.method,info.outcome.result,info.outcome.eliminator,info.supersubs.New Zealand,info.supersubs.South Africa,info.bowl_out,info.outcome.bowl_out
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-18,2,[2017-02-17],male,T20,5.0,Sri Lanka,20,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-19,2,[2017-02-19],male,T20,2.0,Sri Lanka,20,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-23,1,[2017-02-22],male,T20,,Australia,20,...,41.0,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.9,2016-09-12,1,[2016-09-05],male,T20,,Hong Kong,20,...,40.0,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.9,2016-06-19,1,[2016-06-18],male,T20,,Zimbabwe,20,...,2.0,,,,,,,,,


In [12]:
final_df.shape

(1432, 28)

In [13]:
final_df


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,info.overs,...,info.outcome.by.runs,info.match_type_number,info.neutral_venue,info.outcome.method,info.outcome.result,info.outcome.eliminator,info.supersubs.New Zealand,info.supersubs.South Africa,info.bowl_out,info.outcome.bowl_out
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-18,2,[2017-02-17],male,T20,5.0,Sri Lanka,20,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-19,2,[2017-02-19],male,T20,2.0,Sri Lanka,20,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-23,1,[2017-02-22],male,T20,,Australia,20,...,41.0,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.9,2016-09-12,1,[2016-09-05],male,T20,,Hong Kong,20,...,40.0,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.9,2016-06-19,1,[2016-06-18],male,T20,,Zimbabwe,20,...,2.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.9,2016-03-05,2,[2016-03-04],male,T20,6.0,Pakistan,20,...,,,1.0,,,,,,,
1428,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",0.9,2016-03-08,1,[2016-03-06],male,T20,8.0,India,20,...,,,,,,,,,,
1429,"[{'1st innings': {'team': 'Netherlands', 'deli...",0.9,2016-02-03,1,[2016-02-03],male,T20,,Netherlands,20,...,84.0,,,,,,,,,
1430,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2016-09-12,1,[2016-09-06],male,T20,,Australia,20,...,85.0,,,,,,,,,


In [14]:
backup = final_df.copy()

In [15]:
final_df.drop(columns=[
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.outcome.bowl_out',
    'info.bowl_out',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
    'info.outcome.by.runs',
    'info.outcome.by.wickets'
],inplace=True)

In [16]:
final_df

Unnamed: 0,innings,info.dates,info.gender,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],male,T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],male,T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],male,T20,Australia,20,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],male,T20,Hong Kong,20,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],male,T20,Zimbabwe,20,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],male,T20,Pakistan,20,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,1428,Mirpur
1428,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],male,T20,India,20,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,1429,Mirpur
1429,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],male,T20,Netherlands,20,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,1430,Dubai
1430,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],male,T20,Australia,20,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,1431,


In [17]:
final_df['info.gender'].value_counts()

info.gender
male      966
female    466
Name: count, dtype: int64

In [18]:
final_df = final_df[final_df['info.gender'] == 'male']
final_df.drop(columns=['info.gender'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.gender'],inplace=True)


Unnamed: 0,innings,info.dates,info.match_type,info.outcome.winner,info.overs,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],T20,Sri Lanka,20,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],T20,Australia,20,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],T20,Hong Kong,20,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],T20,Zimbabwe,20,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],T20,Pakistan,20,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,1428,Mirpur
1428,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],T20,India,20,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,1429,Mirpur
1429,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],T20,Netherlands,20,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,1430,Dubai
1430,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],T20,Australia,20,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,1431,


In [19]:
final_df['info.match_type'].value_counts()

info.match_type
T20    966
Name: count, dtype: int64

In [20]:
final_df['info.overs'].value_counts()

info.overs
20    963
50      3
Name: count, dtype: int64

In [21]:
final_df = final_df[final_df['info.overs'] == 20]
final_df.drop(columns=['info.overs','info.match_type'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.overs','info.match_type'],inplace=True)


Unnamed: 0,innings,info.dates,info.outcome.winner,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],Australia,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],Hong Kong,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],Zimbabwe,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],Pakistan,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,1428,Mirpur
1428,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],India,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,1429,Mirpur
1429,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],Netherlands,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,1430,Dubai
1430,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],Australia,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,1431,


In [23]:
final_df.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.2: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.3: {'batsman': 'AJ Finch',
   'bowler': 'SL Malinga',
   'non_striker': 'M Klinger',
   'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
 {0.4: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 2, 'extras': 0, 'total': 2}}},
 {0.5: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
 {0.6: {'batsman': 'M Klinger',
   'bowler': 'SL Malinga',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 3, 'extras': 0, 'total': 3}}},
 {1.1: {'batsman': 'M Klinger',
   'bowler': 'KMDN Kulasekara',
   'non_striker': 'AJ Finch',
   'runs': {'batsman': 0, 'extras': 0, 'total': 

In [25]:
import pandas as pd

# Set of match IDs to exclude
excluded_match_ids = {75, 108, 150, 180, 268, 360, 443, 458, 584, 748, 982, 1052, 1111, 1226, 1345}

# List to accumulate data
delivery_df_list = []
count = 1

# Loop through each row in final_df
for index, row in final_df.iterrows():
    # Skip matches that are in the excluded list
    if count in excluded_match_ids:
        count += 1
        continue
    
    # Initialize lists for the current match's data
    ball_of_match, batsman, bowler, runs, player_of_dismissed, teams, batting_team, match_id, city, venue = [], [], [], [], [], [], [], [], [], []
    
    # Loop through the deliveries of the first innings
    for ball in row['innings'][0]['1st innings']['deliveries']:
        for key, ball_data in ball.items():
            # Append the data for each delivery
            match_id.append(count)
            batting_team.append(row['innings'][0]['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball_data['batsman'])
            bowler.append(ball_data['bowler'])
            runs.append(ball_data['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            
            # Handle player dismissal (if any)
            player_of_dismissed.append(ball_data.get('wicket', {}).get('player_out', '0'))
    
    # Create a DataFrame for the current match's deliveries
    loop_df = pd.DataFrame({
        'match_id': match_id,
        'teams': teams,
        'batting_team': batting_team,
        'ball': ball_of_match,
        'batsman': batsman,
        'bowler': bowler,
        'runs': runs,
        'player_dismissed': player_of_dismissed,
        'city': city,
        'venue': venue
    })
    
    # Append to the list of DataFrames
    delivery_df_list.append(loop_df)
    
    # Increment the match counter
    count += 1

# Concatenate all DataFrames in the list to create the final delivery DataFrame
delivery_df = pd.concat(delivery_df_list, ignore_index=True)

# Display the first few rows of the resulting DataFrame
delivery_df.head()


Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,1,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
1,1,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
2,1,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground
3,1,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground
4,1,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground


In [26]:
delivery_df.shape

(115325, 10)

In [27]:
def bowl(row):
    for team in row['teams']:
        if team != row['batting_team']:
            return team

In [28]:
delivery_df['bowling_team'] = delivery_df.apply(bowl,axis=1)
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,1,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,1,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,1,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,1,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...
115320,963,"[Sri Lanka, Australia]",Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
115321,963,"[Sri Lanka, Australia]",Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
115322,963,"[Sri Lanka, Australia]",Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
115323,963,"[Sri Lanka, Australia]",Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [29]:
delivery_df.drop(columns=['teams'],inplace=True)

In [30]:
delivery_df['batting_team'].unique()

array(['Australia', 'Hong Kong', 'Zimbabwe', 'India', 'Bangladesh',
       'New Zealand', 'South Africa', 'England', 'West Indies', 'Ireland',
       'Afghanistan', 'Pakistan', 'United Arab Emirates', 'Scotland',
       'Oman', 'Papua New Guinea', 'Sri Lanka', 'Netherlands', 'Nepal',
       'Vanuatu', 'Philippines', 'United States of America', 'Germany',
       'Ghana', 'Uganda', 'Kenya', 'Namibia', 'Nigeria', 'Botswana',
       'Guernsey', 'Denmark', 'Jersey', 'Italy', 'Norway', 'Thailand',
       'Malaysia', 'Maldives', 'Singapore', 'Kuwait', 'Bermuda', 'Canada',
       'Cayman Islands', 'Portugal', 'Gibraltar', 'Spain', 'Bhutan',
       'Qatar', 'Iran', 'Belgium', 'Isle of Man', 'Bulgaria', 'Romania'],
      dtype=object)

In [31]:
teams = [
    'Australia',
    'India',
    'Bangladesh',
    'New Zealand',
    'South Africa',
    'England',
    'West Indies',
    'Afghanistan',
    'Pakistan',
    'Sri Lanka',
    'Zimbabwe',
    'Ireland',
    'Scotland'
]

In [32]:
delivery_df = delivery_df[delivery_df['batting_team'].isin(teams)]
delivery_df = delivery_df[delivery_df['bowling_team'].isin(teams)]
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,1,Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,1,Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,1,Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,1,Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
115321,963,Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
115322,963,Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
115323,963,Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [58]:
output1 = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [59]:
output1

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,1,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,1,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,1,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,1,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,1,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


In [60]:
output1[output1['city'].isnull()]['venue'].value_counts()

venue
Dubai International Cricket Stadium        3798
Harare Sports Club                         2731
Pallekele International Cricket Stadium    2066
Melbourne Cricket Ground                   1453
Sharjah Cricket Stadium                    1130
Sydney Cricket Ground                       749
Adelaide Oval                               498
Rawalpindi Cricket Stadium                  368
Sylhet International Cricket Stadium        128
Sylhet Stadium                              121
Carrara Oval                                 64
Name: count, dtype: int64

In [61]:
x = np.where(output1['city'].isnull(), output1['venue'].str.split().apply(lambda x:x[0]), output1['city'])

In [62]:
output1['city'] = x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output1['city'] = x


In [63]:
output1.isnull().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [64]:
output1.drop(columns=['venue'],inplace=True)
output1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output1.drop(columns=['venue'],inplace=True)


Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo


In [65]:
total_df = output1.groupby('match_id').sum()['runs'].reset_index()
output1 = output1.merge(total_df,on='match_id')

In [66]:
output1

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs_x,player_dismissed,city,runs_y
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,168
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,168
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,168
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,168
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,168
...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,19.3,1,0,Colombo,128
77856,963,Sri Lanka,Australia,19.4,0,0,Colombo,128
77857,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,128
77858,963,Sri Lanka,Australia,19.6,2,0,Colombo,128


In [67]:
output1['current_score'] = output1.groupby('match_id')['runs_x'].cumsum()

In [68]:
output1


Unnamed: 0,match_id,batting_team,bowling_team,ball,runs_x,player_dismissed,city,runs_y,current_score
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,168,0
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,168,0
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,168,1
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,168,3
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,168,3
...,...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,19.3,1,0,Colombo,128,125
77856,963,Sri Lanka,Australia,19.4,0,0,Colombo,128,125
77857,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,128,125
77858,963,Sri Lanka,Australia,19.6,2,0,Colombo,128,127


In [69]:
output1['over'] = output1['ball'].apply(lambda x:str(x).split(".")[0])
output1['ball_no'] = output1['ball'].apply(lambda x:str(x).split(".")[1])

In [70]:
output1.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs_x,player_dismissed,city,runs_y,current_score,over,ball_no
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,168,0,0,1
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,168,0,0,2
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,168,1,0,3
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,168,3,0,4
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,168,3,0,5


In [71]:
output1['balls_bowled'] = (output1['over'].astype('int')*6) + output1['ball_no'].astype('int')

In [72]:
output1.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs_x,player_dismissed,city,runs_y,current_score,over,ball_no,balls_bowled
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,168,0,0,1,1
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,168,0,0,2,2
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,168,1,0,3,3
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,168,3,0,4,4
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,168,3,0,5,5


In [73]:
output1['crr'] = round((output1['current_score']*6)/output1['balls_bowled'],2)

In [74]:
output1['player_dismissed'] = output1['player_dismissed'].apply(lambda x:0 if x=='0' else 1)
output1['player_dismissed'] = output1['player_dismissed'].astype('int')
output1

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs_x,player_dismissed,city,runs_y,current_score,over,ball_no,balls_bowled,crr
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,168,0,0,1,1,0.00
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,168,0,0,2,2,0.00
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,168,1,0,3,3,2.00
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,168,3,0,4,4,4.50
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,168,3,0,5,5,3.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,19.3,1,0,Colombo,128,125,19,3,117,6.41
77856,963,Sri Lanka,Australia,19.4,0,0,Colombo,128,125,19,4,118,6.36
77857,963,Sri Lanka,Australia,19.5,0,1,Colombo,128,125,19,5,119,6.30
77858,963,Sri Lanka,Australia,19.6,2,0,Colombo,128,127,19,6,120,6.35


In [75]:
output1['player_dismissed1'] = output1.groupby('match_id')['player_dismissed'].cumsum()


In [76]:
output1

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs_x,player_dismissed,city,runs_y,current_score,over,ball_no,balls_bowled,crr,player_dismissed1
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,168,0,0,1,1,0.00,0
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,168,0,0,2,2,0.00,0
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,168,1,0,3,3,2.00,0
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,168,3,0,4,4,4.50,0
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,168,3,0,5,5,3.60,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,19.3,1,0,Colombo,128,125,19,3,117,6.41,8
77856,963,Sri Lanka,Australia,19.4,0,0,Colombo,128,125,19,4,118,6.36,8
77857,963,Sri Lanka,Australia,19.5,0,1,Colombo,128,125,19,5,119,6.30,9
77858,963,Sri Lanka,Australia,19.6,2,0,Colombo,128,127,19,6,120,6.35,9


In [77]:
output1['wickets_left'] = (10 - output1['player_dismissed1'])

In [78]:
output1

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs_x,player_dismissed,city,runs_y,current_score,over,ball_no,balls_bowled,crr,player_dismissed1,wickets_left
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,168,0,0,1,1,0.00,0,10
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,168,0,0,2,2,0.00,0,10
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,168,1,0,3,3,2.00,0,10
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,168,3,0,4,4,4.50,0,10
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,168,3,0,5,5,3.60,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,19.3,1,0,Colombo,128,125,19,3,117,6.41,8,2
77856,963,Sri Lanka,Australia,19.4,0,0,Colombo,128,125,19,4,118,6.36,8,2
77857,963,Sri Lanka,Australia,19.5,0,1,Colombo,128,125,19,5,119,6.30,9,1
77858,963,Sri Lanka,Australia,19.6,2,0,Colombo,128,127,19,6,120,6.35,9,1


In [102]:
final_df3 = output1[['match_id','batting_team','bowling_team','runs_x','current_score','balls_bowled','wickets_left','crr','city','runs_y']]

In [103]:
final_df3['balls_left'] = 120 - final_df3['balls_bowled']
final_df3['balls_left'] = final_df3['balls_left'].apply(lambda x:0 if x<0 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df3['balls_left'] = 120 - final_df3['balls_bowled']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df3['balls_left'] = final_df3['balls_left'].apply(lambda x:0 if x<0 else x)


In [104]:
final_df3['crr'] = round((final_df3['current_score']*6)/final_df3['balls_bowled'],2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df3['crr'] = round((final_df3['current_score']*6)/final_df3['balls_bowled'],2)


In [105]:
final_df3

Unnamed: 0,match_id,batting_team,bowling_team,runs_x,current_score,balls_bowled,wickets_left,crr,city,runs_y,balls_left
0,1,Australia,Sri Lanka,0,0,1,10,0.00,Melbourne,168,119
1,1,Australia,Sri Lanka,0,0,2,10,0.00,Melbourne,168,118
2,1,Australia,Sri Lanka,1,1,3,10,2.00,Melbourne,168,117
3,1,Australia,Sri Lanka,2,3,4,10,4.50,Melbourne,168,116
4,1,Australia,Sri Lanka,0,3,5,10,3.60,Melbourne,168,115
...,...,...,...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,1,125,117,2,6.41,Colombo,128,3
77856,963,Sri Lanka,Australia,0,125,118,2,6.36,Colombo,128,2
77857,963,Sri Lanka,Australia,0,125,119,1,6.30,Colombo,128,1
77858,963,Sri Lanka,Australia,2,127,120,1,6.35,Colombo,128,0


In [106]:
final_df3.drop(columns=['balls_bowled'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df3.drop(columns=['balls_bowled'],inplace=True)


In [107]:
final_df3

Unnamed: 0,match_id,batting_team,bowling_team,runs_x,current_score,wickets_left,crr,city,runs_y,balls_left
0,1,Australia,Sri Lanka,0,0,10,0.00,Melbourne,168,119
1,1,Australia,Sri Lanka,0,0,10,0.00,Melbourne,168,118
2,1,Australia,Sri Lanka,1,1,10,2.00,Melbourne,168,117
3,1,Australia,Sri Lanka,2,3,10,4.50,Melbourne,168,116
4,1,Australia,Sri Lanka,0,3,10,3.60,Melbourne,168,115
...,...,...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,1,125,2,6.41,Colombo,128,3
77856,963,Sri Lanka,Australia,0,125,2,6.36,Colombo,128,2
77857,963,Sri Lanka,Australia,0,125,1,6.30,Colombo,128,1
77858,963,Sri Lanka,Australia,2,127,1,6.35,Colombo,128,0


In [108]:
eligible_cities = final_df3['city'].value_counts()[final_df3['city'].value_counts() > 200].index.tolist()
final_df3 = final_df3[final_df3['city'].isin(eligible_cities)]

In [109]:
final_df3

Unnamed: 0,match_id,batting_team,bowling_team,runs_x,current_score,wickets_left,crr,city,runs_y,balls_left
0,1,Australia,Sri Lanka,0,0,10,0.00,Melbourne,168,119
1,1,Australia,Sri Lanka,0,0,10,0.00,Melbourne,168,118
2,1,Australia,Sri Lanka,1,1,10,2.00,Melbourne,168,117
3,1,Australia,Sri Lanka,2,3,10,4.50,Melbourne,168,116
4,1,Australia,Sri Lanka,0,3,10,3.60,Melbourne,168,115
...,...,...,...,...,...,...,...,...,...,...
77855,963,Sri Lanka,Australia,1,125,2,6.41,Colombo,128,3
77856,963,Sri Lanka,Australia,0,125,2,6.36,Colombo,128,2
77857,963,Sri Lanka,Australia,0,125,1,6.30,Colombo,128,1
77858,963,Sri Lanka,Australia,2,127,1,6.35,Colombo,128,0


In [110]:
final_df4=final_df3.copy()

In [111]:
X = final_df3.drop(columns=['match_id','runs_x','runs_y'])
y = final_df3['runs_y']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [112]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error

trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse=False,drop='first'),['batting_team','bowling_team','city'])
]
,remainder='passthrough')

In [113]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',XGBRegressor(n_estimators=1000,learning_rate=0.2,max_depth=12,random_state=1))
])

In [114]:
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))



0.9652022603400091
2.2938976950334706


In [116]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

In [119]:
final_df3['bowling_team'].unique()

array(['Sri Lanka', 'India', 'Zimbabwe', 'New Zealand', 'Bangladesh',
       'England', 'South Africa', 'Afghanistan', 'Ireland', 'Pakistan',
       'West Indies', 'Australia', 'Scotland'], dtype=object)

In [120]:
final_df3['city'].unique()


array(['Melbourne', 'Adelaide', 'Harare', 'Napier', 'Mount Maunganui',
       'Auckland', 'Southampton', 'Cardiff', 'Chester-le-Street',
       'Nagpur', 'Bangalore', 'Greater Noida', 'Lauderhill', 'Dubai',
       'Abu Dhabi', 'Sydney', 'Hobart', 'Wellington', 'Hamilton',
       'Bloemfontein', 'Potchefstroom', 'Barbados', 'Trinidad', 'Colombo',
       'St Kitts', 'Jamaica', 'Nelson', 'Ranchi', 'Birmingham',
       'Manchester', 'Bristol', 'Delhi', 'Rajkot', 'Lahore',
       'Johannesburg', 'Centurion', 'Cape Town', 'Cuttack', 'Indore',
       'Mumbai', 'Edinburgh', 'Dhaka', 'Sylhet', 'Sharjah', 'Karachi',
       'Dublin', 'Deventer', 'East London', 'Brisbane', 'Dehradun',
       'Bready', 'Kolkata', 'Lucknow', 'Chennai', 'Basseterre',
       'Dehra Dun', 'Visakhapatnam', 'Bengaluru', 'Canberra', 'Perth',
       'Durban', 'Port Elizabeth', 'Chandigarh', 'Christchurch', 'Kandy',
       'Chattogram', 'Pune', 'Rawalpindi', 'London', 'Nottingham',
       'King City', 'Guyana', 'St Lucia', 