#Import necessary libraries


In [1]:
import numpy as np

import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

# Collect the full paths of all files in the specified directory into a list


In [2]:
filenames = []
for file in os.listdir('/content/drive/MyDrive/Project/T20_run_prediction/t20s'):
  filenames.append(os.path.join('/content/drive/MyDrive/Project/T20_run_prediction/t20s',file))
#

# Get the first five file paths from the filenames list


In [3]:
filenames[0:5]

['/content/drive/MyDrive/Project/T20_run_prediction/t20s/1185188.yaml',
 '/content/drive/MyDrive/Project/T20_run_prediction/t20s/1185186.yaml',
 '/content/drive/MyDrive/Project/T20_run_prediction/t20s/1184262.yaml',
 '/content/drive/MyDrive/Project/T20_run_prediction/t20s/1185181.yaml',
 '/content/drive/MyDrive/Project/T20_run_prediction/t20s/1185316.yaml']

#Read and normalize JSON data from each file in 'filenames' into a DataFrame, handling different JSON structures (single dictionary or list of dictionaries),
# and appending them to 'final_df'; includes error handling for unsupported formats or unexpected exceptions

In [4]:
final_df = pd.DataFrame()
count = 0
for file in tqdm(filenames):
    with open(file, 'r') as f:
        try:
            data = safe_load(f)
            df = pd.json_normalize(data)  # Try normalizing directly
        except NotImplementedError:
            try:
                # Handle case where data is a list of dictionaries
                df = pd.DataFrame()
                for item in data:
                    temp_df = pd.json_normalize(item)
                    df = pd.concat([df, temp_df], ignore_index=True)
            except Exception as e:
                print(f"Error processing file {file}: {e}")
                continue  # Skip to the next file

        df['match_id'] = count
        final_df = pd.concat([final_df, df], ignore_index=True)
        count += 1

 69%|██████▊   | 990/1443 [06:02<02:47,  2.70it/s]

Error processing file /content/drive/MyDrive/Project/T20_run_prediction/t20s/README.txt: 


100%|██████████| 1443/1443 [08:25<00:00,  2.86it/s]


# Create a copy of 'final_df' to preserve the original DataFrame in 'backup' for potential restoration or further analysis


In [5]:
backup = final_df.copy()

# Display the consolidated DataFrame containing normalized data from all JSON files


In [6]:
final_df

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.venue,info.dates,info.gender,info.teams,info.outcome.winner,info.outcome.by.wickets,...,match_id,info.outcome.by.runs,info.outcome.result,info.outcome.eliminator,info.outcome.method,info.bowl_out,info.outcome.bowl_out,info.neutral_venue,info.supersubs.New Zealand,info.supersubs.South Africa
0,"[{'1st innings': {'team': 'Norway', 'deliverie...",0.9,2019-06-22,1,College Field,[2019-06-20],male,"[Germany, Norway]",Germany,7.0,...,0,,,,,,,,,
1,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-20,1,King George V Sports Ground,[2019-06-16],male,"[Jersey, Norway]",Jersey,,...,1,80.0,,,,,,,,
2,"[{'1st innings': {'team': 'Namibia', 'deliveri...",0.9,2019-09-15,1,Kyambogo Cricket Oval,[2019-05-21],male,"[Uganda, Namibia]",Namibia,,...,2,42.0,,,,,,,,
3,"[{'1st innings': {'team': 'Guernsey', 'deliver...",0.9,2019-06-16,2,King George V Sports Ground,[2019-06-15],male,"[Guernsey, Germany]",Germany,5.0,...,3,,,,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2020-02-22,1,The Wanderers Stadium,[2020-02-21],male,"[South Africa, Australia]",Australia,,...,4,107.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-23,1,College Field,[2019-06-19],male,"[Italy, Jersey]",Jersey,,...,1437,73.0,,,,,,,,
1438,"[{'1st innings': {'team': 'West Indies', 'deli...",0.9,2019-08-27,1,Sydney Parade,[2019-05-28],female,"[Ireland, West Indies]",West Indies,,...,1438,45.0,,,,,,,,
1439,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-22,1,College Field,[2019-06-20],male,"[Germany, Jersey]",Germany,3.0,...,1439,,,,,,,,,
1440,"[{'1st innings': {'team': 'Uganda', 'deliverie...",0.9,2019-08-28,1,Lugogo Cricket Oval,[2019-05-20],male,"[Uganda, Botswana]",Uganda,,...,1440,52.0,,,,,,,,


# Count the occurrences of each unique value in the 'info.gender' column to analyze the distribution of gender in the data


In [7]:
final_df['info.gender'].value_counts()

Unnamed: 0_level_0,count
info.gender,Unnamed: 1_level_1
male,966
female,476


# Filter the DataFrame to include only rows where 'info.gender' is 'male', then drop the 'info.gender' column


In [8]:
final_df = final_df[final_df['info.gender'] == 'male']
final_df.drop(columns=['info.gender'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.gender'],inplace=True)


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.venue,info.dates,info.teams,info.outcome.winner,info.outcome.by.wickets,info.toss.decision,...,match_id,info.outcome.by.runs,info.outcome.result,info.outcome.eliminator,info.outcome.method,info.bowl_out,info.outcome.bowl_out,info.neutral_venue,info.supersubs.New Zealand,info.supersubs.South Africa
0,"[{'1st innings': {'team': 'Norway', 'deliverie...",0.9,2019-06-22,1,College Field,[2019-06-20],"[Germany, Norway]",Germany,7.0,field,...,0,,,,,,,,,
1,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-20,1,King George V Sports Ground,[2019-06-16],"[Jersey, Norway]",Jersey,,bat,...,1,80.0,,,,,,,,
2,"[{'1st innings': {'team': 'Namibia', 'deliveri...",0.9,2019-09-15,1,Kyambogo Cricket Oval,[2019-05-21],"[Uganda, Namibia]",Namibia,,bat,...,2,42.0,,,,,,,,
3,"[{'1st innings': {'team': 'Guernsey', 'deliver...",0.9,2019-06-16,2,King George V Sports Ground,[2019-06-15],"[Guernsey, Germany]",Germany,5.0,field,...,3,,,,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2020-02-22,1,The Wanderers Stadium,[2020-02-21],"[South Africa, Australia]",Australia,,field,...,4,107.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.9,2019-11-06,1,Manuka Oval,[2019-11-05],"[Australia, Pakistan]",Australia,7.0,bat,...,1436,,,,,,,,,
1437,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-23,1,College Field,[2019-06-19],"[Italy, Jersey]",Jersey,,field,...,1437,73.0,,,,,,,,
1439,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-22,1,College Field,[2019-06-20],"[Germany, Jersey]",Germany,3.0,field,...,1439,,,,,,,,,
1440,"[{'1st innings': {'team': 'Uganda', 'deliverie...",0.9,2019-08-28,1,Lugogo Cricket Oval,[2019-05-20],"[Uganda, Botswana]",Uganda,,field,...,1440,52.0,,,,,,,,


# Count the occurrences of each unique value in the 'info.match_type' column to analyze the distribution of match types in the filtered data
#In-short total T20 game played by male


In [9]:
final_df['info.match_type'].value_counts()

Unnamed: 0_level_0,count
info.match_type,Unnamed: 1_level_1
T20,966


# Count the occurrences of each unique value in the 'info.overs' column to analyze the distribution of overs in the filtered data
#In-short total overs bowled


In [10]:
final_df['info.overs'].value_counts()

Unnamed: 0_level_0,count
info.overs,Unnamed: 1_level_1
20,963
50,3


# Filter the DataFrame to include only rows where 'info.overs' is 20, then drop the 'info.overs' and 'info.match_type' columns


In [11]:
final_df = final_df[final_df['info.overs'] == 20]
final_df.drop(columns=['info.overs','info.match_type'],inplace=True)
final_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['info.overs','info.match_type'],inplace=True)


Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.venue,info.dates,info.teams,info.outcome.winner,info.outcome.by.wickets,info.toss.decision,...,match_id,info.outcome.by.runs,info.outcome.result,info.outcome.eliminator,info.outcome.method,info.bowl_out,info.outcome.bowl_out,info.neutral_venue,info.supersubs.New Zealand,info.supersubs.South Africa
0,"[{'1st innings': {'team': 'Norway', 'deliverie...",0.9,2019-06-22,1,College Field,[2019-06-20],"[Germany, Norway]",Germany,7.0,field,...,0,,,,,,,,,
1,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-20,1,King George V Sports Ground,[2019-06-16],"[Jersey, Norway]",Jersey,,bat,...,1,80.0,,,,,,,,
2,"[{'1st innings': {'team': 'Namibia', 'deliveri...",0.9,2019-09-15,1,Kyambogo Cricket Oval,[2019-05-21],"[Uganda, Namibia]",Namibia,,bat,...,2,42.0,,,,,,,,
3,"[{'1st innings': {'team': 'Guernsey', 'deliver...",0.9,2019-06-16,2,King George V Sports Ground,[2019-06-15],"[Guernsey, Germany]",Germany,5.0,field,...,3,,,,,,,,,
4,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2020-02-22,1,The Wanderers Stadium,[2020-02-21],"[South Africa, Australia]",Australia,,field,...,4,107.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,"[{'1st innings': {'team': 'Pakistan', 'deliver...",0.9,2019-11-06,1,Manuka Oval,[2019-11-05],"[Australia, Pakistan]",Australia,7.0,bat,...,1436,,,,,,,,,
1437,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-23,1,College Field,[2019-06-19],"[Italy, Jersey]",Jersey,,field,...,1437,73.0,,,,,,,,
1439,"[{'1st innings': {'team': 'Jersey', 'deliverie...",0.9,2019-06-22,1,College Field,[2019-06-20],"[Germany, Jersey]",Germany,3.0,field,...,1439,,,,,,,,,
1440,"[{'1st innings': {'team': 'Uganda', 'deliverie...",0.9,2019-08-28,1,Lugogo Cricket Oval,[2019-05-20],"[Uganda, Botswana]",Uganda,,field,...,1440,52.0,,,,,,,,


# Serialize and save the 'final_df' DataFrame to a file named 'final_df.pkl' using pickle


In [12]:
import pickle
pickle.dump(final_df,open('final_df.pkl','wb'))

# Load the DataFrame 'final_df' from 'final_df.pkl' using pickle and access the 'deliveries' data for the first innings of the first match


In [13]:
matches = pickle.load(open('final_df.pkl','rb'))
matches.iloc[0]['innings'][0]['1st innings']['deliveries']

[{0.1: {'bowler': 'CAJ Meschede',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'non_striker': 'P Agnihotri',
   'batsman': 'Raza Iqbal'}},
 {0.2: {'bowler': 'CAJ Meschede',
   'runs': {'extras': 0, 'total': 1, 'batsman': 1},
   'non_striker': 'P Agnihotri',
   'batsman': 'Raza Iqbal'}},
 {0.3: {'non_striker': 'Raza Iqbal',
   'bowler': 'CAJ Meschede',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'P Agnihotri'}},
 {0.4: {'non_striker': 'Raza Iqbal',
   'bowler': 'CAJ Meschede',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'P Agnihotri'}},
 {0.5: {'non_striker': 'Raza Iqbal',
   'bowler': 'CAJ Meschede',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'P Agnihotri'}},
 {0.6: {'non_striker': 'Raza Iqbal',
   'bowler': 'CAJ Meschede',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'P Agnihotri'}},
 {1.1: {'non_striker': 'P Agnihotri',
   'bowler': 'Izatullah Dawlatzai',
   'runs': {'extras': 0, 'total'

# Process each row in 'matches' to extract delivery data, skipping specific match indices, and compile it into 'delivery_df'; handles missing 'wicket' keys by appending '0' for undismissed players


In [14]:
count=1
delivery_df = pd.DataFrame()
for index , row in matches.iterrows():
  if count in [75,108,150,180,268,360,443,458,583,982,1052,1111,1226,1345]:
    count+=1
    continue
  count+=1
  ball_of_match = []
  batsman = []
  bowler = []
  runs = []
  player_of_dismis=[]
  teams = []
  batting_team = []
  match_id = []
  city = []
  venue = []
  for ball in row['innings'][0]['1st innings']['deliveries']:
    for key in ball.keys():
      match_id.append(count)
      batting_team.append(row['innings'][0]['1st innings']['team'])
      teams.append(row['info.teams'])
      ball_of_match.append(key)
      batsman.append(ball[key]['batsman'])
      bowler.append(ball[key]['bowler'])
      runs.append(ball[key]['runs']['total'])
      city.append(row['info.city'])
      venue.append(row['info.venue'])
      try:
        player_of_dismis.append(ball[key]['wicket']['player_out'])
      except:
        player_of_dismis.append('0')
  loop_df = pd.DataFrame({
    'match_id':match_id,
    'teams':teams,
    'batting_team':batting_team,
    'ball':ball_of_match,
    'batsman':batsman,
    'bowler':bowler,
    'runs':runs,
    'player_dismissed':player_of_dismis,
    'city':city,
    'venue':venue
  })

  delivery_df = pd.concat([delivery_df,loop_df],ignore_index=True)


# Display the DataFrame 'delivery_df' containing aggregated delivery-level data from multiple matches


In [15]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,2,"[Germany, Norway]",Norway,0.1,Raza Iqbal,CAJ Meschede,0,0,St Peter Port,College Field
1,2,"[Germany, Norway]",Norway,0.2,Raza Iqbal,CAJ Meschede,1,0,St Peter Port,College Field
2,2,"[Germany, Norway]",Norway,0.3,P Agnihotri,CAJ Meschede,0,0,St Peter Port,College Field
3,2,"[Germany, Norway]",Norway,0.4,P Agnihotri,CAJ Meschede,0,0,St Peter Port,College Field
4,2,"[Germany, Norway]",Norway,0.5,P Agnihotri,CAJ Meschede,0,0,St Peter Port,College Field
...,...,...,...,...,...,...,...,...,...,...
115425,964,"[South Africa, England]",South Africa,19.2,BE Hendricks,CJ Jordan,0,BE Hendricks,East London,Buffalo Park
115426,964,"[South Africa, England]",South Africa,19.3,DW Steyn,CJ Jordan,4,0,East London,Buffalo Park
115427,964,"[South Africa, England]",South Africa,19.4,DW Steyn,CJ Jordan,1,0,East London,Buffalo Park
115428,964,"[South Africa, England]",South Africa,19.5,AL Phehlukwayo,CJ Jordan,0,0,East London,Buffalo Park


# Function to determine the bowling team by returning the team from 'teams' list that is not the 'batting_team'


In [16]:
def bowl(row):
  for team in row['teams']:
    if team != row['batting_team']:
      return team


# Apply the 'bowl' function to each row of 'delivery_df' to determine the bowling team and store the result in a new column 'bowling_team'


In [17]:
delivery_df['bowling_team'] = delivery_df.apply(bowl,axis=1)

# Drop the 'teams' column from 'delivery_df' as it is no longer needed after determining the bowling team


In [18]:
delivery_df.drop(columns=['teams'],inplace=True)




# Get an array of unique values in the 'batting_team' column to see which teams are listed as batting teams



In [19]:
delivery_df['batting_team'].unique()

array(['Norway', 'Jersey', 'Namibia', 'Guernsey', 'Australia',
       'South Africa', 'Ghana', 'Kenya', 'Italy', 'Denmark', 'Nigeria',
       'New Zealand', 'Thailand', 'England', 'India', 'Malaysia',
       'Netherlands', 'West Indies', 'Bangladesh', 'Maldives', 'Nepal',
       'Kuwait', 'Afghanistan', 'Singapore', 'Sri Lanka', 'Bermuda',
       'Zimbabwe', 'Cayman Islands', 'Canada', 'United States of America',
       'Botswana', 'Ireland', 'Hong Kong', 'Scotland',
       'United Arab Emirates', 'Papua New Guinea', 'Oman', 'Pakistan',
       'Gibraltar', 'Vanuatu', 'Spain', 'Portugal', 'Bhutan', 'Germany',
       'Qatar', 'Iran', 'Belgium', 'Isle of Man', 'Romania', 'Bulgaria',
       'Philippines', 'Uganda'], dtype=object)

# List of cricket teams included in the dataset


In [20]:
teams =[
    'Australia',
    'India',
    'Bangladesh',
    'New Zealand',
    'South Africa',
    'England',
    'West Indies',
    'Afghanistan',
    'Pakistan',
    'Sri Lanka'
]

# Filter 'delivery_df' to include only rows where both 'batting_team' and 'bowling_team' are in the predefined list of teams


In [21]:
delivery_df = delivery_df[delivery_df['batting_team'].isin(teams)]
delivery_df = delivery_df[delivery_df['bowling_team'].isin(teams)]
#

# Display the filtered DataFrame 'delivery_df' containing deliveries where both batting and bowling teams are in the predefined list of teams


In [22]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
491,6,Australia,0.1,DA Warner,DW Steyn,4,0,Johannesburg,The Wanderers Stadium,South Africa
492,6,Australia,0.2,DA Warner,DW Steyn,0,DA Warner,Johannesburg,The Wanderers Stadium,South Africa
493,6,Australia,0.3,AJ Finch,DW Steyn,4,0,Johannesburg,The Wanderers Stadium,South Africa
494,6,Australia,0.4,AJ Finch,DW Steyn,1,0,Johannesburg,The Wanderers Stadium,South Africa
495,6,Australia,0.5,SPD Smith,DW Steyn,0,0,Johannesburg,The Wanderers Stadium,South Africa
...,...,...,...,...,...,...,...,...,...,...
115425,964,South Africa,19.2,BE Hendricks,CJ Jordan,0,BE Hendricks,East London,Buffalo Park,England
115426,964,South Africa,19.3,DW Steyn,CJ Jordan,4,0,East London,Buffalo Park,England
115427,964,South Africa,19.4,DW Steyn,CJ Jordan,1,0,East London,Buffalo Park,England
115428,964,South Africa,19.5,AL Phehlukwayo,CJ Jordan,0,0,East London,Buffalo Park,England


# Create a new DataFrame 'op' with selected columns from 'delivery_df' to include relevant details for further analysis


In [23]:
op = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

# Display the 'op' DataFrame containing relevant columns for each delivery, including match details and performance metrics


In [24]:
op

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
491,6,Australia,South Africa,0.1,4,0,Johannesburg,The Wanderers Stadium
492,6,Australia,South Africa,0.2,0,DA Warner,Johannesburg,The Wanderers Stadium
493,6,Australia,South Africa,0.3,4,0,Johannesburg,The Wanderers Stadium
494,6,Australia,South Africa,0.4,1,0,Johannesburg,The Wanderers Stadium
495,6,Australia,South Africa,0.5,0,0,Johannesburg,The Wanderers Stadium
...,...,...,...,...,...,...,...,...
115425,964,South Africa,England,19.2,0,BE Hendricks,East London,Buffalo Park
115426,964,South Africa,England,19.3,4,0,East London,Buffalo Park
115427,964,South Africa,England,19.4,1,0,East London,Buffalo Park
115428,964,South Africa,England,19.5,0,0,East London,Buffalo Park


# Serialize and save the 'op' DataFrame to a file named 'delivery_df.pkl' using pickle for later use


In [25]:
pickle.dump(op,open('delivery_df.pkl','wb'))

# Display the 'op' DataFrame containing selected columns with delivery details and match information


In [26]:
op

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
491,6,Australia,South Africa,0.1,4,0,Johannesburg,The Wanderers Stadium
492,6,Australia,South Africa,0.2,0,DA Warner,Johannesburg,The Wanderers Stadium
493,6,Australia,South Africa,0.3,4,0,Johannesburg,The Wanderers Stadium
494,6,Australia,South Africa,0.4,1,0,Johannesburg,The Wanderers Stadium
495,6,Australia,South Africa,0.5,0,0,Johannesburg,The Wanderers Stadium
...,...,...,...,...,...,...,...,...
115425,964,South Africa,England,19.2,0,BE Hendricks,East London,Buffalo Park
115426,964,South Africa,England,19.3,4,0,East London,Buffalo Park
115427,964,South Africa,England,19.4,1,0,East London,Buffalo Park
115428,964,South Africa,England,19.5,0,0,East London,Buffalo Park


# Save the 'op' DataFrame to a CSV file named 'delivery_df.csv', excluding the index column


In [27]:
op.to_csv('delivery_df.csv', index=False)


# Display the 'op' DataFrame containing delivery-level data with match details, batting and bowling teams, and performance metrics


In [28]:
op

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
491,6,Australia,South Africa,0.1,4,0,Johannesburg,The Wanderers Stadium
492,6,Australia,South Africa,0.2,0,DA Warner,Johannesburg,The Wanderers Stadium
493,6,Australia,South Africa,0.3,4,0,Johannesburg,The Wanderers Stadium
494,6,Australia,South Africa,0.4,1,0,Johannesburg,The Wanderers Stadium
495,6,Australia,South Africa,0.5,0,0,Johannesburg,The Wanderers Stadium
...,...,...,...,...,...,...,...,...
115425,964,South Africa,England,19.2,0,BE Hendricks,East London,Buffalo Park
115426,964,South Africa,England,19.3,4,0,East London,Buffalo Park
115427,964,South Africa,England,19.4,1,0,East London,Buffalo Park
115428,964,South Africa,England,19.5,0,0,East London,Buffalo Park
