#### Import Libraries

In [133]:
import os
import pandas as pd
import zipfile
import json
from datetime import datetime
from tqdm import tqdm
import warnings
# NOTE(review): with no category/module argument this suppresses every warning
# for the rest of the session, including pandas warnings about chained
# assignment further down — consider narrowing the filter.
warnings.filterwarnings('ignore')

#### Extract the zipped data into a folder

In [40]:
dir_path = 't20_json_files'

# Create the storage directory. exist_ok=True makes this a no-op when the
# folder is already there, so no separate existence check is needed and the
# cell can be re-run safely.
os.makedirs(dir_path, exist_ok=True)

# Extract every member of the archive into the storage directory
with zipfile.ZipFile('icc_mens_t20_world_cup_male_json.zip', 'r') as zip_ref:
    zip_ref.extractall(dir_path)

#### Load the JSON data into a DataFrame

In [42]:
# Parse every match file into a one-row frame and collect the frames in a list.
# Concatenating once at the end avoids re-copying the accumulated frame on
# every iteration, which is what calling pd.concat inside the loop does.
match_frames = []

# sorted() fixes the row order; os.listdir returns entries in arbitrary order
for file in tqdm(sorted(os.listdir(dir_path))):
    if not file.endswith('.json'):
        continue  # skip anything in the folder that is not a JSON file
    file_path = os.path.join(dir_path, file)
    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    df = pd.json_normalize(json_data)
    match_id = os.path.splitext(file)[0]  # match_id = json filename without the extension
    df.insert(loc=0, column='match_id', value=match_id)  # assign match_id as the first column
    match_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contained no JSON files
final_df = pd.concat(match_frames, ignore_index=True) if match_frames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████| 155/155 [00:13<00:00, 11.67it/s]


In [43]:
# Preview the first five loaded rows as a sanity check
final_df.head(n=5)

Unnamed: 0,match_id,innings,meta.data_version,meta.created,meta.revision,info.balls_per_over,info.city,info.dates,info.event.name,info.event.match_number,...,info.registry.people.Shuvagata Hom,info.registry.people.Khalid Latif,info.registry.people.JM Vince,info.registry.people.LE Plunkett,info.registry.people.AM Phangiso,info.registry.people.HM Nicholls,info.registry.people.RSA Palliyaguruge,info.registry.people.VA Kulkarni,info.registry.people.RAS Lakmal,info.registry.people.MK Pandey
0,1273712,"[{'team': 'Papua New Guinea', 'overs': [{'over...",1.1.0,2021-10-18,2,6,Al Amarat,[2021-10-17],ICC Men's T20 World Cup,1.0,...,,,,,,,,,,
1,1273713,"[{'team': 'Scotland', 'overs': [{'over': 0, 'd...",1.1.0,2021-10-18,1,6,Al Amarat,[2021-10-17],ICC Men's T20 World Cup,2.0,...,,,,,,,,,,
2,1273714,"[{'team': 'Netherlands', 'overs': [{'over': 0,...",1.1.0,2021-10-18,2,6,Abu Dhabi,[2021-10-18],ICC Men's T20 World Cup,3.0,...,,,,,,,,,,
3,1273715,"[{'team': 'Namibia', 'overs': [{'over': 0, 'de...",1.1.0,2021-10-18,2,6,Abu Dhabi,[2021-10-18],ICC Men's T20 World Cup,4.0,...,,,,,,,,,,
4,1273716,"[{'team': 'Scotland', 'overs': [{'over': 0, 'd...",1.1.0,2021-10-19,2,6,Al Amarat,[2021-10-19],ICC Men's T20 World Cup,5.0,...,,,,,,,,,,


#### Data Cleaning

##### Keep only relevant/important columns

In [124]:
# Match-level columns to keep, written one name per line
imp_columns = '''
match_id
info.dates
info.city
info.venue
info.event.name
info.event.match_number
info.teams
info.toss.decision
info.toss.winner
info.outcome.winner
info.outcome.by.runs
info.outcome.by.wickets
info.player_of_match
'''

# Turn the newline-separated text into a list of column names, skipping blank lines
imp_column_list = [name for name in imp_columns.splitlines() if name]

# Every column whose name contains 'info.players' is kept as well
player_columns = [name for name in final_df.columns if 'info.players' in name]

# Selected match columns first, followed by the 'info.players' columns
final_df1 = final_df.loc[:, imp_column_list + player_columns]
final_df1.head()

Unnamed: 0,match_id,info.dates,info.city,info.venue,info.event.name,info.event.match_number,info.teams,info.toss.decision,info.toss.winner,info.outcome.winner,...,info.players.West Indies,info.players.England,info.players.India,info.players.Pakistan,info.players.Afghanistan,info.players.New Zealand,info.players.United Arab Emirates,info.players.Zimbabwe,info.players.Hong Kong,info.players.Nepal
0,1273712,[2021-10-17],Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,1.0,"[Papua New Guinea, Oman]",field,Oman,Oman,...,,,,,,,,,,
1,1273713,[2021-10-17],Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,2.0,"[Scotland, Bangladesh]",field,Bangladesh,Scotland,...,,,,,,,,,,
2,1273714,[2021-10-18],Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",ICC Men's T20 World Cup,3.0,"[Netherlands, Ireland]",bat,Netherlands,Ireland,...,,,,,,,,,,
3,1273715,[2021-10-18],Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",ICC Men's T20 World Cup,4.0,"[Namibia, Sri Lanka]",field,Sri Lanka,Sri Lanka,...,,,,,,,,,,
4,1273716,[2021-10-19],Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,5.0,"[Scotland, Papua New Guinea]",bat,Scotland,Scotland,...,,,,,,,,,,


##### Combine the per-team 'info.players.<team_name>' columns into team_1_players and team_2_players

In [125]:
# Create columns team_1_players & team_2_players at the end of the frame.
# Plain assignment appends the column when it is new and overwrites it when it
# already exists, so this cell can be re-run; DataFrame.insert raises a
# ValueError on the second run because the column is already present.
final_df1['team_1_players'] = None
final_df1['team_2_players'] = None

In [126]:
# Club the values into newly created columns
def _squad_for_team(matches, row_pos, team_name):
    """Return the squad stored for ``team_name`` in the row at ``row_pos``.

    The squad is read from the column named exactly
    ``info.players.<team_name>``. ``None`` is returned when ``matches`` has no
    such column.
    """
    squad_column = f'info.players.{team_name}'
    if squad_column not in matches.columns:
        return None
    return matches[squad_column].iloc[row_pos]


# Collect both squads per match in plain lists, then assign each column once.
# Writing through final_df1[col][i] = ... is chained assignment: pandas does
# not guarantee it reaches the frame, and under Copy-on-Write it never does.
team_1_squads, team_2_squads = [], []
for row_pos, teams in enumerate(final_df1['info.teams']):
    team_1_squads.append(_squad_for_team(final_df1, row_pos, teams[0]))  # first team in info.teams
    team_2_squads.append(_squad_for_team(final_df1, row_pos, teams[1]))  # second team in info.teams

# dtype=object keeps each cell as one whole list; reusing the frame's index
# lines the values up with the existing rows
final_df1['team_1_players'] = pd.Series(team_1_squads, index=final_df1.index, dtype=object)
final_df1['team_2_players'] = pd.Series(team_2_squads, index=final_df1.index, dtype=object)

# drop all team name columns now that their values live in the two squad columns
final_df1 = final_df1.drop(columns=[col for col in final_df1.columns if 'info.players' in col])
final_df1.head()

Unnamed: 0,match_id,info.dates,info.city,info.venue,info.event.name,info.event.match_number,info.teams,info.toss.decision,info.toss.winner,info.outcome.winner,info.outcome.by.runs,info.outcome.by.wickets,info.player_of_match,team_1_players,team_2_players
0,1273712,[2021-10-17],Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,1.0,"[Papua New Guinea, Oman]",field,Oman,Oman,,10.0,[Zeeshan Maqsood],"[TP Ura, L Siaka, A Vala, CJA Amini, S Bau, N ...","[Aqib Ilyas, Jatinder Singh, Khawar Ali, Zeesh..."
1,1273713,[2021-10-17],Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,2.0,"[Scotland, Bangladesh]",field,Bangladesh,Scotland,6.0,,[CN Greaves],"[HG Munsey, KJ Coetzer, MH Cross, RD Berringto...","[Liton Das, Soumya Sarkar, Shakib Al Hasan, Mu..."
2,1273714,[2021-10-18],Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",ICC Men's T20 World Cup,3.0,"[Netherlands, Ireland]",bat,Netherlands,Ireland,,7.0,[C Campher],"[MP O'Dowd, BN Cooper, BFW de Leede, CN Ackerm...","[PR Stirling, KJ O'Brien, A Balbirnie, GJ Dela..."
3,1273715,[2021-10-18],Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",ICC Men's T20 World Cup,4.0,"[Namibia, Sri Lanka]",field,Sri Lanka,Sri Lanka,,7.0,[M Theekshana],"[SJ Baard, ZE Green, CG Williams, MG Erasmus, ...","[P Nissanka, MDKJ Perera, LD Chandimal, WIA Fe..."
4,1273716,[2021-10-19],Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,5.0,"[Scotland, Papua New Guinea]",bat,Scotland,Scotland,17.0,,[RD Berrington],"[HG Munsey, KJ Coetzer, MH Cross, RD Berringto...","[TP Ura, L Siaka, A Vala, CJA Amini, S Bau, SK..."


##### Sort the matches in ascending order of info.dates

In [138]:
def _first_match_date(raw_dates):
    """Return the first entry of ``raw_dates`` as a ``datetime.date``.

    A list or tuple is reduced to its first element (``None`` when empty) and
    a ``YYYY-MM-DD`` string is parsed. Any other value, such as a date left by
    an earlier run of this cell, is returned unchanged.
    """
    if isinstance(raw_dates, (list, tuple)):
        raw_dates = raw_dates[0] if raw_dates else None
    if isinstance(raw_dates, str):
        return datetime.strptime(raw_dates, '%Y-%m-%d').date()
    return raw_dates


# convert list to date; values that are already dates pass through, so
# re-running this cell is safe
final_df1['info.dates'] = final_df1['info.dates'].map(_first_match_date)
# mergesort is stable, so matches played on the same day keep their current order
final_df1 = final_df1.sort_values(by='info.dates', kind='mergesort')
final_df1

Unnamed: 0,match_id,info.dates,info.city,info.venue,info.event.name,info.event.match_number,info.teams,info.toss.decision,info.toss.winner,info.outcome.winner,info.outcome.by.runs,info.outcome.by.wickets,info.player_of_match,team_1_players,team_2_players
0,1273712,2021-10-19,Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,1.0,"[Papua New Guinea, Oman]",field,Oman,Oman,,10.0,[Zeeshan Maqsood],"[TP Ura, L Siaka, A Vala, CJA Amini, S Bau, N ...","[Aqib Ilyas, Jatinder Singh, Khawar Ali, Zeesh..."
1,1273713,2021-10-17,Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,2.0,"[Scotland, Bangladesh]",field,Bangladesh,Scotland,6.0,,[CN Greaves],"[HG Munsey, KJ Coetzer, MH Cross, RD Berringto...","[Liton Das, Soumya Sarkar, Shakib Al Hasan, Mu..."
2,1273714,2021-10-18,Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",ICC Men's T20 World Cup,3.0,"[Netherlands, Ireland]",bat,Netherlands,Ireland,,7.0,[C Campher],"[MP O'Dowd, BN Cooper, BFW de Leede, CN Ackerm...","[PR Stirling, KJ O'Brien, A Balbirnie, GJ Dela..."
3,1273715,2021-10-18,Abu Dhabi,"Zayed Cricket Stadium, Abu Dhabi",ICC Men's T20 World Cup,4.0,"[Namibia, Sri Lanka]",field,Sri Lanka,Sri Lanka,,7.0,[M Theekshana],"[SJ Baard, ZE Green, CG Williams, MG Erasmus, ...","[P Nissanka, MDKJ Perera, LD Chandimal, WIA Fe..."
4,1273716,2021-10-19,Al Amarat,Al Amerat Cricket Ground Oman Cricket (Ministr...,ICC Men's T20 World Cup,5.0,"[Scotland, Papua New Guinea]",bat,Scotland,Scotland,17.0,,[RD Berrington],"[HG Munsey, KJ Coetzer, MH Cross, RD Berringto...","[TP Ura, L Siaka, A Vala, CJA Amini, S Bau, SK..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,951365,2016-03-27,Nagpur,"Vidarbha Cricket Association Stadium, Jamtha",World T20,30.0,"[Afghanistan, West Indies]",field,West Indies,Afghanistan,6.0,,[Najibullah Zadran],"[Mohammad Shahzad, Usman Ghani, Asghar Stanikz...","[J Charles, E Lewis, ADS Fletcher, MN Samuels,..."
150,951367,2016-03-28,Delhi,Feroz Shah Kotla,World T20,32.0,"[South Africa, Sri Lanka]",field,South Africa,South Africa,,8.0,[AM Phangiso],"[HM Amla, Q de Kock, F du Plessis, AB de Villi...","[LD Chandimal, TM Dilshan, HDRL Thirimanne, TA..."
151,951369,2016-03-30,Delhi,Feroz Shah Kotla,World T20,,"[England, New Zealand]",field,England,England,,7.0,[JJ Roy],"[JJ Roy, AD Hales, JE Root, EJG Morgan, JC But...","[MJ Guptill, KS Williamson, C Munro, CJ Anders..."
152,951371,2016-03-31,Mumbai,Wankhede Stadium,World T20,,"[India, West Indies]",field,West Indies,West Indies,,7.0,[LMP Simmons],"[RG Sharma, AM Rahane, V Kohli, MS Dhoni, SK R...","[J Charles, CH Gayle, MN Samuels, LMP Simmons,..."


In [140]:
# Write the cleaned match table to disk, leaving out the row index
final_df1.to_csv(path_or_buf='sample_output.csv', index=False)