### Match-Level Extraction from the Cricsheet JSON Files

In [1]:
import os
import json
import pandas as pd
from pathlib import Path

In [2]:
# Make the project root the working directory so the relative "data/..."
# paths used by the following cells resolve.
# Only step up one level when the current directory does not already contain
# a "data" folder: moving to the parent unconditionally would climb one
# directory higher each time this cell is re-run (or when the notebook is
# launched from the project root), silently breaking every relative path.
# NOTE(review): assumes the project root is identified by its "data" folder — confirm.
project_root = Path.cwd()
if not (project_root / "data").is_dir():
    project_root = project_root.parent
os.chdir(project_root)
print("Working directory:", os.getcwd())

Working directory: c:\Users\raees\Documents\t20-worldcup-insight-engine\t20-worldcup-insight-engine


In [3]:
# Folders holding the raw match JSON files, relative to the working directory
t20i_path, wc_path = (
    Path("data/raw/t20_all_matches"),
    Path("data/raw/t20_worldcup_matches"),
)

# Report whether each input folder is present
for label, folder in (
    ("T20I folder exists:", t20i_path),
    ("WC folder exists  :", wc_path),
):
    print(label, folder.exists())

T20I folder exists: True
WC folder exists  : True


In [4]:
def extract_match_info(json_path, is_world_cup=False):
    """
    Extract match-level metadata from a Cricsheet JSON file.

    Parameters
    ----------
    json_path : str or os.PathLike
        Location of the match file. The file name without its extension
        becomes ``match_id``.
    is_world_cup : bool, default False
        Flag copied unchanged into the returned row.

    Returns
    -------
    dict
        One row with the keys ``match_id``, ``date``, ``team_1``, ``team_2``,
        ``venue``, ``winner``, ``competition`` and ``is_world_cup``. Any field
        that is absent, null or empty in the file is returned as ``None``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Accept plain strings as well as Path objects; ``.stem`` is needed below.
    json_path = Path(json_path)

    with open(json_path, "r", encoding="utf-8") as f:
        match = json.load(f)

    # ``or`` fallbacks cover both a missing key and an explicit null / empty
    # value, so none of the lookups below can fail on ``None`` or index into
    # an empty list.
    info = match.get("info") or {}
    teams = info.get("teams") or []
    dates = info.get("dates") or []
    outcome = info.get("outcome") or {}
    event = info.get("event") or {}

    return {
        "match_id": json_path.stem,
        # First listed date; None when the list is missing or empty.
        "date": dates[0] if dates else None,
        "team_1": teams[0] if len(teams) > 0 else None,
        "team_2": teams[1] if len(teams) > 1 else None,
        "venue": info.get("venue", None),
        # None when the outcome carries no "winner" entry.
        "winner": outcome.get("winner", None),
        "competition": event.get("name", None),
        "is_world_cup": is_world_cup,
    }


In [5]:
rows = []

# One pass per source folder; the boolean is stored as ``is_world_cup`` on
# every row extracted from that folder.
# NOTE(review): a match file present in BOTH folders would be appended twice
# (once per flag) — confirm the folders are disjoint or de-duplicate on match_id.
sources = (
    (t20i_path, False),  # T20 Internationals
    (wc_path, True),     # T20 World Cup
)

for folder, world_cup_flag in sources:
    # glob() does not guarantee any ordering (it depends on the file system);
    # sorting keeps the row order reproducible across runs and machines.
    for file in sorted(folder.glob("*.json")):
        try:
            rows.append(extract_match_info(file, is_world_cup=world_cup_flag))
        except Exception as e:
            # Best effort: report the file that failed and carry on with the rest.
            print(f"Error processing {file.name}: {e}")

print(f"Total matches extracted: {len(rows)}")


Total matches extracted: 3299


In [6]:
# Assemble the extracted rows into a DataFrame and preview the first five
matches_df = pd.DataFrame(data=rows)
matches_df.head(n=5)


Unnamed: 0,match_id,date,team_1,team_2,venue,winner,competition,is_world_cup
0,1001349,2017-02-17,Australia,Sri Lanka,Melbourne Cricket Ground,Sri Lanka,Sri Lanka in Australia T20I Series,False
1,1001351,2017-02-19,Australia,Sri Lanka,"Simonds Stadium, South Geelong",Sri Lanka,Sri Lanka in Australia T20I Series,False
2,1001353,2017-02-22,Australia,Sri Lanka,Adelaide Oval,Australia,Sri Lanka in Australia T20I Series,False
3,1004729,2016-09-05,Ireland,Hong Kong,"Bready Cricket Club, Magheramason",Hong Kong,Hong Kong in Ireland T20I Series,False
4,1007655,2016-06-18,Zimbabwe,India,Harare Sports Club,Zimbabwe,India in Zimbabwe T20I Series,False


In [7]:
# Quick sanity checks on the assembled frame
print("Shape:", matches_df.shape)

print("\nMissing values:")
print(matches_df.isna().sum())

# Distinct values appearing in either team column
all_team_names = matches_df[["team_1", "team_2"]].to_numpy().ravel()
print("\nUnique teams:", len(pd.unique(all_team_names)))

# Summing the boolean flag counts the rows marked as World Cup
world_cup_total = matches_df["is_world_cup"].sum()
print("\nWorld Cup matches:", world_cup_total)


Shape: (3299, 8)

Missing values:
match_id          0
date              0
team_1            0
team_2            0
venue             0
winner          114
competition      62
is_world_cup      0
dtype: int64

Unique teams: 107

World Cup matches: 181


In [8]:
# Keep only the rows that have a recorded winner.
# NOTE(review): a null winner may cover ties as well as no-results — confirm
# that dropping both is intended.
has_winner = matches_df["winner"].notna()
matches_df = matches_df[has_winner]
print("After removing no-result matches:", matches_df.shape)


After removing no-result matches: (3185, 8)


In [9]:
# Persist the match-level table as CSV
output_path = Path("data/processed/match_level.csv")
os.makedirs(output_path.parent, exist_ok=True)  # create the target folder if absent

# index=False: the DataFrame's row index is not written to the file
matches_df.to_csv(path_or_buf=output_path, index=False)
print(f"Saved match-level data to: {output_path}")


Saved match-level data to: data\processed\match_level.csv
