In [4]:
# Day 1: Sanity Check for T20 JSON Files
# This notebook runs safely from the "notebooks" folder
# Loads ONE JSON from each folder to check keys and metadata
# No freezing, no StopIteration

import os
import json
from pathlib import Path

# ------------------------------
# 1. Set working directory to project root
# ------------------------------

# The notebook is inside 'notebooks', so project root is one level up
project_root = Path(os.getcwd()).parent
os.chdir(project_root)
print("Working directory set to project root:", os.getcwd())

# ------------------------------
# 2. Define paths to raw data folders
# ------------------------------
t20i_path = Path("data/raw/t20_all_matches")
wc_path   = Path("data/raw/t20_worldcup_matches")

# ------------------------------
# 3. Check if folders exist
# ------------------------------
for folder in [t20i_path, wc_path]:
    if folder.exists() and folder.is_dir():
        print(f"✅ Folder exists: {folder}")
    else:
        print(f"❌ Folder NOT found: {folder}")
        print("Make sure ZIP files are extracted correctly.\n")

# ------------------------------
# 4. List files in each folder
# ------------------------------
print("\nListing contents of folders:")
print("T20I folder contents:", [f.name for f in t20i_path.iterdir()] if t20i_path.exists() else [])
print("WC folder contents  :", [f.name for f in wc_path.iterdir()] if wc_path.exists() else [])

# ------------------------------
# 5. Pick ONE JSON file safely from each folder
# ------------------------------
t20i_file = next(t20i_path.glob("*.json"), None)
wc_file   = next(wc_path.glob("*.json"), None)

if not t20i_file:
    print("\n❌ No JSON files found in T20I folder.")
if not wc_file:
    print("\n❌ No JSON files found in WC folder.")

# ------------------------------
# 6. Load and inspect JSON files
# ------------------------------
if t20i_file and wc_file:
    print("\nFiles selected for sanity check:")
    print("T20I:", t20i_file)
    print("WC  :", wc_file)

    # Load T20I match
    with open(t20i_file, "r", encoding="utf-8") as f:
        t20i_match = json.load(f)

    # Load WC match
    with open(wc_file, "r", encoding="utf-8") as f:
        wc_match = json.load(f)

    # ------------------------------
    # 7. Print top-level keys
    # ------------------------------
    print("\nT20I JSON top-level keys:", t20i_match.keys())
    print("WC JSON top-level keys  :", wc_match.keys())

    # ------------------------------
    # 8. Peek at match info (metadata only)
    # ------------------------------
    print("\nSample T20I match info keys:", t20i_match['info'].keys())
    print("Teams:", t20i_match['info']['teams'])
    print("Venue:", t20i_match['info'].get('venue', 'Unknown'))
    print("Date :", t20i_match['info'].get('dates', 'Unknown'))

    print("\nSample WC match info keys:", wc_match['info'].keys())
    print("Teams:", wc_match['info']['teams'])
    print("Venue:", wc_match['info'].get('venue', 'Unknown'))
    print("Date :", wc_match['info'].get('dates', 'Unknown'))

else:
    print("\n❌ Sanity check cannot run because one or both folders have no JSON files.")


Working directory set to project root: c:\Users\raees\Documents\t20-worldcup-insight-engine\t20-worldcup-insight-engine
✅ Folder exists: data\raw\t20_all_matches
✅ Folder exists: data\raw\t20_worldcup_matches

Listing contents of folders:
T20I folder contents: ['1001349.json', '1001351.json', '1001353.json', '1004729.json', '1007655.json', '1007657.json', '1007659.json', '1019979.json', '1019981.json', '1019983.json', '1020029.json', '1031431.json', '1031433.json', '1031435.json', '1031665.json', '1034825.json', '1034827.json', '1034829.json', '1041615.json', '1041617.json', '1050217.json', '1050219.json', '1050221.json', '1072316.json', '1072317.json', '1072318.json', '1072319.json', '1072320.json', '1072321.json', '1072322.json', '1074957.json', '1074959.json', '1074961.json', '1074964.json', '1074965.json', '1074966.json', '1074968.json', '1074970.json', '1075507.json', '1075508.json', '1077947.json', '1077948.json', '1083449.json', '1083450.json', '1085495.json', '1085496.json', '1