In [None]:
# ============================================
# WPL MATCH PREDICTION - DAY 1
# Data Exploration and Parsing
# ============================================

import json
import pandas as pd
import numpy as np
from datetime import datetime
import os
from google.colab import files

# Notebook banner: title line followed by a 50-character rule.
banner_title = "üèè WPL Match Prediction System - Day 1"
print(banner_title, "=" * 50, sep="\n")

üèè WPL Match Prediction System - Day 1


In [None]:
# Upload the WPL JSON zip file
import zipfile

# Folder the archive is unpacked into (single source of truth for this cell).
EXTRACT_DIR = 'wpl_data'

print("üìÅ Please upload your wpl_json.zip file...")
uploaded = files.upload()

# `uploaded` is used as a mapping keyed by file name. Presumably it is empty
# when the upload dialog is dismissed (TODO confirm against Colab docs); fail
# with a readable message instead of an IndexError on the lookup below.
if not uploaded:
    raise RuntimeError("No file was uploaded - re-run this cell and select wpl_json.zip")

# Extract the zip file (only the first uploaded file is used)
zip_filename = next(iter(uploaded))
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(EXTRACT_DIR)

print(f"‚úÖ Extracted files to '{EXTRACT_DIR}' folder")
# Counts every entry in the folder, not only .json match files.
print(f"üìä Total files: {len(os.listdir(EXTRACT_DIR))}")

üìÅ Please upload your wpl_json.zip file...


Saving wpl_json.zip to wpl_json.zip
‚úÖ Extracted files to 'wpl_data' folder
üìä Total files: 67


In [None]:
def _innings_totals(innings):
    """Return (runs, wickets) summed over every delivery of one innings dict."""
    runs = 0
    wickets = 0
    for over in innings.get('overs', []):
        for delivery in over.get('deliveries', []):
            runs += delivery['runs']['total']
            # 'wickets' is only present on deliveries where a wicket fell.
            wickets += len(delivery.get('wickets', []))
    return runs, wickets


def parse_match_basic(file_path):
    """
    Parse a single WPL match JSON file
    Extract basic match information

    Args:
        file_path: Path to one match JSON file. The file must contain an
            'info' object with 'dates', 'venue', 'teams' and 'toss'; 'city',
            'event', 'season', 'outcome', 'player_of_match' and the top-level
            'innings' list are optional.

    Returns:
        dict with the keys date, venue, city, team1, team2, toss_winner,
        toss_decision, match_number, season, winner, result, win_margin,
        win_type, player_of_match, plus innings{1,2}_team / _runs / _wickets
        for each of the first two innings that exists in the file.
        win_margin and win_type are always present (None when the file gives
        no 'by' margin or no winner).

    Raises:
        KeyError: if one of the required 'info' fields is missing.
        OSError / json.JSONDecodeError: if the file cannot be read or parsed.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    info = data['info']

    # Basic match info
    match_dict = {
        'date': info['dates'][0],
        'venue': info['venue'],
        'city': info.get('city', 'Unknown'),
        'team1': info['teams'][0],
        'team2': info['teams'][1],
        'toss_winner': info['toss']['winner'],
        'toss_decision': info['toss']['decision'],
        # Tolerate files with no 'event' block at all, not just no match_number.
        'match_number': info.get('event', {}).get('match_number', 0),
        'season': info.get('season', 'Unknown')
    }

    # Outcome
    outcome = info.get('outcome', {})
    win_margin = None
    win_type = None
    if 'winner' in outcome:
        match_dict['winner'] = outcome['winner']
        match_dict['result'] = 'completed'

        # Win margin: runs takes precedence over wickets, as before.
        margin_by = outcome.get('by', {})
        for margin_kind in ('runs', 'wickets'):
            if margin_kind in margin_by:
                win_margin = margin_by[margin_kind]
                win_type = margin_kind
                break
    else:
        match_dict['winner'] = None
        match_dict['result'] = outcome.get('result', 'unknown')

    # Always emit both keys so every parsed match has the same schema.
    match_dict['win_margin'] = win_margin
    match_dict['win_type'] = win_type

    # Player of the match (first listed; None when absent or an empty list)
    match_dict['player_of_match'] = (info.get('player_of_match') or [None])[0]

    # Calculate innings scores for the first two innings only; any further
    # innings entries in the file are ignored.
    innings = data.get('innings', [])
    for position, innings_data in enumerate(innings[:2], start=1):
        runs, wickets = _innings_totals(innings_data)
        match_dict[f'innings{position}_team'] = innings_data['team']
        match_dict[f'innings{position}_runs'] = runs
        match_dict[f'innings{position}_wickets'] = wickets

    return match_dict

# Smoke-test the parser on one known match file before processing the folder
print("\nüß™ Testing parser with sample match...")
sample_match = parse_match_basic('wpl_data/1358929.json')

# Dump every parsed field as an aligned "name: value" line
print("\nüìã Parsed Match Information:")
print("-" * 50)
for field_name in sample_match:
    print(f"{field_name:20s}: {sample_match[field_name]}")


üß™ Testing parser with sample match...

üìã Parsed Match Information:
--------------------------------------------------
date                : 2023-03-04
venue               : Dr DY Patil Sports Academy, Mumbai
city                : Navi Mumbai
team1               : Mumbai Indians
team2               : Gujarat Giants
toss_winner         : Gujarat Giants
toss_decision       : field
match_number        : 1
season              : 2022/23
winner              : Mumbai Indians
result              : completed
win_margin          : 143
win_type            : runs
player_of_match     : H Kaur
innings1_team       : Mumbai Indians
innings1_runs       : 207
innings1_wickets    : 5
innings2_team       : Gujarat Giants
innings2_runs       : 64
innings2_wickets    : 10


In [None]:
def create_complete_dataset(folder_path):
    """
    Parse all WPL JSON files and create a complete dataset

    Args:
        folder_path: Directory whose '*.json' entries are each passed to
            parse_match_basic.

    Returns:
        pandas.DataFrame with one row per successfully parsed match, a
        'match_id' column (file name without its extension), 'date' converted
        to datetime, sorted by date with a fresh 0..n-1 index. An empty
        DataFrame is returned when no file could be parsed.

    Files that raise during parsing are reported on stdout and skipped.
    """
    # Sorted so processing order - and, with the stable sort below, the order
    # of same-day matches - does not depend on the filesystem's listing order.
    json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

    print(f"\nüìä Processing {len(json_files)} matches...")

    all_matches = []
    for i, filename in enumerate(json_files, 1):
        try:
            match_data = parse_match_basic(os.path.join(folder_path, filename))
            # Strip only the trailing extension, not every '.json' substring.
            match_data['match_id'] = os.path.splitext(filename)[0]
            all_matches.append(match_data)

            if i % 10 == 0:
                print(f"  ‚úì Processed {i}/{len(json_files)} matches")
        except Exception as e:
            # Best-effort: one malformed file must not abort the whole load.
            print(f"  ‚úó Error processing {filename}: {e}")

    # Create DataFrame
    df = pd.DataFrame(all_matches)

    # Nothing parsed: there is no 'date' column to convert or sort on.
    if df.empty:
        return df

    # Sort by date
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date', kind='stable').reset_index(drop=True)

    return df

# Build the dataset from every match file in the extraction folder
df = create_complete_dataset('wpl_data')

print(f"\n‚úÖ Dataset created successfully!")
print(f"üìä Total matches: {len(df)}")

# Earliest and latest match days, shown as plain dates
match_dates = df['date']
print(f"üìÖ Date range: {match_dates.min().date()} to {match_dates.max().date()}")


üìä Processing 66 matches...
  ‚úì Processed 10/66 matches
  ‚úì Processed 20/66 matches
  ‚úì Processed 30/66 matches
  ‚úì Processed 40/66 matches
  ‚úì Processed 50/66 matches
  ‚úì Processed 60/66 matches

‚úÖ Dataset created successfully!
üìä Total matches: 66
üìÖ Date range: 2023-03-04 to 2025-03-15


In [None]:
# Section banner
section_rule = "=" * 50
print("\n" + section_rule)
print("üìä DATASET OVERVIEW")
print(section_rule)

# Columns shown in the head/tail previews
preview_columns = ['date', 'team1', 'team2', 'winner', 'venue']

# Earliest matches
print("\nüîç First 5 matches:")
print(df[preview_columns].head())

# Latest matches
print("\nüîç Last 5 matches (Most Recent):")
print(df[preview_columns].tail())

# Basic statistics
print("\nüìà BASIC STATISTICS:")
print("-" * 50)

# Venues: distinct count, then matches per venue
print(f"\nüèüÔ∏è  Unique Venues: {df['venue'].nunique()}")
print(df['venue'].value_counts())

# Teams: every name that appears in either team column, alphabetically
print(f"\nüèè Teams:")
all_teams = pd.concat([df['team1'], df['team2']]).unique()
for team in sorted(all_teams):
    print(f"  ‚Ä¢ {team}")

print(f"\nüèÜ Wins by Team:")
print(df['winner'].value_counts())

print(f"\nüé≤ Toss Decision:")
print(df['toss_decision'].value_counts())

# Mean total per innings; the label padding keeps the two values aligned
print(f"\nüìä Average Scores:")
for innings_label, runs_column in (("First Innings: ", 'innings1_runs'),
                                   ("Second Innings:", 'innings2_runs')):
    print(f"  {innings_label} {df[runs_column].mean():.1f} runs")


üìä DATASET OVERVIEW

üîç First 5 matches:
        date                        team1                        team2  \
0 2023-03-04               Mumbai Indians               Gujarat Giants   
1 2023-03-05               Delhi Capitals  Royal Challengers Bangalore   
2 2023-03-05               Gujarat Giants                  UP Warriorz   
3 2023-03-06  Royal Challengers Bangalore               Mumbai Indians   
4 2023-03-07               Delhi Capitals                  UP Warriorz   

           winner                               venue  
0  Mumbai Indians  Dr DY Patil Sports Academy, Mumbai  
1  Delhi Capitals           Brabourne Stadium, Mumbai  
2     UP Warriorz  Dr DY Patil Sports Academy, Mumbai  
3  Mumbai Indians           Brabourne Stadium, Mumbai  
4  Delhi Capitals  Dr DY Patil Sports Academy, Mumbai  

üîç Last 5 matches (Most Recent):
         date                        team1                        team2  \
61 2025-03-08                  UP Warriorz  Royal Challengers 

In [None]:
print("\n" + "="*50)
print("üéØ TOSS IMPACT ANALYSIS")
print("="*50)

# Did toss winner also win the match?
# Stored as 0/1 on every row; rows without a winner get 0 because the
# comparison against a missing winner is False.
df['toss_winner_won'] = (df['toss_winner'] == df['winner']).astype(int)

# A match with no recorded winner (parse_match_basic stores winner=None) is
# not a loss for the toss winner, so it must not sit in the denominator of
# any win rate below.
decided = df[df['winner'].notna()]
undecided_count = len(df) - len(decided)

toss_win_rate = decided['toss_winner_won'].mean() * 100
print(f"\nüé≤ Toss winner also won match: {toss_win_rate:.1f}% of the time")
if undecided_count:
    print(f"   ({undecided_count} match(es) without a winner excluded)")

# Toss decision impact (decided matches only; Count is the number of decided
# matches for each decision)
print("\nüìä Toss Decision Impact:")
toss_analysis = decided.groupby('toss_decision').agg({
    'toss_winner_won': 'mean',
    'match_id': 'count'
}).round(3)
toss_analysis.columns = ['Win Rate', 'Count']
toss_analysis['Win Rate'] = (toss_analysis['Win Rate'] * 100).round(1)
print(toss_analysis)


üéØ TOSS IMPACT ANALYSIS

üé≤ Toss winner also won match: 53.0% of the time

üìä Toss Decision Impact:
               Win Rate  Count
toss_decision                 
bat                36.8     19
field              59.6     47


In [None]:
print("\n" + "="*50)
print("üèüÔ∏è  VENUE ANALYSIS")
print("="*50)

# Per venue: how often the side batting first won versus the chasing side
venue_stats = []

for venue in df['venue'].unique():
    venue_matches = df[df['venue'] == venue]

    # Walk the three relevant columns together instead of materialising rows
    results = list(zip(venue_matches['winner'],
                       venue_matches['innings1_team'],
                       venue_matches['innings2_team']))

    bat_first_won = sum(1 for winner, first_team, _ in results
                        if winner == first_team)
    # A chase win is only counted when the match was not already a bat-first win
    chase_won = sum(1 for winner, first_team, second_team in results
                    if not (winner == first_team) and winner == second_team)

    total = bat_first_won + chase_won
    # Venues with no decided match are left out of the table
    if total == 0:
        continue

    venue_stats.append({
        'venue': venue,
        'total_matches': len(venue_matches),
        'bat_first_won': bat_first_won,
        'chase_won': chase_won,
        'bat_first_win_pct': bat_first_won / total * 100,
        # Mean first-innings total at this venue
        'avg_score': venue_matches['innings1_runs'].mean()
    })

venue_df = pd.DataFrame(venue_stats).sort_values('total_matches', ascending=False)
print("\nüìç Venue Statistics:")
print(venue_df.to_string(index=False))


üèüÔ∏è  VENUE ANALYSIS

üìç Venue Statistics:
                                                                venue  total_matches  bat_first_won  chase_won  bat_first_win_pct  avg_score
                                     M Chinnaswamy Stadium, Bengaluru             19              5         13          27.777778 150.368421
                                            Brabourne Stadium, Mumbai             15              8          7          53.333333 170.666667
                                   Dr DY Patil Sports Academy, Mumbai             11              3          8          27.272727 147.727273
                                          Arun Jaitley Stadium, Delhi             11              7          4          63.636364 154.454545
                                            Kotambi Stadium, Vadodara              6              0          6           0.000000 155.833333
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow              4              2      