<a href="https://colab.research.google.com/github/SHAIK-MOHAMMAD-IRFAN27/DLS_USING_IRL/blob/main/T20_DATA_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import json
import pandas as pd
import os
from glob import glob

In [10]:
# Colab-only: attach Google Drive at /content/drive so the Drive-based paths
# configured in the next cell resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# --- Path configuration -----------------------------------------------------
# File name for CSV output; it has no directory component, so it resolves
# against the current working directory.
output_csv = "cricket_matches.csv"

# Locations under the Google Drive mount point (/content/drive).
# NOTE(review): presumably these hold the per-match JSON files; neither is
# referenced in the cells shown here - confirm they are still needed.
json_folder = "/content/drive/MyDrive/IPL"
new = "/content/drive/MyDrive/IPL/newIPL/IPL"

In [12]:
# Column names for the per-delivery table, one per line for easier diffing.
# NOTE(review): this list is not referenced in the cells shown here, and it
# does not include the 'match_phase' field that process_cricket_data writes -
# confirm whether it should be kept in sync or removed.
columns = [
    # where the ball sits in the match
    "match_id",
    "inning",
    "team",
    "over",
    "ball_in_over",
    "delivery_index",
    # what happened on the ball / running state
    "extras",
    "score_band",
    "wickets_lost",
    "is_legal",
    "runs_batsman",
    "is_wicket",
    # who was involved
    "batsman",
    "non_striker",
    "bowler",
    # running total
    "cumulative_runs",
]

In [13]:
import json
import csv
import os
from google.colab import files
from pathlib import Path

def _count_dismissals(delivery):
    """Return how many dismissals are recorded on a single delivery dict."""
    wickets = delivery.get('wickets')
    if isinstance(wickets, (list, tuple)):
        count = len(wickets)
    elif wickets:
        # A single non-list entry is treated as one dismissal.
        count = 1
    else:
        count = 0
    # Keep honouring the singular 'wicket' key that this function originally
    # checked, so data using that spelling is still recognised.
    if count == 0 and 'wicket' in delivery:
        count = 1
    return count


def _is_legal_delivery(delivery):
    """Return 0 when the delivery is flagged as a wide or no-ball, else 1."""
    extras_detail = delivery.get('extras')
    if not isinstance(extras_detail, dict):
        return 1
    return 0 if ('wides' in extras_detail or 'noballs' in extras_detail) else 1


def extract_cricket_data(json_data):
    """Flatten one match's ball-by-ball JSON into a list of per-delivery rows.

    The function reads ``info.event.match_number`` for the match id and walks
    ``innings[] -> overs[] -> deliveries[]``. The key names match the
    Cricsheet JSON layout (assumed from the keys accessed here - confirm
    against the actual data source).

    Args:
        json_data: Parsed match dict. Missing ``info``/``innings``/``overs``/
            ``deliveries`` are treated as empty; each innings must carry a
            ``team`` key and each over an ``over`` key.

    Returns:
        A list of dicts, one per delivery, with the keys ``match_id``,
        ``inning``, ``team``, ``over``, ``ball_in_over``, ``delivery_index``,
        ``extras``, ``score_band``, ``wickets_lost``, ``is_legal``,
        ``runs_batsman``, ``is_wicket``, ``batsman``, ``non_striker``,
        ``bowler``, ``cumulative_runs`` and ``match_phase``.

    Raises:
        KeyError: If an innings lacks ``team`` or an over lacks ``over``.
    """
    # Match id comes from the metadata; '' when the event has no match_number.
    match_id = json_data.get('info', {}).get('event', {}).get('match_number', '')

    csv_rows = []
    # Runs across the whole match (it is NOT reset between innings).
    delivery_index = 0

    for inning_index, inning in enumerate(json_data.get('innings', [])):
        team = inning['team']
        inning_number = inning_index + 1
        wickets_lost = 0
        cumulative_runs = 0
        score_band = 0  # Number of completed 50-run milestones so far

        for over in inning.get('overs', []):
            over_number = over['over']

            # Phase is decided by the over number as stored in the data.
            if over_number < 6:
                match_phase = 'Powerplay (0-6)'
            elif 6 <= over_number < 16:
                match_phase = 'Middle Overs (6-16)'
            else:
                match_phase = 'Death Overs (16-20)'

            for ball_index, delivery in enumerate(over.get('deliveries', [])):
                # Position within the over, counting every delivery bowled
                # (including wides / no-balls).
                ball_in_over = ball_index + 1

                batter = delivery.get('batter', '')
                bowler = delivery.get('bowler', '')
                non_striker = delivery.get('non_striker', '')

                runs_info = delivery.get('runs', {})
                runs_batsman = runs_info.get('batter', 0)
                extras = runs_info.get('extras', 0)
                total_runs = runs_info.get('total', 0)

                cumulative_runs += total_runs

                # Score band only ever moves up, one step per 50 runs.
                if cumulative_runs // 50 > score_band:
                    score_band = cumulative_runs // 50

                # Dismissals live under the plural 'wickets' list; the
                # previous check for 'wicket' alone never matched such data.
                # NOTE(review): every entry is counted, including kinds such
                # as "retired hurt" if the data records them - confirm
                # whether those should be excluded from wickets_lost.
                dismissals = _count_dismissals(delivery)
                is_wicket = 1 if dismissals else 0
                wickets_lost += dismissals

                row = {
                    'match_id': match_id,
                    'inning': inning_number,
                    'team': team,
                    'over': over_number,
                    'ball_in_over': ball_in_over,
                    'delivery_index': delivery_index,
                    'extras': extras,
                    'score_band': score_band,
                    'wickets_lost': wickets_lost,
                    # Wides and no-balls are not legal deliveries.
                    'is_legal': _is_legal_delivery(delivery),
                    'runs_batsman': runs_batsman,
                    'is_wicket': is_wicket,
                    'batsman': batter,
                    'non_striker': non_striker,
                    'bowler': bowler,
                    'cumulative_runs': cumulative_runs,
                    'match_phase': match_phase
                }

                csv_rows.append(row)
                delivery_index += 1

    return csv_rows

def _read_match_file(path):
    """Parse one match JSON file from disk, decoding it as UTF-8."""
    # Explicit UTF-8 keeps disk reads consistent with the upload branch,
    # which decodes the uploaded bytes as UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _collect_match_rows(label, load_match, all_csv_rows):
    """Load one match, append its delivery rows, and report the outcome.

    Returns True when the match was processed, False when it was skipped
    because loading or extraction raised.
    """
    try:
        match_rows = extract_cricket_data(load_match())
    except Exception as e:
        # Best-effort batch processing: report the failure and move on.
        print(f"Error processing {label}: {str(e)}")
        return False
    all_csv_rows.extend(match_rows)
    print(f"Processed: {label} - {len(match_rows)} deliveries")
    return True


def process_cricket_data(input_path=None, output_csv=None):
    """Convert match JSON into one combined per-delivery CSV and download it.

    Args:
        input_path: ``None`` to prompt for files through the Colab upload
            widget, a directory whose ``*.json`` files are all processed, or
            the path of a single JSON file.
        output_csv: Destination CSV path. Falls back to
            ``"combined_cricket_matches_with_phases.csv"`` when falsy.

    Returns:
        The path of the CSV file that was written.

    Files that fail to load or parse are reported and skipped. The CSV is
    written (and handed to ``files.download``) even when no match was
    processed successfully.
    """
    if not output_csv:
        output_csv = "combined_cricket_matches_with_phases.csv"

    # CSV header order (includes the match_phase field).
    headers = [
        'match_id', 'inning', 'team', 'over', 'ball_in_over', 'delivery_index',
        'extras', 'score_band', 'wickets_lost', 'is_legal', 'runs_batsman',
        'is_wicket', 'batsman', 'non_striker', 'bowler', 'cumulative_runs',
        'match_phase'
    ]

    # Build (label, loader) pairs; loaders are zero-argument callables so that
    # any read/parse error surfaces inside _collect_match_rows' try block.
    if input_path is None:
        # Interactive upload in Colab: values are the raw uploaded bytes.
        uploaded = files.upload()
        sources = [
            (filename, lambda raw=raw: json.loads(raw.decode('utf-8')))
            for filename, raw in uploaded.items()
        ]
    elif os.path.isdir(input_path):
        # Sorted so the row order does not depend on filesystem listing order.
        sources = [
            (json_file, lambda json_file=json_file: _read_match_file(json_file))
            for json_file in sorted(Path(input_path).glob('*.json'))
        ]
    else:
        sources = [(input_path, lambda: _read_match_file(input_path))]

    all_csv_rows = []
    processed_files = 0
    for label, load_match in sources:
        if _collect_match_rows(label, load_match, all_csv_rows):
            processed_files += 1

    # newline='' is required by the csv module; UTF-8 so non-ASCII names in
    # the rows do not depend on the platform default encoding.
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(all_csv_rows)

    print(f"All {processed_files} matches combined into: {output_csv}")
    print(f"Total deliveries: {len(all_csv_rows)}")

    # Trigger a browser download of the result (Colab only).
    files.download(output_csv)

    return output_csv

# Example usage in Colab
# In a notebook kernel __name__ is "__main__", so running this cell executes
# the call below immediately.
if __name__ == "__main__":
    print("Processing cricket data with match phases...")
    process_cricket_data()  # Without arguments, it will prompt for file upload

Processing cricket data with match phases...


Saving 335982.json to 335982.json
Saving 335983.json to 335983.json
Saving 335984.json to 335984.json
Saving 335985.json to 335985.json
Saving 335986.json to 335986.json
Saving 335987.json to 335987.json
Saving 335988.json to 335988.json
Saving 335989.json to 335989.json
Saving 335990.json to 335990.json
Saving 335991.json to 335991.json
Saving 335992.json to 335992.json
Saving 335993.json to 335993.json
Saving 335994.json to 335994.json
Saving 335995.json to 335995.json
Saving 335996.json to 335996.json
Saving 335997.json to 335997.json
Saving 335998.json to 335998.json
Saving 335999.json to 335999.json
Saving 336000.json to 336000.json
Saving 336001.json to 336001.json
Saving 336002.json to 336002.json
Saving 336003.json to 336003.json
Saving 336004.json to 336004.json
Saving 336005.json to 336005.json
Saving 336006.json to 336006.json
Saving 336007.json to 336007.json
Saving 336008.json to 336008.json
Saving 336009.json to 336009.json
Saving 336010.json to 336010.json
Saving 336011.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>