In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

## Load and Combine Datasets

### Subtask:
Load 'ipl_2025_deliveries.csv', 'ipl_2024_deliveries.csv', 'ipl_2023_deliveries.csv', and 'ipl_2022_deliveries.csv' into separate DataFrames and then concatenate them into a single DataFrame. If any file is not found, it should print an error message but continue with the found files.


In [3]:
import os

drive_path = 'D:\\VS Codes\\Projects\\IPL-Score-Prediction-using-Deep-Learning-\\data\\raw'

filenames = [
    'ipl_2025_deliveries.csv',
    'ipl_2024_deliveries.csv',
    'ipl_2023_deliveries.csv',
    'ipl_2022_deliveries.csv'
]

dataframes_to_combine = []

# Ensure the directory exists before trying to read files
if not os.path.exists(drive_path):
    print(f"❌ ERROR: Directory '{drive_path}' not found. Please check the path and ensure files are in Google Drive.")
else:
    for filename in filenames:
        file_path = os.path.join(drive_path, filename)
        try:
            df_temp = pd.read_csv(file_path)
            dataframes_to_combine.append(df_temp)
            print(f"✅ Dataset '{filename}' loaded successfully! Shape: {df_temp.shape}")
        except FileNotFoundError:
            print(f"❌ ERROR: File '{file_path}' not found. Skipping this file.")
        except Exception as e:
            print(f"❌ ERROR: An unexpected error occurred while loading '{file_path}': {e}")

if dataframes_to_combine:
    combined_df = pd.concat(dataframes_to_combine, ignore_index=True)
    print("\n✅ All available datasets combined successfully!")
    print(f"Combined DataFrame shape: {combined_df.shape}")
    print("First 5 rows of the combined DataFrame:")
    print(combined_df.head())
else:
    print("\nNo datasets were loaded to combine.")

✅ Dataset 'ipl_2025_deliveries.csv' loaded successfully! Shape: (17246, 21)
✅ Dataset 'ipl_2024_deliveries.csv' loaded successfully! Shape: (17053, 20)
✅ Dataset 'ipl_2023_deliveries.csv' loaded successfully! Shape: (17386, 20)
✅ Dataset 'ipl_2022_deliveries.csv' loaded successfully! Shape: (17912, 20)

✅ All available datasets combined successfully!
Combined DataFrame shape: (69597, 21)
First 5 rows of the combined DataFrame:
   match_id  season        phase  match_no          date  \
0    202501    2025  Group Stage         1  Mar 22, 2025   
1    202501    2025  Group Stage         1  Mar 22, 2025   
2    202501    2025  Group Stage         1  Mar 22, 2025   
3    202501    2025  Group Stage         1  Mar 22, 2025   
4    202501    2025  Group Stage         1  Mar 22, 2025   

                   venue batting_team bowling_team  innings  over  ...  \
0  Eden Gardens, Kolkata          KKR          RCB        1   0.1  ...   
1  Eden Gardens, Kolkata          KKR          RCB        1 

In [4]:
df = combined_df.copy()
df = df[df['innings'].isin([1, 2])].copy()

# Calculate 'total_runs_delivery' (runs off bat + extras) if not already present
# Most Kaggle datasets have 'runs_of_bat' and 'extras'
if 'total_runs' not in df.columns:
    df['total_runs'] = df['runs_of_bat'] + df['extras']

In [5]:
match_scores = df.groupby(['match_id', 'innings'])['total_runs'].sum().reset_index()
match_scores.rename(columns={'total_runs': 'final_total_score'}, inplace=True)

# Merge this back into the main dataframe
# Now every ball knows what the final score of that inning eventually became.
df = df.merge(match_scores, on=['match_id', 'innings'], how='left')

In [6]:
# We need "Current State" features:
# 1. Current Score (Cumulative Sum of runs)
# 2. Wickets Fallen (Cumulative Count of dismissals)
# 3. Balls Bowled (To calculate Overs)

# Sort strictly by Match -> Innings -> Ball Number to ensure cumulative sum works
df.sort_values(['match_id', 'innings', 'over'], inplace=True)

# Cumulative Runs
df['current_score'] = df.groupby(['match_id', 'innings'])['total_runs'].cumsum()

# Cumulative Wickets
# First, create a binary column: 1 if wicket fell, 0 if not
df['is_wicket'] = df['player_dismissed'].apply(lambda x: 1 if pd.notnull(x) else 0)
df['wickets_fallen'] = df.groupby(['match_id', 'innings'])['is_wicket'].cumsum()

# Calculate Overs Completed (e.g., Ball 0.6 is 1.0 overs)
# 'over' usually comes as 0.1, 0.2 ... 0.6. We can directly use this for 'overs'.
df['overs'] = df['over']

In [7]:
# Neural networks need numbers, not names like "CSK"
le_teams = LabelEncoder()
le_venue = LabelEncoder()

# Fit on all teams (batting and bowling)
all_teams = pd.concat([df['batting_team'], df['bowling_team']]).unique()
le_teams.fit(all_teams)

df['batting_team_encoded'] = le_teams.transform(df['batting_team'])
df['bowling_team_encoded'] = le_teams.transform(df['bowling_team'])

# Handle Venues
# Fill missing venues just in case
df['venue'] = df['venue'].fillna('Unknown')
df['venue_encoded'] = le_venue.fit_transform(df['venue'])

In [8]:
# We keep only what the model needs
features = ['batting_team_encoded', 'bowling_team_encoded', 'venue_encoded',
            'current_score', 'wickets_fallen', 'overs']
target = ['final_total_score']

# Drop rows where final score is missing (rare errors)
model_data = df[features + target].dropna()

In [10]:
# Convert to Numpy Arrays first
X_raw = model_data[features].values
y_raw = model_data[target].values

# Scale the data (CRITICAL for Neural Networks!)
# We scale Inputs (X) to be roughly between 0 and 1
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X_raw)

# We usually don't scale Y for regression unless values are huge,
# but for MDN, keeping Y in raw runs (e.g. 150, 200) is interpretable.

print("\n✅ Data Cleaning Complete!")
print(f"Training Features shape: {X_scaled.shape}")
print(f"Target Labels shape: {y_raw.shape}")
print("\nExample of Data (First 5 rows):")
print(model_data.head())

# Save the processed dataset to `data/processed`
import os
processed_dir = r'D:\\VS Codes\\Projects\\IPL-Score-Prediction-using-Deep-Learning-\\data\\processed'
os.makedirs(processed_dir, exist_ok=True)
out_path = os.path.join(processed_dir, 'model_data_processed.csv')
model_data.to_csv(out_path, index=False)
print(f"Saved processed dataset to: {out_path}")


✅ Data Cleaning Complete!
Training Features shape: (69587, 6)
Target Labels shape: (69587, 1)

Example of Data (First 5 rows):
       batting_team_encoded  bowling_team_encoded  venue_encoded  \
51675                     0                     3             17   
51676                     0                     3             17   
51677                     0                     3             17   
51678                     0                     3             17   
51679                     0                     3             17   

       current_score  wickets_fallen  overs  final_total_score  
51675              1               0    0.1                131  
51676              1               0    0.1                131  
51677              2               0    0.2                131  
51678              2               0    0.2                131  
51679              2               1    0.3                131  
Saved processed dataset to: D:\\VS Codes\\Projects\\IPL-Score-Prediction-

## Final Task

### Subtask:
Confirm that Google Drive has been successfully mounted and is accessible.


## Summary:

### Data Analysis Key Findings
*   Google Drive was successfully mounted, confirming accessibility for file operations.
*   Four IPL delivery CSV files were successfully loaded and combined:
    *   `ipl_2025_deliveries.csv` (17,246 rows, 21 columns)
    *   `ipl_2024_deliveries.csv` (17,053 rows, 20 columns)
    *   `ipl_2023_deliveries.csv` (17,386 rows, 20 columns)
    *   `ipl_2022_deliveries.csv` (17,912 rows, 20 columns)
*   The combined DataFrame, `combined_df`, has a total of 69,597 rows and 21 columns.

### Insights or Next Steps
*   The combined dataset is ready for comprehensive analysis of IPL delivery data across the years 2022-2025, enabling year-over-year comparisons and trend identification.
*   Further data cleaning, such as handling missing values or inconsistencies in columns (e.g., the difference in column count between `ipl_2025_deliveries.csv` and the other files), should be performed before detailed analysis.
