In [6]:
# %% [markdown]
# # T20 World Cup 2024 Data Ingestion & Cleaning

# %%
import pandas as pd
import numpy as np
import sys
import os

# Add the 'src' directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../src')))

from data_utils import load_data, save_data, clean_column_names

# %% [markdown]
# ### Load Raw Data

# %%
# Locations of the raw CSV exports, relative to the notebook's working directory.
matches_filepath = '../data/raw/t20wc_2024_matches.csv'
player_stats_filepath = '../data/raw/t20wc_2024_player_stats.csv'

# load_data is the project-local helper imported from data_utils; the calls
# below (.head(), .info()) assume it returns a pandas DataFrame.
matches_df_raw = load_data(matches_filepath)
player_stats_df_raw = load_data(player_stats_filepath)

print("--- Raw Matches Data ---")
print(matches_df_raw.head())
print("\n")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
matches_df_raw.info()

print("\n--- Raw Player Stats Data ---")
print(player_stats_df_raw.head())
print("\n")
player_stats_df_raw.info()

# %% [markdown]
# ### Initial Cleaning Steps

# %%
# Run the project's column-name cleaner (data_utils.clean_column_names) on
# copies of the raw frames, so the raw frames themselves are left as loaded.
matches_df = clean_column_names(matches_df_raw.copy())
player_stats_df = clean_column_names(player_stats_df_raw.copy())

# Show the resulting column labels for both datasets.
for header, frame in (
    ("\n--- Cleaned Column Names for Matches Data ---", matches_df),
    ("\n--- Cleaned Column Names for Player Stats Data ---", player_stats_df),
):
    print(header)
    print(frame.columns)

# %% [markdown]
# ### Data Type Conversions

# %%
# Parse the match date; entries that cannot be parsed become NaT (errors='coerce').
matches_df['date'] = pd.to_datetime(matches_df['date'], errors='coerce')

# Columns holding scores, wickets and victory margins. Each one is coerced to
# a numeric dtype, with unparseable entries turned into NaN.
numeric_cols_matches = [
    'first_innings_score', 'second_innings_score',
    'first_innings_wickets', 'second_innings_wickets',
    'win_by_runs', 'win_by_wickets'
]
numeric_converted = {
    name: pd.to_numeric(matches_df[name], errors='coerce')
    for name in numeric_cols_matches
}
matches_df = matches_df.assign(**numeric_converted)

# Report how many values are missing per column once the coercion is done.
print("\n--- Missing values in Matches Data after type conversion ---")
print(matches_df.isna().sum())

# The player stats frame is not converted here; only its missing-value counts are shown.
# NaNs may be legitimate there, e.g. strike_rate/economy_rate when balls_faced/overs_bowled are 0.
print("\n--- Missing values in Player Stats Data (expected for non-bowlers/batsmen) ---")
print(player_stats_df.isna().sum())

# %% [markdown]
# ### Handle Special Cases (e.g., tie matches, DLS)

# The simulated data for T20 World Cup 2024 already includes a tie handled by Super Over (Match 3: Namibia vs Oman).
# The 'win_by_runs' and 'win_by_wickets' columns clearly indicate how the match was won.
# For a tie, both 'win_by_runs' and 'win_by_wickets' are 0.

# %%
# Normalise the victory margins so that a decided match carries exactly one
# non-zero margin: a win by runs implies 'win_by_wickets' == 0 and vice versa.
#
# The two assignments below run in sequence, so a row that reports BOTH a
# positive runs margin and a positive wickets margin keeps its runs margin
# (its wickets margin is zeroed first, after which the second mask no longer
# matches it). Such rows are inconsistent input, so they are reported before
# being overwritten rather than being resolved silently.
both_margins = (matches_df['win_by_runs'] > 0) & (matches_df['win_by_wickets'] > 0)
if both_margins.any():
    print(
        f"\nWarning: {int(both_margins.sum())} match(es) report both a runs and a "
        "wickets margin; keeping the runs margin."
    )

matches_df.loc[matches_df['win_by_runs'] > 0, 'win_by_wickets'] = 0
matches_df.loc[matches_df['win_by_wickets'] > 0, 'win_by_runs'] = 0

# Rows where both margins are exactly 0 (tie or DLS scenario leading to no clear margin).
# Rows whose margins are NaN after the numeric coercion are not matched by these
# equality checks and therefore are not counted here.
tied_matches = matches_df[(matches_df['win_by_runs'] == 0) & (matches_df['win_by_wickets'] == 0)]
print(f"\nNumber of tied/no clear margin matches: {len(tied_matches)}")
print(tied_matches[['match_id', 'team1', 'team2', 'winner', 'win_by_runs', 'win_by_wickets']])

# %% [markdown]
# ### Save Processed Data

# %%
# Destinations for the cleaned datasets, relative to the notebook's working directory.
processed_matches_filepath = '../data/processed/t20wc_2024_matches_cleaned.csv'
processed_player_stats_filepath = '../data/processed/t20wc_2024_player_stats_cleaned.csv'

# save_data is the project-local helper imported from data_utils.
save_data(matches_df, processed_matches_filepath)
save_data(player_stats_df, processed_player_stats_filepath)

print(f"\nCleaned matches data saved to: {processed_matches_filepath}")
print(f"Cleaned player stats data saved to: {processed_player_stats_filepath}")

# Verify saved data by reading the matches file back through the same loader.
# NOTE(review): if load_data is a plain CSV read, the 'date' column will come
# back as strings rather than datetimes; confirm against data_utils.
verify_matches_df = load_data(processed_matches_filepath)
print("\n--- Verified Cleaned Matches Data Head ---")
print(verify_matches_df.head())
print("\n--- Verified Cleaned Matches Data Info ---")
# DataFrame.info() prints its own summary and returns None; calling it directly
# avoids the stray "None" line that print(df.info()) would emit.
verify_matches_df.info()

--- Raw Matches Data ---
   match_id        date       venue        team1             team2  \
0         1  2024-06-01      Dallas          USA            Canada   
1         2  2024-06-02  Providence  West Indies  Papua New Guinea   
2         3  2024-06-03  Georgetown      Namibia              Oman   
3         4  2024-06-03    New York    Sri Lanka      South Africa   
4         5  2024-06-04      Dallas  Afghanistan            Uganda   

    toss_winner toss_decision        winner  win_by_runs  win_by_wickets  \
0           USA         field           USA            0               7   
1   West Indies         field   West Indies            0               5   
2          Oman         field       Namibia            0               0   
3  South Africa         field  South Africa            0               6   
4   Afghanistan         field   Afghanistan          125               0   

     player_of_match  first_innings_score  second_innings_score  \
0        Aaron Jones          