# T20i Data Cleaning & Feature Engineering 

In [25]:
import os

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [3]:
# Load key/value pairs from a .env file into the process environment so the
# os.getenv(...) calls in the next cell can read the database settings.
# NOTE(review): the captured output reports unparseable lines in the .env file
# (lines 1, 4 and 11) — worth cleaning up the file itself.
load_dotenv()

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 4
python-dotenv could not parse statement starting at line 11


True

In [4]:
# Database connection settings, read from the environment.
# All five values are interpolated into the connection URL, so a missing one
# would otherwise end up as the literal text "None" inside the URL and fail
# later with a confusing error. Fail fast with a clear message instead.
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

_missing_db_settings = [
    name
    for name, value in {
        "DB_USER": DB_USER,
        "DB_PASSWORD": DB_PASSWORD,
        "DB_HOST": DB_HOST,
        "DB_PORT": DB_PORT,
        "DB_NAME": DB_NAME,
    }.items()
    if value is None
]
if _missing_db_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_db_settings)
    )

In [5]:
# Build the connection URL from its components instead of interpolating them
# into a string: URL.create escapes special characters in the credentials
# (e.g. "@", ":", "/" in a password) that would otherwise corrupt the URL.
db_url = URL.create(
    drivername="postgresql",
    username=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    # The environment supplies the port as text; URL.create expects an integer.
    port=int(DB_PORT) if DB_PORT else None,
    database=DB_NAME,
)
engine = create_engine(db_url)

## Load Dataset

In [6]:
# Read the whole t20i_data table into a DataFrame with a single query.
RAW_TABLE_QUERY = "Select * from t20i_data"
df = pd.read_sql(RAW_TABLE_QUERY, con=engine)

In [7]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,playing role_striker,major teams_striker,image url_striker,full name_bowler,country_bowler,batting style_bowler,bowling style_bowler,playing role_bowler,major teams_bowler,image url_bowler
0,1001351,2016/17,2017-02-19,"Simonds Stadium, South Geelong",2,8.5,Sri Lanka,Australia,DAS Gunaratne,CK Kapugedera,...,Batter,"Sri Lanka, Sri Lanka Army Sports Club, Chattog...","https://img1.hscicdn.com/image/upload/f_auto,t...",Patrick James Cummins,Australia,Right hand Bat,Right arm Fast,Bowler,"Australia, San Francisco Unicorns, Sunrisers H...","https://img1.hscicdn.com/image/upload/f_auto,t..."
1,1001351,2016/17,2017-02-19,"Simonds Stadium, South Geelong",2,8.6,Sri Lanka,Australia,CK Kapugedera,DAS Gunaratne,...,Middle order Batter,"Sri Lanka, Chennai Super Kings, Colombo Cricke...","https://img1.hscicdn.com/image/upload/f_auto,t...",Patrick James Cummins,Australia,Right hand Bat,Right arm Fast,Bowler,"Australia, San Francisco Unicorns, Sunrisers H...","https://img1.hscicdn.com/image/upload/f_auto,t..."
2,1001351,2016/17,2017-02-19,"Simonds Stadium, South Geelong",2,9.1,Sri Lanka,Australia,DAS Gunaratne,CK Kapugedera,...,Batter,"Sri Lanka, Sri Lanka Army Sports Club, Chattog...","https://img1.hscicdn.com/image/upload/f_auto,t...",James Peter Faulkner,Australia,Right hand Bat,Left arm Fast medium,Bowling Allrounder,"Australia, Australia A, Australia Under-19s, G...","https://img1.hscicdn.com/image/upload/f_auto,t..."
3,1001351,2016/17,2017-02-19,"Simonds Stadium, South Geelong",2,9.2,Sri Lanka,Australia,CK Kapugedera,DAS Gunaratne,...,Middle order Batter,"Sri Lanka, Chennai Super Kings, Colombo Cricke...","https://img1.hscicdn.com/image/upload/f_auto,t...",James Peter Faulkner,Australia,Right hand Bat,Left arm Fast medium,Bowling Allrounder,"Australia, Australia A, Australia Under-19s, G...","https://img1.hscicdn.com/image/upload/f_auto,t..."
4,1001351,2016/17,2017-02-19,"Simonds Stadium, South Geelong",2,9.3,Sri Lanka,Australia,CK Kapugedera,DAS Gunaratne,...,Middle order Batter,"Sri Lanka, Chennai Super Kings, Colombo Cricke...","https://img1.hscicdn.com/image/upload/f_auto,t...",James Peter Faulkner,Australia,Right hand Bat,Left arm Fast medium,Bowling Allrounder,"Australia, Australia A, Australia Under-19s, G...","https://img1.hscicdn.com/image/upload/f_auto,t..."


In [8]:
# Record the (rows, columns) count before cleaning, for comparison afterwards.
print("Raw Shape:", df.shape)

Raw Shape: (415525, 58)


## Drop Duplicates

In [9]:
# Remove rows that are identical across every column, keeping the first copy.
df = df.drop_duplicates(keep='first')

## Drop irrelevant columns (images, URLs, etc.)

In [10]:
# Player image URLs are not used by any later step. errors='ignore' keeps the
# cell safe to re-run after the columns are already gone.
IMAGE_URL_COLUMNS = ['image url_striker', 'image url_bowler']
df = df.drop(columns=IMAGE_URL_COLUMNS, errors='ignore')

## Handle missing values

In [11]:
# Default used for each column when a value is missing: numeric per-ball
# fields become 0, and an absent match winner is labelled "Unknown".
fill_defaults = {
    'runs_off_bat': 0,
    'extras': 0,
    'wicket': 0,
    'winner': "Unknown",
}
for column_name, default_value in fill_defaults.items():
    df[column_name] = df[column_name].fillna(default_value)

## Clean text fields

In [12]:
# Trim leading/trailing whitespace so the same team is not split into
# several groups by stray spaces.
for team_column in ('batting_team', 'bowling_team'):
    df[team_column] = df[team_column].str.strip()

In [13]:
# Report the (rows, columns) count after the cleaning steps above.
print("After Cleaning:", df.shape)

After Cleaning: (415525, 56)


# MATCH-LEVEL FEATURES

In [14]:
def _unique_as_list(values):
    """Return the distinct values of a Series as a plain list, in order of appearance."""
    return list(values.unique())


# One row per match: ball-level numbers are summed, match-level attributes
# (constant within a match) are taken from the first row, and the teams that
# batted/bowled are collected as lists.
match_level_aggregations = {
    'runs_off_bat': 'sum',
    'extras': 'sum',
    'wicket': 'sum',
    'winner': 'first',
    'venue': 'first',
    'season': 'first',
    'toss_winner': 'first',
    'toss_decision': 'first',
    'batting_team': _unique_as_list,
    'bowling_team': _unique_as_list,
}
match_summary = (
    df.groupby('match_id')
    .agg(match_level_aggregations)
    .reset_index()
)

In [15]:
# Give the summed ball-level columns names that reflect match totals.
match_total_names = {
    'runs_off_bat': 'total_runs',
    'extras': 'total_extras',
    'wicket': 'total_wickets',
}
match_summary = match_summary.rename(columns=match_total_names)

# OVER-LEVEL FEATURES

In [17]:
# The integer part of the ball number is the over it belongs to
# (e.g. ball 8.5 -> over 8).
df = df.assign(over=df['ball'].astype(int))

In [None]:
# Collapse ball-level rows to one row per (match, innings, over, batting team),
# summing the runs, extras and wickets that fell in that over.
over_group_keys = ['match_id', 'innings', 'over', 'batting_team']
over_total_columns = ['runs_off_bat', 'extras', 'wicket']
over_summary = (
    df.groupby(over_group_keys)[over_total_columns]
    .sum()
    .reset_index()
)

## Add cumulative stats per innings

In [19]:
# Running innings totals, one value per over row (rows are ordered by over
# within each match/innings because the groupby that built over_summary sorts
# its keys).

# A team's score is runs off the bat PLUS extras; using runs_off_bat alone
# understates the score and every feature derived from it.
runs_in_over = over_summary['runs_off_bat'] + over_summary['extras']
innings_keys = [over_summary['match_id'], over_summary['innings']]
over_summary['cumulative_runs'] = runs_in_over.groupby(innings_keys).cumsum()
over_summary['cumulative_wickets'] = (
    over_summary.groupby(['match_id', 'innings'])['wicket'].cumsum()
)

# `over` is the integer part of the ball number, so the first over is 0 and a
# row for over k holds totals after k + 1 overs. Dividing by `over` itself
# would divide by zero on the first over and overstate the rate afterwards.
overs_bowled = over_summary['over'] + 1
over_summary['run_rate'] = over_summary['cumulative_runs'] / overs_bowled

## MERGE WITH MATCH METADATA

In [20]:
# Attach the match-level attributes to every over row; a left join keeps all
# over rows even if a match has no metadata.
match_meta_columns = ['match_id', 'winner', 'venue', 'season', 'toss_winner', 'toss_decision']
match_meta = match_summary[match_meta_columns]
over_with_meta = over_summary.merge(match_meta, on='match_id', how='left')

# FEATURE ENGINEERING

In [21]:
# Number of overs in a T20 innings.
MAX_OVERS = 20
# Each row holds end-of-over totals and `over` starts at 0, so after the row
# for over k, k + 1 overs are complete. Subtracting `over` alone left the
# value one too high and it could never reach 0 at the end of the innings.
over_with_meta['overs_left'] = MAX_OVERS - (over_with_meta['over'] + 1)

## Target runs (per match)

In [22]:
# The chase target is set by the FIRST innings only: the side batting second
# needs one run more than the first-innings total. Taking the maximum over
# both innings would return the chasing side's own score whenever it wins.
first_innings_rows = over_with_meta.loc[over_with_meta['innings'] == 1]
target_runs_per_match = (
    first_innings_rows.groupby('match_id')['cumulative_runs']
    .max()              # final first-innings total (cumulative sum is non-decreasing)
    .add(1)             # one more run than that total is needed to win
    .rename('target_runs')
    .reset_index()
)

In [23]:
# Attach the per-match target to every over row.
# Dropping any existing target_runs column first keeps the cell safe to re-run:
# otherwise a second run yields target_runs_x / target_runs_y and the later
# cells that read 'target_runs' fail. validate= asserts one target per match.
over_with_meta = (
    over_with_meta
    .drop(columns=['target_runs'], errors='ignore')
    .merge(target_runs_per_match, on='match_id', how='left', validate='many_to_one')
)

## Required runs & run rate (only for 2nd innings)

In [26]:
# Runs still needed is only defined for the second innings; every other row
# is left as NaN.
is_second_innings = over_with_meta['innings'] == 2
runs_still_needed = over_with_meta['target_runs'] - over_with_meta['cumulative_runs']
over_with_meta['required_runs'] = runs_still_needed.where(is_second_innings)

In [27]:
# Required rate = runs still needed per remaining over. It is only defined in
# the second innings while at least one over remains; elsewhere it is NaN.
rate_is_defined = (over_with_meta['innings'] == 2) & (over_with_meta['overs_left'] > 0)
runs_per_remaining_over = over_with_meta['required_runs'] / over_with_meta['overs_left']
over_with_meta['required_run_rate'] = runs_per_remaining_over.where(rate_is_defined)

In [28]:
# Report the (rows, columns) count of the final over-level feature table.
print("Final Over-Level Data Shape:", over_with_meta.shape)

Final Over-Level Data Shape: (67329, 19)


In [30]:
# Persist both tables as CSV. to_csv does not create missing directories, so
# make sure the output folder exists first (a fresh checkout would otherwise
# fail here after all the work above has run).
OUTPUT_DIR = "../data/T20i"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Note: the list-valued batting_team / bowling_team columns of match_summary
# are written as their text representation.
match_summary.to_csv("../data/T20i/t20i_match_summary.csv", index=False)
over_with_meta.to_csv("../data/T20i/t20i_over_features.csv", index=False)

print("Cleaning & Feature Engineering complete. Files saved.")

Cleaning & Feature Engineering complete. Files saved.
