In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from datetime import datetime

# Load the dataset

In [None]:
# Location of the raw IPL match-level CSV, then read it into a DataFrame.
file_path = '/mnt/data/IPL Matches 2008-2020.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Step 1: Handle missing values
# Fill missing categorical values with 'Unknown' ('No Result' for the `result` column)

In [None]:
# Placeholder written into each categorical column wherever a value is missing.
categorical_fill_values = {
    'city': 'Unknown',
    'player_of_match': 'Unknown',
    'winner': 'Unknown',
    'result': 'No Result',
    'eliminator': 'Unknown',
    'method': 'Unknown',
}

# Assign the filled Series back to the frame. `data[col].fillna(..., inplace=True)`
# works on a temporary Series: it raises a FutureWarning on pandas 2.x and leaves
# `data` untouched under Copy-on-Write (the default from pandas 3.0).
for column_name, placeholder in categorical_fill_values.items():
    data[column_name] = data[column_name].fillna(placeholder)

# Fill missing numerical values with median

In [None]:
# Impute missing margins with the column median. The result is assigned back
# because `fillna(..., inplace=True)` on `data['result_margin']` acts on a
# temporary Series and does not update `data` under pandas Copy-on-Write.
data['result_margin'] = data['result_margin'].fillna(data['result_margin'].median())

# Step 2: Transform 'date' column to datetime format

In [None]:
# Parse the 'date' column into pandas datetimes and store it back in place.
data['date'] = data['date'].pipe(pd.to_datetime)

# Step 3: Encoding categorical columns

In [None]:
# Columns that will be one-hot encoded.
categorical_columns = [
    'city',
    'venue',
    'team1',
    'team2',
    'toss_winner',
    'toss_decision',
    'winner',
    'eliminator',
    'method',
]

# Columns that will be standardised.
numerical_columns = [
    'result_margin',
]

# One-hot encoding for categorical features and scaling for numerical features

In [None]:
# Column-wise preprocessing: standardise the numeric column(s) and one-hot
# encode the categorical ones. `handle_unknown='ignore'` tells the encoder not
# to raise on categories it did not see while fitting.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
])

# Step 4: Build the pipeline


In [None]:
# Single-step pipeline wrapping the column preprocessor; further steps
# (e.g. an estimator) can be appended to this list later.
pipeline = Pipeline([('preprocessor', preprocessor)])


In [None]:
# Apply transformations


In [None]:
# Fit the preprocessing pipeline on the cleaned frame and transform it in one pass.
processed_data = pipeline.fit_transform(X=data)

In [None]:
# Save the processed data

# ColumnTransformer returns a SciPy sparse matrix only when the stacked output
# is sparse enough (see its `sparse_threshold`); otherwise it returns a dense
# ndarray, which has no `.toarray()`. Densify only when the method exists.
if hasattr(processed_data, 'toarray'):
    feature_matrix = processed_data.toarray()
else:
    feature_matrix = processed_data

# Look the fitted encoder up by its transformer name rather than by position,
# so reordering the transformers list cannot pick the wrong step.
fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']

# Column order matches the transformer order: numeric columns first, then the
# expanded one-hot columns.
feature_names = numerical_columns + list(
    fitted_encoder.get_feature_names_out(categorical_columns)
)

processed_df = pd.DataFrame(feature_matrix, columns=feature_names)
processed_df.to_csv('processed_IPL_data.csv', index=False)

In [None]:
# Report completion and name the output file.
print(
    "ETL process completed. "
    "Processed data saved as 'processed_IPL_data.csv'."
)