In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

from evaluate import evaluate

# Let pandas render up to 500 columns / 500 rows before truncating the display.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Notebook-wide seaborn theme applied to every subsequent plot.
sns.set_theme(
    style="whitegrid",       # Background style ("whitegrid", "darkgrid", etc.)
    palette="deep",          # Default color palette ("deep", "muted", "bright", etc.)
    font="sans-serif",       # Font family
    font_scale=1.1,          # Scale font size slightly
    rc={"figure.figsize": (8, 5)}  # Default figure size
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder holding the CSV files, relative to the notebook's working directory.
dataset_path = Path("../datasets")

# Read the four tables in a fixed order: train identity, train transactions,
# test identity, test transactions.
train_identity, train_tx, test_identity, test_tx = (
    pd.read_csv(dataset_path / csv_name)
    for csv_name in (
        "train_identity.csv",
        "train_transaction.csv",
        "test_identity.csv",
        "test_transaction.csv",
    )
)

In [3]:
# Left-join the identity table onto the transactions so every transaction row is kept.
train_all_cols = train_tx.merge(train_identity, on='TransactionID', how='left')
# NOTE: the test tables loaded earlier are not merged in this cell.

# Target vector.
y = train_all_cols['isFraud']

# Feature matrix: everything except the target, with every missing value
# (numeric and text columns alike) replaced by the -999 sentinel for LightGBM to handle.
X = train_all_cols.drop(columns=['isFraud']).fillna(-999)

In [4]:
# Sanity check: report (rows, columns) of the feature matrix after the merge.
print(f"X_shape: {X.shape}")

X_shape: (590540, 433)


In [5]:
# Names of the columns stored as object or category dtype.
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Names of the columns stored as 64-bit integers or 64-bit floats.
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

In [6]:
# Convert the columns listed in cat_cols to pandas 'category' dtype, in place on X.
# After this cell X no longer has any 'object'-dtype columns from that list.
X[cat_cols] = X[cat_cols].astype('category')

In [7]:
# Stage 1: hold out 20% of the rows as the test split, stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: hold out 20% of the remaining 80% as validation, again stratified.
# Overall proportions are therefore 64% train / 16% validation / 20% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42
)

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def encode_categorical_columns(X_train, X_val, X_test):
    """Label-encode the object-dtype columns of the three splits.

    The columns to encode are the ones that have ``object`` dtype in
    ``X_train``; columns of any other dtype (including ``category``) are
    left untouched. For each such column one ``LabelEncoder`` is fitted on
    the string form of the train, validation and test values stacked
    together, then used to overwrite the column in every split.

    The frames are modified in place and also returned.

    Returns:
        tuple: ``(X_train, X_val, X_test)`` - the same frame objects.
    """
    frames = (X_train, X_val, X_test)
    object_columns = X_train.select_dtypes(include=['object']).columns

    for column in object_columns:
        encoder = LabelEncoder()
        # Fit on all three splits so every value seen anywhere has a code.
        stacked = pd.concat([frame[column] for frame in frames], axis=0).astype(str)
        encoder.fit(stacked)
        for frame in frames:
            frame[column] = encoder.transform(frame[column].astype(str))

    return X_train, X_val, X_test

def group_rare_categories(X_train, X_val, X_test, features, threshold=500):
    """Collapse infrequent values of selected columns into the label 'Rare'.

    For every column in ``features``, value frequencies are counted on
    ``X_train`` only. Each value seen fewer than ``threshold`` times in the
    training frame is replaced by the string ``'Rare'`` in all three frames.
    Values that never occur in ``X_train`` are not counted and are
    therefore left as they are in the validation / test frames.

    Args:
        X_train, X_val, X_test: DataFrames, modified in place.
        features: iterable of column names to process. Names missing from
            ``X_train`` are skipped (previously this raised ``KeyError``);
            names missing from ``X_val`` / ``X_test`` are skipped for that
            frame only.
        threshold: minimum training count for a value to be kept.

    Returns:
        tuple: ``(X_train, X_val, X_test)`` - the same frame objects.
    """
    for col in features:
        # Frequencies are learned from the training split; with no training
        # column there is nothing to learn, so skip rather than fail.
        if col not in X_train.columns:
            continue

        freq = X_train[col].value_counts()
        rare = freq[freq < threshold].index
        if len(rare) == 0:
            # Nothing to collapse; avoid rewriting the column for no reason.
            continue

        for X in (X_train, X_val, X_test):
            if col in X.columns:
                X[col] = X[col].replace(rare, 'Rare')
    return X_train, X_val, X_test

def create_transaction_amount_ratios(X_train, X_val, X_test, group_cols):
    """Add ratios of 'TransactionAmt' to per-group training statistics.

    For every column in ``group_cols`` the mean and standard deviation of
    ``TransactionAmt`` are computed per group on ``X_train`` only. Each of
    the three frames then receives two new columns:

    * ``TransactionAmt_to_mean_<col>`` - amount divided by the group's training mean
    * ``TransactionAmt_to_std_<col>``  - amount divided by the group's training std

    The result is NaN when the group was not seen in training, when the
    group's std is undefined (a single training row), or when the statistic
    is exactly zero. Zero denominators are masked on purpose: dividing by
    them would produce +/-inf, which a later ``fillna`` does not replace.

    Args:
        X_train, X_val, X_test: DataFrames containing 'TransactionAmt' and
            every column in ``group_cols``; modified in place.
        group_cols: iterable of column names to group by.

    Returns:
        tuple: ``(X_train, X_val, X_test)`` - the same frame objects.
    """
    for col in group_cols:
        # Statistics are learned once from the training split and reused for
        # all three splits, so validation / test never influence them.
        grouped_amt = X_train.groupby(col, observed=False)['TransactionAmt']
        group_mean = grouped_amt.mean()
        group_std = grouped_amt.std()

        # A zero mean / std makes the ratio undefined; use NaN instead of inf.
        means = group_mean.mask(group_mean == 0).to_dict()
        stds = group_std.mask(group_std == 0).to_dict()

        for X in (X_train, X_val, X_test):
            mapped_means = X[col].map(means).astype(float)
            mapped_stds = X[col].map(stds).astype(float)
            X[f'TransactionAmt_to_mean_{col}'] = X['TransactionAmt'] / mapped_means
            X[f'TransactionAmt_to_std_{col}'] = X['TransactionAmt'] / mapped_stds
    return X_train, X_val, X_test

def fill_missing_values(X, reference):
    """Fill missing values in ``X`` using the column dtypes of ``reference``.

    Columns that are numeric in ``reference`` and present in ``X`` get their
    missing values replaced by ``-1``; columns that are ``object`` dtype in
    ``reference`` and present in ``X`` get ``'missing'``. Columns of ``X``
    that fall in neither group are left untouched.

    ``X`` is modified in place and returned.
    """
    # (dtype selector applied to `reference`, replacement value), in fill order.
    fill_plan = (
        ([np.number], -1),
        (['object'], 'missing'),
    )
    for dtype_selector, filler in fill_plan:
        candidates = reference.select_dtypes(include=dtype_selector).columns
        present = [name for name in candidates if name in X.columns]
        X[present] = X[present].fillna(filler)
    return X

def create_time_features(X):
    """Derive calendar-style features from the 'TransactionDT' column.

    The divisors used (3600 and 24 * 60 * 60) treat 'TransactionDT' as a
    number of seconds. Five columns are added to ``X`` in place:

    * ``TransactionDT_days``  - 'TransactionDT' divided by 86400 (fractional)
    * ``Transaction_hour``    - integer hour in 0..23
    * ``Transaction_weekday`` - integer day index modulo 7 (relative to the
      zero point of 'TransactionDT', not a calendar weekday)
    * ``is_weekend``          - 1 when the day index is 5 or 6, else 0
    * ``is_nighttime``        - 1 when the hour is between 0 and 5 inclusive, else 0

    Returns:
        The same DataFrame ``X``.
    """
    seconds = X['TransactionDT']
    elapsed_days = seconds / (24 * 60 * 60)
    hour_of_day = ((seconds / 3600) % 24).astype(int)
    day_index = (elapsed_days % 7).astype(int)

    X['TransactionDT_days'] = elapsed_days
    X['Transaction_hour'] = hour_of_day
    X['Transaction_weekday'] = day_index
    X['is_weekend'] = (day_index >= 5).astype(int)
    X['is_nighttime'] = hour_of_day.between(0, 5).astype(int)
    return X

def drop_unused_columns(X):
    """Return a copy of ``X`` without the columns the pipeline discards.

    Removes 'TransactionID', 'id_34' and a fixed list of 'id_*' columns.
    Names that are not present in ``X`` are ignored. The input frame is
    not modified; a new DataFrame is returned.
    """
    unwanted = (
        'TransactionID', 'id_34',
        'id_07', 'id_08', 'id_18', 'id_21', 'id_22',
        'id_23', 'id_24', 'id_25', 'id_26', 'id_27',
    )
    present = [name for name in unwanted if name in X.columns]
    return X.drop(columns=present)

def log_transform_transaction_amt(X):
    """Add 'TransactionAmt_log' = log(1 + 'TransactionAmt') to ``X`` in place.

    When ``X`` has no 'TransactionAmt' column it is returned unchanged.

    Returns:
        The same DataFrame ``X``.
    """
    if 'TransactionAmt' not in X.columns:
        return X
    X['TransactionAmt_log'] = np.log1p(X['TransactionAmt'])
    return X

def run_feature_engineering(X_train, X_val, X_test):
    """Run the full feature-engineering pipeline on the three splits.

    Steps, in order:

    1. ``encode_categorical_columns`` on all three frames together.
    2. ``group_rare_categories`` on a fixed list of columns.
    3. ``create_transaction_amount_ratios`` grouped by 'card1' and 'card4'.
    4. Per split (train, then validation, then test): ``fill_missing_values``
       (with the current training frame as dtype reference),
       ``create_time_features``, ``drop_unused_columns`` and
       ``log_transform_transaction_amt``.

    Progress messages and the final shapes are printed. The helper steps
    assign columns on the frames they receive, so the input frames are
    modified as a side effect.

    Returns:
        tuple: ``(X_train, X_val, X_test)`` after all steps.
    """
    print("🚧 Starting feature engineering pipeline...\n")

    X_train, X_val, X_test = encode_categorical_columns(X_train, X_val, X_test)
    print("✅ Encoded categorical columns")

    rare_features = ['P_emaildomain', 'R_emaildomain', 'id_30', 'id_31', 'id_33', 'card2', 'card5']
    X_train, X_val, X_test = group_rare_categories(
        X_train, X_val, X_test, features=rare_features
    )
    print("✅ Grouped rare categories")

    X_train, X_val, X_test = create_transaction_amount_ratios(
        X_train, X_val, X_test, ['card1', 'card4']
    )
    print("✅ Created transaction amount ratios")

    # Remaining steps run split by split. 'Train' goes first, so the
    # validation and test splits use the already-processed training frame
    # as their dtype reference.
    splits = {'Train': X_train, 'Validation': X_val, 'Test': X_test}
    for name in ('Train', 'Validation', 'Test'):
        frame = fill_missing_values(splits[name], splits['Train'])
        frame = create_time_features(frame)
        frame = drop_unused_columns(frame)
        frame = log_transform_transaction_amt(frame)
        splits[name] = frame
        print(f"✅ Completed processing {name} set")

    train_out, val_out, test_out = splits['Train'], splits['Validation'], splits['Test']

    print("\n🎯 Final Shapes:")
    print(f"📐 X_train shape: {train_out.shape}")
    print(f"📐 X_val shape:   {val_out.shape}")
    print(f"📐 X_test shape:  {test_out.shape}")

    return train_out, val_out, test_out

In [17]:
# Apply the pipeline and rebind the split variables to the engineered frames.
# The helper steps assign columns on the frames they are given, so re-running
# this cell without first re-running the split cell feeds already-engineered
# frames back into the pipeline.
X_train, X_val, X_test = run_feature_engineering(X_train, X_val, X_test)

🚧 Starting feature engineering pipeline...

✅ Encoded categorical columns
✅ Grouped rare categories
✅ Created transaction amount ratios
✅ Completed processing Train set
✅ Completed processing Validation set
✅ Completed processing Test set

🎯 Final Shapes:
📐 X_train shape: (377945, 431)
📐 X_val shape:   (94487, 431)
📐 X_test shape:  (118108, 431)
