In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

# Let pandas render up to 500 columns / 500 rows before it truncates a frame.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# Notebook-wide seaborn theme applied to every subsequent figure.
sns.set_theme(
    style="whitegrid",              # background style
    palette="deep",                 # default colour palette
    font="sans-serif",              # font family
    font_scale=1.1,                 # slightly enlarged text
    rc={"figure.figsize": (8, 5)},  # default figure size
)

In [3]:
# Directory holding the raw CSV files (relative to the notebook's working directory).
dataset_path = Path("../datasets")

# Read the four raw tables in one pass; names on the left line up with the
# file names on the right, in the same order.
train_identity, train_tx, test_identity, test_tx = (
    pd.read_csv(dataset_path / csv_name)
    for csv_name in (
        "train_identity.csv",
        "train_transaction.csv",
        "test_identity.csv",
        "test_transaction.csv",
    )
)

In [4]:
# Left join keeps every transaction row; transactions without a matching
# identity row get NaN in all identity columns.
train_all_cols = pd.merge(train_tx, train_identity, on='TransactionID', how='left')
# TODO: build the test frame from the *test* tables (an earlier draft of this
# line merged the train tables again by mistake):
# test_all_cols = pd.merge(test_tx, test_identity, on='TransactionID', how='left')

# Features are everything except the target and the row identifier.
X = train_all_cols.drop(columns=['isFraud', 'TransactionID'])
y = train_all_cols['isFraud']

# Replace every missing value with the sentinel -999.
# NOTE(review): fillna is applied to all columns, so any non-numeric column
# that has missing values will now hold the integer -999 next to its strings.
X = X.fillna(-999)

# Stratify on the target so both splits keep the same class ratio, and pin
# random_state so the split (and every metric computed from it) is reproducible
# across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [5]:
# Partition the training feature names by dtype.
# Numeric features: 64-bit integer and float columns.
num_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

# Categorical features: string/object columns plus anything already cast to 'category'.
cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

In [6]:
# Cast the categorical columns to pandas 'category' dtype.
# The category levels are learned from the training split only and then reused
# verbatim for the validation split, so a given value maps to the same integer
# code in both frames. Values that appear only in validation become NaN there.
# Rebinding (instead of assigning into the existing frames) keeps the cell
# idempotent when it is re-run.
X_train = X_train.astype({col: 'category' for col in cat_cols})
X_val = X_val.astype({col: X_train[col].dtype for col in cat_cols})

In [7]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route numeric columns through the numeric branch and hand the categorical
# columns through untouched.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('cat', 'passthrough', cat_cols),
])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# The pipeline is only constructed here; fitting/transforming is left disabled.
# X_train_ = pipeline.fit_transform(X_train)
# X_val_ = pipeline.transform(X_val)

In [8]:
# Wrap both splits as LightGBM datasets. The validation set points at the
# training set via `reference` so it is constructed consistently with it.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# Binary classification scored by AUC; verbosity=-1 silences LightGBM's own logs.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1
}

# Filled in by record_evaluation with the per-round metric history of every
# set in `valid_sets`, so the validation AUC can be inspected or plotted later.
evals_result = {}

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_val],
    callbacks=[
        # Without a logging callback the validation metric is computed but
        # never shown; print it every 10 boosting rounds.
        lgb.log_evaluation(period=10),
        lgb.record_evaluation(evals_result),
    ],
)

In [9]:
# Dev aid: IPython's `??` shows the signature and source of lgb.train.
# NOTE(review): interactive introspection only; consider removing before sharing the notebook.
??lgb.train

[31mSignature:[39m
lgb.train(
    params: Dict[str, Any],
    train_set: lightgbm.basic.Dataset,
    num_boost_round: int = [32m100[39m,
    valid_sets: Optional[List[lightgbm.basic.Dataset]] = [38;5;28;01mNone[39;00m,
    valid_names: Optional[List[str]] = [38;5;28;01mNone[39;00m,
    feval: Union[Callable[[numpy.ndarray, lightgbm.basic.Dataset], Tuple[str, float, bool]], Callable[[numpy.ndarray, lightgbm.basic.Dataset], List[Tuple[str, float, bool]]], List[Union[Callable[[numpy.ndarray, lightgbm.basic.Dataset], Tuple[str, float, bool]], Callable[[numpy.ndarray, lightgbm.basic.Dataset], List[Tuple[str, float, bool]]]]], NoneType] = [38;5;28;01mNone[39;00m,
    init_model: Union[str, pathlib._local.Path, lightgbm.basic.Booster, NoneType] = [38;5;28;01mNone[39;00m,
    keep_training_booster: bool = [38;5;28;01mFalse[39;00m,
    callbacks: Optional[List[Callable]] = [38;5;28;01mNone[39;00m,
) -> lightgbm.basic.Booster
[31mSource:[39m   
[38;5;28;01mdef[39;00m train(
 