In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

# Raise the pandas display limits so wide/long frames (up to 500 columns and
# 500 rows) are rendered without truncation.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# Notebook-wide seaborn/matplotlib defaults.
plot_defaults = {
    "style": "whitegrid",              # background style
    "palette": "deep",                 # default colour palette
    "font": "sans-serif",              # font family
    "font_scale": 1.1,                 # slightly enlarged text
    "rc": {"figure.figsize": (8, 5)},  # default figure size
}
sns.set_theme(**plot_defaults)

In [None]:
# Directory holding the raw CSV files, relative to the notebook's working directory.
dataset_path = Path("../datasets")

# Training split: identity table and transaction table.
train_identity = pd.read_csv(dataset_path.joinpath("train_identity.csv"))
train_tx = pd.read_csv(dataset_path.joinpath("train_transaction.csv"))

# Test split: same pair of files.
test_identity = pd.read_csv(dataset_path.joinpath("test_identity.csv"))
test_tx = pd.read_csv(dataset_path.joinpath("test_transaction.csv"))

In [None]:
# Attach the identity columns to every transaction. how='left' keeps all rows
# of train_tx; transactions without a matching identity row get NaN in the
# identity columns.
train_all_cols = pd.merge(train_tx, train_identity, on='TransactionID', how='left')
# TODO: build the test frame the same way from test_tx / test_identity.

# Features are every column except the target ('isFraud') and 'TransactionID'
# (presumably a row identifier with no predictive meaning -- confirm).
X = train_all_cols.drop(columns=['isFraud', 'TransactionID'])
y = train_all_cols['isFraud']

# Replace every missing value with the sentinel -999.
# NOTE(review): this fills string-valued columns with the integer -999 as well,
# and it leaves nothing for the SimpleImputer in the preprocessing cell to
# impute. LightGBM is documented to handle NaN natively -- confirm the sentinel
# is actually wanted.
X = X.fillna(-999)

# Hold out 20% for validation, stratified on the label so both parts keep the
# same fraud rate. A fixed random_state makes the split (and therefore every
# downstream metric) reproducible across kernel restarts.
RANDOM_STATE = 42
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

In [4]:
# Partition the feature columns by dtype, as seen in the training frame.

# Columns stored as Python objects (strings) or already categorical.
cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

# 64-bit integer and float columns; other numeric widths are not selected.
num_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

In [5]:
# Cast the columns in cat_cols to pandas' categorical dtype.
#
# The dtype mapping is derived once from the training frame and reused for the
# validation frame, so both frames share exactly the same categories and hence
# the same category -> integer-code mapping. Casting each frame on its own
# would let every frame infer its own category set.
# Values that occur only in X_val are not part of the training categories and
# become NaN.
#
# The frames are rebound through DataFrame.astype with a per-column mapping
# instead of assigning into a column subset: the resulting dtypes are then
# guaranteed, and the frames returned by the split are not mutated in place.
train_cat_dtypes = X_train[cat_cols].astype('category').dtypes.to_dict()
X_train = X_train.astype(train_cat_dtypes)
X_val = X_val.astype(train_cat_dtypes)

In [1]:
# Numeric branch: mean-impute missing values, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group to its treatment; categorical columns are forwarded
# unchanged ('passthrough').
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('cat', 'passthrough', cat_cols),
])

# Single-step wrapper around the column transformer. It is only constructed
# here; fitting and applying it is currently disabled:
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
# X_train_ = pipeline.fit_transform(X_train)
# X_val_ = pipeline.transform(X_val)

NameError: name 'ColumnTransformer' is not defined

In [13]:
# Wrap the frames in LightGBM datasets; the validation set names the training
# set as its reference.
# LightGBM rejects object-dtype columns ("pandas dtypes must be int, float or
# bool"), so the categorical-cast cell has to run before this one.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# Binary classification, evaluated with ROC AUC; -1 silences library logging.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1
}

# Without callbacks the validation metric is computed every round but never
# shown or stored. log_evaluation prints it every 10 rounds and
# record_evaluation keeps the full history in evals_result, keyed by the
# default validation-set name 'valid_0'.
evals_result = {}
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_val],
    callbacks=[
        lgb.log_evaluation(period=10),
        lgb.record_evaluation(evals_result),
    ],
)

# Validation AUC after the final boosting round (displayed as the cell output).
evals_result['valid_0']['auc'][-1]

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: ProductCD: object, card4: object, card6: object, P_emaildomain: object, R_emaildomain: object, M1: object, M2: object, M3: object, M4: object, M5: object, M6: object, M7: object, M8: object, M9: object, id_12: object, id_15: object, id_16: object, id_23: object, id_27: object, id_28: object, id_29: object, id_30: object, id_31: object, id_33: object, id_34: object, id_35: object, id_36: object, id_37: object, id_38: object, DeviceType: object, DeviceInfo: object

In [12]:
??lgb.train

[0;31mSignature:[0m
[0mlgb[0m[0;34m.[0m[0mtrain[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mparams[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain_set[0m[0;34m:[0m [0mlightgbm[0m[0;34m.[0m[0mbasic[0m[0;34m.[0m[0mDataset[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum_boost_round[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalid_sets[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mlightgbm[0m[0;34m.[0m[0mbasic[0m[0;34m.[0m[0mDataset[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalid_names[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeval[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mCallable[0m[0;34m[[0m[0;34