In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import torch, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
# Notebook-wide torch device string: "cuda" when a CUDA GPU is available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# Read the labelled training table and the unlabelled test table.
df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# The target is column "y"; the "id" column is dropped from the features.
y = df["y"]
X = df.drop(columns=["id", "y"])
XTEST = test_df.drop(columns=["id"])

In [3]:
# Show the column dtypes and non-null counts of the training table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [4]:
# Summary statistics of the numeric columns (including the target "y").
df.describe()

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,374999.5,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545,0.120651
std,216506.495284,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926,0.325721
min,0.0,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,187499.75,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,374999.5,39.0,634.0,17.0,133.0,2.0,-1.0,0.0,0.0
75%,562499.25,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0,0.0
max,749999.0,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0,1.0


In [5]:
# Number of distinct values in each object-typed (categorical) column.
df.select_dtypes("object").nunique()

job          12
marital       3
education     4
default       2
housing       2
loan          2
contact       3
month        12
poutcome      4
dtype: int64

In [6]:
# Split the feature names by dtype: numeric columns vs. object (string) columns.
num_cols = list(X.select_dtypes(include="number").columns)
cat_cols = list(X.select_dtypes(include="object").columns)

In [7]:
class FocalLoss(nn.Module):
    """Focal loss for binary classification, computed on probabilities.

    ``inputs`` must already be probabilities (e.g. a sigmoid output), not
    logits. Each element's binary cross-entropy is scaled by
    ``alpha * (1 - pt) ** gamma``, where ``pt`` is the probability assigned to
    the true class, and the mean over all elements is returned.

    Args:
        alpha: Constant factor multiplied into every element's loss. It is
            applied identically to both classes, so it rescales the loss
            rather than re-weighting positives against negatives.
        gamma: Focusing exponent; larger values shrink the contribution of
            examples that are already well classified.
    """

    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss of probabilities ``inputs`` against ``targets``.

        Args:
            inputs: Floating-point tensor of predicted probabilities.
            targets: Tensor broadcastable with ``inputs``; entries equal to 1
                are treated as the positive class when selecting ``pt``.

        Returns:
            A scalar tensor holding the mean loss.
        """
        # The clamp margin has to be representable in the tensor's dtype. A
        # fixed 1e-8 is below float32 resolution near 1.0, so ``1 - 1e-8``
        # rounds to exactly 1.0 and the upper clamp does nothing; a saturated
        # probability then yields log(0) = -inf and 0 * -inf = NaN in the loss.
        eps = max(1e-8, torch.finfo(inputs.dtype).eps)
        probs = inputs.clamp(min=eps, max=1.0 - eps)

        bce = -(targets * torch.log(probs) + (1 - targets) * torch.log(1 - probs))
        # Probability the model assigned to the true class of each element.
        pt = torch.where(targets == 1, probs, 1 - probs)
        focal = self.alpha * (1 - pt) ** self.gamma * bce
        return focal.mean()

In [8]:
class FFNN(nn.Module):
    """Feed-forward binary classifier with a skip connection.

    Two hidden layers (256 and 128 units) feed a third hidden layer (64 units)
    that receives the concatenation of both earlier activations; a final linear
    layer produces one output per row. Dropout with p=0.2 follows every hidden
    ReLU.
    """

    def __init__(self, in_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, 256)
        self.drop1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(256, 128)
        self.drop2 = nn.Dropout(0.2)
        # Input width is 256 + 128 because fc3 sees both earlier activations.
        self.fc3 = nn.Linear(256 + 128, 64)
        self.drop3 = nn.Dropout(0.2)
        self.out = nn.Linear(64, 1)

    def forward(self, x, sigmoid=True):
        """Return probabilities when ``sigmoid`` is truthy, otherwise raw outputs."""
        h1 = self.drop1(F.relu(self.fc1(x)))
        h2 = self.drop2(F.relu(self.fc2(h1)))
        skip = torch.cat([h2, h1], 1)
        h3 = self.drop3(F.relu(self.fc3(skip)))
        logits = self.out(h3)
        return torch.sigmoid(logits) if sigmoid else logits


In [9]:
def train_ffnn_oof(X_tr, y_tr, X_va, sigmoid = True, epochs=10, batch_size=768, lr=1e-3):
    """Train a fresh FFNN on one fold and predict that fold's validation rows.

    Args:
        X_tr: 2-D array-like of training features; converted to float32.
        y_tr: Array-like of training labels, one per row of ``X_tr``.
        X_va: 2-D array-like of validation features with the same columns.
        sigmoid: If truthy, the returned validation predictions are
            probabilities; otherwise they are the raw pre-sigmoid outputs.
            Training always uses the sigmoid output, because FocalLoss
            operates on probabilities.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size of the shuffled training loader.
        lr: AdamW learning rate.

    Returns:
        ``(preds, model)``: a 1-D NumPy array with one prediction per row of
        ``X_va``, and the trained model, left in eval mode on the selected
        device.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    X_tr = np.asarray(X_tr)
    model = FFNN(X_tr.shape[1]).to(device)
    opt   = optim.AdamW(model.parameters(), lr=lr)
    crit  = FocalLoss()
    ds = TensorDataset(
        torch.tensor(X_tr, dtype=torch.float32),
        # Labels become a column vector so they match the (batch, 1) model output.
        torch.tensor(np.asarray(y_tr), dtype=torch.float32).unsqueeze(1)
    )
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

    model.train()
    for _ in range(epochs):
        for xb, yb in dl:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(xb), yb)
            loss.backward()
            opt.step()

    # Eval mode disables dropout for the validation pass (and for any later
    # use of the returned model).
    model.eval()
    with torch.no_grad():
        xv = torch.tensor(np.asarray(X_va), dtype=torch.float32).to(device)
        # reshape(-1) always yields a 1-D result; squeeze() would collapse a
        # single-row validation set to a 0-d array.
        preds = model(xv, sigmoid=sigmoid).reshape(-1).cpu().numpy()
    return preds, model


In [10]:
# Preprocessing for the tree model: numeric columns are median-imputed (no
# scaling) and categorical columns are one-hot encoded into a dense array.
tree_numeric = SimpleImputer(strategy="median")
tree_categorical = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
prep_tree = ColumnTransformer(
    [
        ("num", tree_numeric, num_cols),
        ("cat", tree_categorical, cat_cols),
    ],
    remainder="drop",
)

# Preprocessing for the neural network: same as above, plus standardisation of
# the numeric columns after imputation.
nn_numeric = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ]
)
nn_categorical = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
prep_nn = ColumnTransformer(
    [
        ("num", nn_numeric, num_cols),
        ("cat", nn_categorical, cat_cols),
    ],
    remainder="drop",
)

In [11]:
# 5-fold stratified splitter with a fixed seed, shared by all base models.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One float32 out-of-fold buffer per base model, with one slot per training row.
n_train_rows = len(X)
oof_ffnn, oof_lgb, oof_cat = (
    np.zeros(n_train_rows, dtype=np.float32) for _ in range(3)
)

In [12]:
# Cross-validated training of the three base models (FFNN, LightGBM, CatBoost).
# For every fold, each model is fit on the training part, its predictions for
# the validation part are written into the matching oof_* buffer, and its
# predictions for the test set are appended to a per-model list. All stored
# predictions are raw scores rather than probabilities.
lgb_test_preds = []
cat_test_preds = []
ffnn_test_preds = []

for fold,(tr_idx,va_idx) in enumerate(kf.split(X, y)):
    print("Fold", fold+1)
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # -------- FFNN --------
    # The preprocessor is fit on the training fold only, then applied to the
    # validation fold and the test set.
    X_tr_nn = prep_nn.fit_transform(X_tr)
    X_va_nn = prep_nn.transform(X_va)
    XTEST_NN = prep_nn.transform(XTEST)

    # sigmoid=False: validation and test predictions are pre-sigmoid outputs.
    oof_ffnn[va_idx], ffnn_model = train_ffnn_oof(X_tr_nn, y_tr, X_va_nn, sigmoid=False)
    # train_ffnn_oof returns the model after calling model.eval().
    with torch.no_grad():
        Xt = torch.tensor(np.asarray(XTEST_NN), dtype=torch.float32).to(device)
        ffnn_test_preds.append(ffnn_model(Xt, sigmoid=False).cpu().squeeze().numpy())


    # -------- LightGBM --------
    # Same fit-on-train-only pattern, using the unscaled tree preprocessor.
    Xtr_tree = prep_tree.fit_transform(X_tr)
    Xva_tree = prep_tree.transform(X_va)
    XTEST_TREE = prep_tree.transform(XTEST)

    m1 = lgb.LGBMClassifier(
        objective="binary", metric="auc",
        boosting_type="gbdt", device="gpu",
        n_estimators=20000, learning_rate=0.03,
        num_leaves=128, min_data_in_leaf=250,
        feature_fraction=0.8, bagging_fraction=0.8,
        bagging_freq=1, lambda_l2=3.0, random_state=42,
    )
    # Early stopping watches AUC on this fold's validation part (patience 200);
    # log_evaluation(0) is passed to silence the per-iteration metric log.
    m1.fit(Xtr_tree, y_tr,
           eval_set=[(Xva_tree, y_va)],
           eval_metric="auc",
           callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)])
    # raw_score=True requests raw scores instead of probabilities.
    oof_lgb[va_idx] = np.asarray(m1.predict(Xva_tree, raw_score=True))
    lgb_test_preds.append(np.asarray(m1.predict(XTEST_TREE, raw_score=True)))

    # -------- CatBoost --------
    # CatBoost gets the original (non one-hot) frame: numeric columns are
    # median-imputed with an imputer fit on the training fold, and categorical
    # columns are cast to pandas "category" dtype.
    prep_cat = SimpleImputer(strategy="median").fit(X_tr[num_cols])
    Xtr_cb = X_tr.copy();   Xtr_cb[num_cols] = prep_cat.transform(X_tr[num_cols])
    Xva_cb = X_va.copy();   Xva_cb[num_cols] = prep_cat.transform(X_va[num_cols])
    XTEST_CAT = XTEST.copy(); XTEST_CAT[num_cols] = prep_cat.transform(XTEST[num_cols])

    for c in cat_cols:
        Xtr_cb[c]    = Xtr_cb[c].astype("category")
        Xva_cb[c]    = Xva_cb[c].astype("category")
        XTEST_CAT[c] = XTEST_CAT[c].astype("category")

    # Positional indices of the categorical columns, passed as cat_features.
    cat_idx = [Xtr_cb.columns.get_loc(c) for c in cat_cols]

    m2 = CatBoostClassifier(
        task_type="GPU", devices="0",
        loss_function="Logloss", eval_metric="AUC",
        iterations=10000, learning_rate=0.03, depth=8,
        l2_leaf_reg=3.0, bagging_temperature=1.0,
        random_strength=0.2, border_count=128,
        early_stopping_rounds=300, verbose=False, random_state=42
    )
    m2.fit(Xtr_cb, y_tr, eval_set=(Xva_cb, y_va),
           use_best_model=True, cat_features=cat_idx)

    # "RawFormulaVal" requests raw formula values instead of probabilities.
    oof_cat[va_idx] = np.asarray(m2.predict(Xva_cb, prediction_type="RawFormulaVal"))
    cat_test_preds.append(np.asarray(m2.predict(XTEST_CAT, prediction_type="RawFormulaVal")))
    
# -------- fold average --------
# Average each model's test-set predictions across the folds.
ffnn_test_pred = np.mean(ffnn_test_preds, axis=0)
lgb_test_pred  = np.mean(lgb_test_preds,  axis=0)
cat_test_pred  = np.mean(cat_test_preds,  axis=0)

Fold 1
[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1046
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (6.87 MB) transferred to GPU in 0.006251 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1517]	valid_0's auc: 0.970082






Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2
[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1044
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (6.87 MB) transferred to GPU in 0.005577 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1284]	valid_0's auc: 0.968888






Default metric period is 5 because AUC is/are not implemented for GPU


Fold 3
[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1042
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (6.87 MB) transferred to GPU in 0.005140 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1314]	valid_0's auc: 0.968839






Default metric period is 5 because AUC is/are not implemented for GPU


Fold 4
[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (6.87 MB) transferred to GPU in 0.007850 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1350]	valid_0's auc: 0.969824






Default metric period is 5 because AUC is/are not implemented for GPU


Fold 5
[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1043
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 51
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 11 dense feature groups (6.87 MB) transferred to GPU in 0.005190 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1490]	valid_0's auc: 0.969432






Default metric period is 5 because AUC is/are not implemented for GPU


In [13]:
# Meta-features: one column of out-of-fold scores per base model.
oof_by_model = {"ffnn": oof_ffnn, "lgb": oof_lgb, "cat": oof_cat}
X_meta = pd.DataFrame(oof_by_model)

# Re-index the target from 0 so it lines up with X_meta's default RangeIndex.
y_meta = y.reset_index(drop=True)

# NOTE(review): these names differ from X_meta's columns and are not used by
# any later cell shown in this notebook.
pred_cols = ["pred_ffnn", "pred_lgb", "pred_cat"]

In [14]:
# Hold out 15% of the out-of-fold rows (stratified on the target) so the
# meta-model can be scored on rows it was not fit on.
X_meta_tr, X_meta_test, y_meta_tr, y_meta_test = train_test_split(
    X_meta,
    y_meta,
    test_size=0.15,
    stratify=y_meta,
    random_state=42,
)

# Orient every base-model score so it correlates non-negatively with the
# target: a column is multiplied by -1 only when its correlation is negative.
# An undefined (NaN) correlation leaves the column unchanged.
flips = {
    col: -1 if np.corrcoef(X_meta_tr[col], y_meta_tr)[0, 1] < 0 else 1
    for col in X_meta_tr.columns
}
flip_s = pd.Series(flips)

# Apply the same signs to the meta-training rows and to the held-out rows.
X_meta_tr = X_meta_tr.mul(flip_s, axis=1)
X_meta_te = X_meta_test.mul(flip_s, axis=1)

In [15]:
from sklearn.pipeline import make_pipeline

# Level-2 model: standardise the three base-model scores, then fit an
# L2-regularised logistic regression with balanced class weights.
meta_scaler = StandardScaler()
meta_classifier = LogisticRegression(
    penalty="l2",
    C=0.5,
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
)
meta_model = make_pipeline(meta_scaler, meta_classifier)
meta_model.fit(X_meta_tr, y_meta_tr)

0,1,2
,steps,"[('standardscaler', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.5
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [19]:
# Score the meta-model on the held-out 15% using the positive-class probability.
holdout_proba = meta_model.predict_proba(X_meta_te)[:, 1]
hold_auc = roc_auc_score(y_meta_test, holdout_proba)
print("Meta Holdout AUC:", hold_auc)

Meta Holdout AUC: 0.9691276455529401


In [17]:
# Test-set meta-features: fold-averaged base-model scores, using the same
# column names and the same sign orientation as the meta-training table.
test_scores = pd.DataFrame(
    {"ffnn": ffnn_test_pred, "lgb": lgb_test_pred, "cat": cat_test_pred}
)
X_final_test = test_scores.mul(flip_s, axis=1)

In [18]:
# Positive-class probability from the meta-model for every test row.
test_proba = meta_model.predict_proba(X_final_test)
preds = test_proba[:, 1]

# Write the submission file: one row per test id, probabilities with 9 decimals.
submission = pd.DataFrame({"id": test_df["id"], "y": preds})
submission.to_csv("submission2.csv", index=False, float_format="%.9f")
print("submission2.csv yazıldı.")

submission2.csv yazıldı.
