In [1]:
# Mount Google Drive at /content/drive; the working directory selected in the next cell lives there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the competition folder so the relative paths used later ('data/...', './cache/', 'submit/...') resolve inside it.
%cd /content/drive/MyDrive/Colab Notebooks/DACON/6.클릭율 예측 대회

/content/drive/MyDrive/Colab Notebooks/DACON/6.클릭율 예측 대회


In [3]:
# Install the modelling dependencies with `%pip` so they land in the environment of the running kernel
# (a plain `!pip` can resolve to a different interpreter).
%pip install -qq catboost==1.2.5
%pip install -qq lightgbm==4.3.0
# `imblearn` on PyPI is only an alias that pulls in `imbalanced-learn`; install the real package.
# TODO: pin the imbalanced-learn version that was used to produce the submission.
%pip install -qq imbalanced-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
import random
import time
import bisect
from tqdm import tqdm
from joblib import Memory

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    """Seed the global RNGs used by this notebook.

    Seeds NumPy's legacy global generator and the stdlib ``random`` module,
    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    seed_text = str(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    random.seed(seed)


seed_everything(42)

In [5]:
# On-disk joblib cache: functions decorated with `memory.cache` store their results under this folder.
cachedir = './cache/'
memory = Memory(location=cachedir, verbose=0)

@memory.cache
def preprocess_data(train_path, test_path, model_type='LightGBM', version=1, test_size=0.25, random_state=42):
    """Load the train/test CSVs, clean them and return a stratified train/validation split.

    The result is memoised on disk by ``memory.cache``; the cache key is built from the
    argument values (the path strings, not the file contents).

    Args:
        train_path: Path of the training CSV; it must contain the ``Click`` target column.
        test_path: Path of the test CSV.
        model_type: ``'LightGBM'`` or ``'CatBoost'``. Selects which columns are dropped and
            whether the categorical columns are label-encoded (LightGBM only).
        version: ``1`` only fills missing values. ``2`` first removes sparse training rows,
            then fills missing values and finally drops duplicated training rows.
        test_size: Fraction of the training rows held out for validation.
        random_state: Seed of the stratified split.

    Returns:
        ``(X_train, X_valid, y_train, y_valid, test, cat_cols)`` where ``cat_cols`` is the
        list of categorical columns that remain after the model-specific drop.

    Raises:
        ValueError: If ``model_type`` or ``version`` is not one of the supported values.
    """
    drop_by_model = {
        'LightGBM': ['ID', 'F35', 'F23', 'F38'],
        'CatBoost': ['ID', 'F30', 'F22', 'F23'],
    }
    # Fail fast with a clear message instead of an UnboundLocalError / KeyError further down.
    if model_type not in drop_by_model:
        raise ValueError(f"model_type must be one of {sorted(drop_by_model)}, got {model_type!r}")
    if version not in (1, 2):
        raise ValueError(f"version must be 1 or 2, got {version!r}")
    columns_to_drop = drop_by_model[model_type]

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    num_cols = ['F04', 'F06', 'F11', 'F14', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32', 'F33', 'F36', 'F38']
    cat_cols = ['ID', 'F01', 'F02', 'F03', 'F05', 'F07', 'F08', 'F09', 'F10', 'F12', 'F13', 'F15', 'F16', 'F17',
                'F20', 'F21', 'F22', 'F23', 'F25', 'F26', 'F28', 'F30', 'F31', 'F34', 'F35', 'F37', 'F39']

    train = train.drop(columns_to_drop, axis=1)
    test = test.drop(columns_to_drop, axis=1)
    num_cols = [col for col in num_cols if col not in columns_to_drop]
    cat_cols = [col for col in cat_cols if col not in columns_to_drop]

    if version == 2:
        # Drop training rows where all four of these features are missing ...
        train = train.dropna(subset=['F32', 'F29', 'F36', 'F24'], how='all')
        # ... and rows with `threshold` or more missing values overall.
        threshold = 10
        train = train.dropna(thresh=train.shape[1] - threshold + 1)

    # Missing numerics become 0, missing categoricals become the sentinel label '-1'.
    train[num_cols] = train[num_cols].fillna(0)
    test[num_cols] = test[num_cols].fillna(0)
    train[cat_cols] = train[cat_cols].fillna('-1')
    test[cat_cols] = test[cat_cols].fillna('-1')

    if version == 2:
        train = train.drop_duplicates()

    # Downcast every 64-bit numeric column to int32 (the target is excluded on the test side).
    # NOTE(review): this truncates any fractional part of float columns - confirm the features are integral.
    bit_cols = train.select_dtypes(include=['float64', 'int64']).columns
    train[bit_cols] = train[bit_cols].astype('int32')
    bit_cols = bit_cols.drop('Click')
    test[bit_cols] = test[bit_cols].astype('int32')

    train[cat_cols] = train[cat_cols].astype('category')
    test[cat_cols] = test[cat_cols].astype('category')

    if model_type == 'LightGBM':
        for feature in tqdm(cat_cols, desc="Encoding features"):
            le = LabelEncoder()
            train[feature] = le.fit_transform(train[feature].astype(str))

            # Freeze the label -> code table learned on train and reuse it unchanged for test.
            # Inserting '-1' into `le.classes_` after the fit (as was done before) shifts or
            # duplicates class positions, so test codes stopped matching the train codes.
            code_of = {label: code for code, label in enumerate(le.classes_)}
            # Labels unseen in train share the code of '-1' when train has it,
            # otherwise they get one fresh code past the last train code.
            unknown_code = code_of.get('-1', len(code_of))
            # Compare as strings: the encoder was fitted on stringified values.
            test_labels = test[feature].astype(str)
            test[feature] = test_labels.map(code_of).fillna(unknown_code).astype('int64')

    X_train, X_valid, y_train, y_valid = train_test_split(
        train.drop('Click', axis=1),
        train['Click'],
        test_size=test_size,
        random_state=random_state,
        stratify=train['Click'],
    )

    return X_train, X_valid, y_train, y_valid, test, cat_cols

TRAIN_CSV, TEST_CSV = 'data/train.csv', 'data/test.csv'

# LightGBM splits (label-encoded categoricals) for preprocessing versions 1 and 2.
X_train_l1, X_valid_l1, y_train_l1, y_valid_l1, test_l1, _ = preprocess_data(
    TRAIN_CSV, TEST_CSV, model_type='LightGBM', version=1)
X_train_l2, X_valid_l2, y_train_l2, y_valid_l2, test_l2, _ = preprocess_data(
    TRAIN_CSV, TEST_CSV, model_type='LightGBM', version=2)

# CatBoost splits; the returned categorical column lists are kept for `cat_features`.
X_train_c1, X_valid_c1, y_train_c1, y_valid_c1, test_c1, cat_cols_c1 = preprocess_data(
    TRAIN_CSV, TEST_CSV, model_type='CatBoost', version=1)
X_train_c2, X_valid_c2, y_train_c2, y_valid_c2, test_c2, cat_cols_c2 = preprocess_data(
    TRAIN_CSV, TEST_CSV, model_type='CatBoost', version=2)

In [6]:
def undersample_data(X_train, y_train, random_state=42):
    """Randomly under-sample the training set with ``RandomUnderSampler``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed handed to the sampler.

    Returns:
        The resampled ``(features, labels)`` pair.
    """
    sampler = RandomUnderSampler(random_state=random_state)
    resampled_features, resampled_labels = sampler.fit_resample(X_train, y_train)
    return resampled_features, resampled_labels

# CatBoost

In [7]:
class RFLBinaryObjective:
    """RFL loss for binary classification, exposed through CatBoost's custom-objective protocol.

    Source: https://github.com/Luojiaqimath/Robust-GBDT/blob/main/rfl_loss.py
    Reference: https://github.com/catboost/catboost/blob/master/catboost/tutorials/custom_loss/custom_loss_and_metric_tutorial.ipynb

    With ``r == 0`` and ``q == 0`` the formulas reduce to ``grad = p - 1`` and
    ``hess = p * (1 - p)`` where ``p`` is the probability of the true class.

    NOTE(review): the referenced CatBoost tutorial returns derivatives of the function being
    maximised (``der1 = target - p``, ``der2 = -p * (1 - p)``), whereas the values returned here
    use the opposite sign - confirm the sign convention expected by CatBoost.

    Args:
        r: First shape parameter of the loss (selects the formula branch together with ``q``).
        q: Second shape parameter of the loss.
        clip: When true, second derivatives are floored at ``epsilon``.
    """
    def __init__(self, r, q, clip=False):
        self.r = r
        self.q = q
        self.clip = clip
        self.epsilon = 1e-16

    def calc_ders_range(self, preds, labels, weights):
        """Return a list of ``(first derivative, second derivative)`` pairs, one per sample.

        Args:
            preds: Raw scores; any indexable sequence of floats.
            labels: Binary targets (0 or 1); any indexable sequence.
            weights: Optional per-sample weights, or ``None``. When given, both
                derivatives of each sample are multiplied by its weight.
        """
        # Convert to arrays so plain sequences work, not only ndarrays.
        preds = np.asarray(preds, dtype=float)
        labels = np.asarray(labels)

        # pt is the predicted probability of the TRUE class of each sample.
        pt = self.sigmoid(preds)
        negatives = labels == 0
        pt[negatives] = 1 - pt[negatives]

        grad = (2*labels-1) * self.grad(pt)
        hess = self.hess1(pt) + self.hess2(pt)

        if weights is not None:
            weights = np.asarray(weights, dtype=float)
            grad = grad * weights
            hess = hess * weights

        if self.clip:
            hess = np.maximum(hess, self.epsilon)

        return list(zip(grad, hess))

    def grad(self, p):
        """First-derivative term as a function of the true-class probability ``p``.

        Entries of ``p`` outside the open interval (0, 1) yield 0.
        """
        r, q = self.r, self.q
        result = np.zeros(p.shape)
        inside = (0 < p) & (p < 1)
        p1 = p[inside]
        if r > 0 and q > 0:
            result[inside] = (r*p1*((1-p1)**r)*(p1**q-1)-q*(p1**q)*(1-p1)**(r+1))/q
        elif r == 0 and q > 0:
            result[inside] = (p1**q)*(p1-1)
        elif r > 0 and q == 0:
            p1 = np.clip(p1, 1e-9, 1)  # keep log(p1) finite
            result[inside] = ((1-p1)**r)*(r*p1*np.log(p1)+p1-1)
        else:
            result[inside] = p1-1
        return result

    def hess1(self, p):
        """First part of the second derivative; added to ``hess2`` by ``calc_ders_range``."""
        r, q = self.r, self.q
        result = np.zeros(p.shape)
        inside = (0 < p) & (p < 1)
        p1 = p[inside]
        if r > 0 and q > 0:
            result[inside] = ((1-p1)**r)*(-r*(r-1)*(p1**q-1)*p1**2-q*(q-1)*(p1**q)*(p1-1)**2-2*r*q*p1**(q+1)*(p1-1))/q
        elif r == 0 and q > 0:
            result[inside] = (1-q)*(p1**q)*(p1-1)**2
        elif r > 0 and q == 0:
            p1 = np.clip(p1, 1e-12, 1)  # keep log(p1) finite
            result[inside] = ((1-p1)**r)*(-r*(r-1)*p1**2*np.log(p1)-2*r*p1*(p1-1)+(p1-1)**2)
        else:
            result[inside] = (p1-1)**2
        return result

    def hess2(self, p):
        """Second part of the second derivative; added to ``hess1`` by ``calc_ders_range``."""
        r, q = self.r, self.q
        result = np.zeros(p.shape)
        inside = (0 < p) & (p < 1)
        p1 = p[inside]
        if r > 0 and q > 0:
            result[inside] = (1-2*p1)*(r*((1-p1)**r)*(p1**q-1)*p1-q*(p1**q)*((1-p1)**(r+1)))/q
        elif r == 0 and q > 0:
            result[inside] = -(p1**q)*(p1-1)*(2*p1-1)
        elif r > 0 and q == 0:
            p1 = np.clip(p1, 1e-12, 1)  # keep log(p1) finite
            result[inside] = ((1-p1)**r)*(1-2*p1)*(r*p1*np.log(p1)+p1-1)
        else:
            result[inside] = (1-p1)*(2*p1-1)
        return result

    def sigmoid(self, x):
        """Logistic function with the exponent capped at 88.7 and ``epsilon`` added to the denominator."""
        x = np.minimum(-x, 88.7)
        return 1 / (1 + np.exp(x) + self.epsilon)

In [26]:
def Catboost_tp_func(train_data, train_labels, valid_data, valid_labels, test_data, cat_features, snapshot_file,
                     custom_loss=None, iterations=None, early_stopping_rounds=10):
    """Train a CatBoost classifier with AUC-based early stopping and score the test set.

    Args:
        train_data: Training features.
        train_labels: Training labels.
        valid_data: Validation features used as the evaluation set.
        valid_labels: Validation labels.
        test_data: Features to score.
        cat_features: Categorical feature names passed to ``fit``.
        snapshot_file: File name used for CatBoost's training snapshot.
        custom_loss: Optional loss passed as ``loss_function``. When truthy, the model is built
            without the GPU settings; otherwise CatBoost's default loss is trained on GPU 0.
        iterations: Maximum number of boosting iterations. ``None`` keeps the per-branch
            defaults (10000 with ``custom_loss``, 1 without).
            NOTE(review): the default of a single iteration without ``custom_loss`` looks like a
            smoke-test leftover - pass ``iterations`` explicitly for a real run.
        early_stopping_rounds: Early-stopping patience passed to ``fit``.

    Returns:
        Column 1 of ``predict_proba(test_data)``.
    """
    # Settings shared by both model variants.
    params = dict(
        custom_metric=['AUC'],
        eval_metric='AUC',
        learning_rate=0.3,
        random_seed=42,
        save_snapshot=True,
        snapshot_file=snapshot_file,
    )

    if custom_loss:
        # No task_type here, so CatBoost's default device is used.
        # NOTE(review): presumably because Python objectives cannot run on GPU - confirm.
        params.update(iterations=10000, verbose=1, loss_function=custom_loss)
    else:
        params.update(iterations=1, verbose=5, task_type='GPU', devices='0')

    if iterations is not None:
        params['iterations'] = iterations

    model = CatBoostClassifier(**params)

    model.fit(
        train_data, train_labels,
        eval_set=(valid_data, valid_labels),
        cat_features=cat_features,
        use_best_model=True,
        early_stopping_rounds=early_stopping_rounds
    )

    return model.predict_proba(test_data)[:, 1]

In [None]:
X_train_new_c1, y_train_new_c1 = undersample_data(X_train_c1, y_train_c1)

# CatBoost with its default loss on the under-sampled version-1 split.
p1 = Catboost_tp_func(
    train_data=X_train_new_c1,
    train_labels=y_train_new_c1,
    valid_data=X_valid_c1,
    valid_labels=y_valid_c1,
    test_data=test_c1,
    cat_features=cat_cols_c1,
    snapshot_file='model_snapshot(LogLoss-rus-v1-t).model',
)

In [None]:
# p2 = Catboost_tp_func(X_train_c1,
#                       y_train_c1,
#                       X_valid_c1,
#                       y_valid_c1,
#                       test_c1,
#                       cat_cols_c1,
#                       'model_snapshot(RFL-v1-t).model',
#                       custom_loss=RFLBinaryObjective(r=1.0, q=0.5, clip=True)
#                       )

In [None]:
X_train_new_c2, y_train_new_c2 = undersample_data(X_train_c2, y_train_c2)

# CatBoost with its default loss on the under-sampled version-2 split.
p3 = Catboost_tp_func(
    train_data=X_train_new_c2,
    train_labels=y_train_new_c2,
    valid_data=X_valid_c2,
    valid_labels=y_valid_c2,
    test_data=test_c2,
    cat_features=cat_cols_c2,
    snapshot_file='model_snapshot(LogLoss-rus-v2-t).model',
)

In [None]:
# CatBoost with the RFL custom objective on the full (not under-sampled) version-2 split.
rfl_objective_v2 = RFLBinaryObjective(r=1.0, q=0.5, clip=True)
p4 = Catboost_tp_func(
    train_data=X_train_c2,
    train_labels=y_train_c2,
    valid_data=X_valid_c2,
    valid_labels=y_valid_c2,
    test_data=test_c2,
    cat_features=cat_cols_c2,
    snapshot_file='model_snapshot(RFL-v2-t).model',
    custom_loss=rfl_objective_v2,
)

# LGBM

In [13]:
# Write the NVIDIA OpenCL ICD entry under /etc/OpenCL/vendors (creating the folder if needed).
# NOTE(review): presumably needed so LightGBM's device='gpu' backend can find the GPU on Colab - confirm.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd

In [14]:
class RFLBinary():
    """RFL loss for binary classification as a ``(labels, preds) -> (grad, hess)`` callable.

    Source: https://github.com/Luojiaqimath/Robust-GBDT/blob/main/rfl_loss.py

    Args:
        r: First shape parameter of the loss (selects the formula branch together with ``q``).
        q: Second shape parameter of the loss.
        sklearn: Stored on the instance; not read by any method of this class.
        clip: When true, the returned hessian is floored at ``epsilon``.
    """
    def __init__(self, r, q, sklearn=False, clip=False):
        self.r = r
        self.q = q
        self.clip = clip
        self.sklearn = sklearn
        self.epsilon = 1e-16

    def __call__(self, labels, preds):
        """Return ``(grad, hess)`` arrays for raw scores ``preds`` and 0/1 ``labels``."""
        # Probability assigned to the true class of every sample.
        prob_true = self.sigmoid(preds)
        is_negative = labels == 0
        prob_true[is_negative] = 1 - prob_true[is_negative]

        sign = 2*labels - 1
        gradient = sign*self.grad(prob_true)
        hessian = self.hess1(prob_true) + self.hess2(prob_true)

        if not self.clip:
            return gradient, hessian
        return gradient, np.maximum(hessian, self.epsilon)

    def grad(self, p):
        """First-derivative term; entries of ``p`` outside (0, 1) yield 0."""
        out = np.zeros(p.shape)
        interior = (0 < p) & (p < 1)
        v = p[interior]
        if self.r > 0 and self.q > 0:
            left = self.r*v*((1-v)**self.r)*(v**self.q-1)
            right = self.q*(v**self.q)*(1-v)**(self.r+1)
            out[interior] = (left-right)/self.q
        elif self.r == 0 and self.q > 0:
            out[interior] = (v**self.q)*(v-1)
        elif self.r > 0 and self.q == 0:
            v = np.clip(v, 1e-9, 1)  # keep the logarithm finite
            out[interior] = ((1-v)**self.r)*(self.r*v*np.log(v)+v-1)
        else:
            out[interior] = v-1
        return out

    def hess1(self, p):
        """First part of the second derivative (summed with ``hess2`` in ``__call__``)."""
        out = np.zeros(p.shape)
        interior = (0 < p) & (p < 1)
        v = p[interior]
        if self.r > 0 and self.q > 0:
            out[interior] = ((1-v)**self.r)*(
                -self.r*(self.r-1)*(v**self.q-1)*v**2
                - self.q*(self.q-1)*(v**self.q)*(v-1)**2
                - 2*self.r*self.q*v**(self.q+1)*(v-1)
            )/self.q
        elif self.r == 0 and self.q > 0:
            out[interior] = (1-self.q)*(v**self.q)*(v-1)**2
        elif self.r > 0 and self.q == 0:
            v = np.clip(v, 1e-12, 1)  # keep the logarithm finite
            out[interior] = ((1-v)**self.r)*(
                -self.r*(self.r-1)*v**2*np.log(v)
                - 2*self.r*v*(v-1)
                + (v-1)**2
            )
        else:
            out[interior] = (v-1)**2
        return out

    def hess2(self, p):
        """Second part of the second derivative (summed with ``hess1`` in ``__call__``)."""
        out = np.zeros(p.shape)
        interior = (0 < p) & (p < 1)
        v = p[interior]
        if self.r > 0 and self.q > 0:
            out[interior] = (1-2*v)*(
                self.r*((1-v)**self.r)*(v**self.q-1)*v
                - self.q*(v**self.q)*((1-v)**(self.r+1))
            )/self.q
        elif self.r == 0 and self.q > 0:
            out[interior] = -(v**self.q)*(v-1)*(2*v-1)
        elif self.r > 0 and self.q == 0:
            v = np.clip(v, 1e-12, 1)  # keep the logarithm finite
            out[interior] = ((1-v)**self.r)*(1-2*v)*(self.r*v*np.log(v)+v-1)
        else:
            out[interior] = (1-v)*(2*v-1)
        return out

    def sigmoid(self, x):
        """Logistic function with the exponent capped at 88.7 and ``epsilon`` added to the denominator."""
        capped = np.minimum(-x, 88.7)
        return 1 / (1 + np.exp(capped)+self.epsilon)

In [15]:
def sigmoid(x):
    """Logistic function of ``x``.

    The exponent ``-x`` is capped at 88.7 and a tiny constant is added to the
    denominator, so very negative inputs return a small positive value instead
    of overflowing.
    """
    tiny = 1e-16
    exponent = np.minimum(-x, 88.7)
    return 1 / (1 + np.exp(exponent)+tiny)

def predict_proba(model, X):
    """Turn the output of ``model.predict(X)`` into a two-column probability array.

    The predictions are passed through ``sigmoid``; column 0 of the result holds
    ``1 - sigmoid(prediction)`` and column 1 holds ``sigmoid(prediction)``.
    """
    positive = sigmoid(model.predict(X)).reshape(-1, 1)
    negative = 1 - positive
    return np.concatenate((negative, positive), 1)

In [27]:
def LGBM_tp_func(train_data, train_labels, valid_data, valid_labels, test_data, stopping_rounds=110, objective=None,
                 device='gpu'):
    """Train a LightGBM classifier with AUC-based early stopping and score the test set.

    Args:
        train_data: Training features.
        train_labels: Training labels.
        valid_data: Validation features used as the evaluation set.
        valid_labels: Validation labels.
        test_data: Features to score.
        stopping_rounds: Early-stopping patience.
        objective: ``None`` (uses ``'binary'``), a built-in objective name, or a callable
            custom objective.
        device: Value of LightGBM's ``device`` parameter.

    Returns:
        Positive-class scores for ``test_data``. With a callable objective the model's raw
        scores are squashed through the module-level ``predict_proba`` helper.
    """
    LGBM = lgb.LGBMClassifier(
        objective=objective if objective else 'binary',
        n_estimators=10000,
        random_state=42,
        device=device
    )

    LGBM.fit(
        train_data, train_labels,
        eval_set=[(valid_data, valid_labels)],
        # LightGBM also evaluates the objective's default metric; without first_metric_only the
        # callback stops on whichever metric stalls first instead of on the requested AUC.
        callbacks=[lgb.early_stopping(stopping_rounds=stopping_rounds, first_metric_only=True)],
        eval_metric='auc',
    )

    # Only a callable objective makes the model return raw scores; built-in objective names
    # (strings) already give probabilities from LGBMClassifier.predict_proba.
    if callable(objective):
        return predict_proba(LGBM, test_data)[:, 1]
    return LGBM.predict_proba(test_data)[:, 1]

In [None]:
X_train_new_l1, y_train_new_l1 = undersample_data(X_train_l1, y_train_l1)

# LightGBM with the built-in binary objective on the under-sampled version-1 split.
p5 = LGBM_tp_func(
    train_data=X_train_new_l1,
    train_labels=y_train_new_l1,
    valid_data=X_valid_l1,
    valid_labels=y_valid_l1,
    test_data=test_l1,
)

In [None]:
# p6 = LGBM_tp_func(X_train_l1, y_train_l1, X_valid_l1, y_valid_l1, test_l1, objective=RFLBinary(r=1.0, q=0.5, clip=True))

In [None]:
X_train_new_l2, y_train_new_l2 = undersample_data(X_train_l2, y_train_l2)

# LightGBM with the built-in binary objective on the under-sampled version-2 split.
p7 = LGBM_tp_func(
    train_data=X_train_new_l2,
    train_labels=y_train_new_l2,
    valid_data=X_valid_l2,
    valid_labels=y_valid_l2,
    test_data=test_l2,
)

In [None]:
# p8 = LGBM_tp_func(X_train_l2, y_train_l2, X_valid_l2, y_valid_l2, test_l2, objective=RFLBinary(r=1.0, q=0.5, clip=True))

In [24]:
# Weighted soft-voting ensemble: p3 counts three times, the other four models once (weights sum to 7).
submission = pd.read_csv('data/sample_submission.csv')
submission['Click'] = (p1 + (3*p3) + p4 + p5 + p7) / 7
# `to_csv` does not create missing folders, so make sure the output directory exists first.
os.makedirs('submit', exist_ok=True)
submission.to_csv('submit/soft_ens.csv', index=False)