In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch.utils.data import DataLoader
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib
import importlib as imp
import torch
import torch.optim as optim
import torch.nn as nn
import time
from torchvision import transforms
from torch.utils.data import Dataset
import torch.nn.functional as F
import random
import rasterio
from rasterio.windows import Window
from sklearn.utils import shuffle

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold,StratifiedKFold

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
SEED = 344
PATH_TRAIN = '/kaggle/input/icr-identify-age-related-conditions/train.csv'
PATH_TEST = '/kaggle/input/icr-identify-age-related-conditions/test.csv'
PATH_GREEKS = '/kaggle/input/icr-identify-age-related-conditions/greeks.csv'

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True # Fix the network according to random seed
    print('Finish seeding with seed {}'.format(seed))
    
seed_everything(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Training on device {}'.format(device))

Finish seeding with seed 344
Training on device cpu


In [3]:
from sklearn.metrics import make_scorer

def balanced_log_loss(y_true, y_pred):
    y_true = y_true.astype(np.int64)
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    nc = np.bincount(y_true)
    balanced_log_loss_score = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / 2
    return balanced_log_loss_score


balanced_log_loss_scorer = make_scorer(balanced_log_loss, greater_is_better=False, needs_proba=True)

In [4]:
def cross_val(classifier, features, targets, k=5):
    kf = KFold(n_splits=k)
    cv_loss = []
    for train_index, test_index in kf.split(features):
        classifier.fit(features[train_index], targets[train_index])
        predict = classifier.predict_proba(features[test_index])[:, 1]
        cv_loss.append(balanced_log_loss(targets[test_index].astype(np.int64), predict))
    print(cv_loss)
    print("CI: %0.2f (+/- %0.2f)" % (np.mean(cv_loss), np.std(cv_loss) * 2)) 

class ensembled_model:
    def __init__(self, models_array):
        self.models_array = models_array
    
    def fit(self, features, targets):
        for model in self.models_array:
            model.fit(features, targets)
    
    def predict_proba(self, features):
        predicts = []
        for model in self.models_array:
            predicts.append(model.predict_proba(features))
        return np.mean(predicts, axis=0)

# Data Processing

In [5]:
train_list = pd.read_csv(PATH_TRAIN)
test_list = pd.read_csv(PATH_TEST)
greeks_list = pd.read_csv(PATH_GREEKS)

In [6]:
cleaned_list = train_list.copy()
cleaned_list.loc[cleaned_list.loc[:, "EJ"] == 'B', 'EJ'] = 1.0
cleaned_list.loc[cleaned_list.loc[:, "EJ"] == 'A', 'EJ'] = 0.0
cleaned_list = cleaned_list.fillna(0)

greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'A', "Alpha"] = 0
greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'B', "Alpha"] = 1
greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'D', "Alpha"] = 2
greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'G', "Alpha"] = 3
cleaned_greeks = np.array(greeks_list.loc[:, "Alpha"].values, np.int64)


id_array = cleaned_list.iloc[:,0].values
cleaned_array = np.array(cleaned_list.iloc[:,1:-1].values, np.float32)
cleaned_class = np.array(cleaned_list.iloc[:,-1].values, np.float32)

#cleaned_array = cleaned_list.drop(['Id', 'Class'], axis=1).astype('float32')
#cleaned_class = cleaned_list['Class'].astype('float32')

cleaned_array.shape, cleaned_class.shape

((617, 56), (617,))

# Model Training

In [7]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'learning_rate': [0.1],
#     'scale_pos_weight': [10, 12, 14, 15],
#     'n_estimators': [100],
#     'max_depth': [3],
#     'subsample': [0.5, 0.6, 0.7],
#     'colsample_bytree': [0.5],
#     'min_child_weight': [1],
#     'gamma': [0.01, 0.05, 0.1],
#     'reg_alpha': [0.8],
#     'reg_lambda': [0.8],
#     'seed': [344],
# }

# clf = xgb.XGBClassifier()

# grid_search = GridSearchCV(clf, param_grid, scoring=balanced_log_loss_scorer, cv=5, verbose=1)
# grid_search.fit(cleaned_array, cleaned_class)
# print("Best parameters:", grid_search.best_params_)
# print("Best Results:", grid_search.best_score_)

In [8]:
from sklearn.model_selection import cross_val_score

xgb_params = {
    'objective': 'binary:logistic',
    'scale_pos_weight': 14,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 3,
    'subsample': 0.6,
    'colsample_bytree': 0.5,
    'min_child_weight': 1,
    'gamma': 0.1,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'seed': 344,
    }

clf = xgb.XGBClassifier(**xgb_params)
#clf = RandomForestClassifier(n_estimators=100, criterion='log_loss', oob_score=True)
scores = cross_val_score(clf, cleaned_array, cleaned_class, cv=5, scoring=balanced_log_loss_scorer)
print(scores)
print("CI: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[-0.13713109 -0.22461185 -0.26387109 -0.28019908 -0.22581908]
CI: -0.23 (+/- 0.10)


In [9]:
from lightgbm import LGBMClassifier

lgb_params = {
        'objective': 'binary', # 'binary', 'multiclass'
        'metric': None, # 'auc', 'multi_logloss'
        'boosting_type': 'goss',
        'learning_rate': 0.1,
        'num_leaves': 5,
        'colsample_bytree': 0.8,
        'subsample': 0.1,
        'reg_alpha': 0, 
        'reg_lambda': 0.8,
        'n_jobs': -1,
        'is_unbalance':True, 
        'verbose': -1,
        'seed': 344,
    }

In [10]:
from sklearn.model_selection import cross_val_score

clf = LGBMClassifier(**lgb_params)
#clf = RandomForestClassifier(n_estimators=100, criterion='log_loss', oob_score=True)
scores = cross_val_score(clf, cleaned_array, cleaned_class, cv=5, scoring=balanced_log_loss_scorer)
print(scores)
print("CI: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))  

[-0.15924708 -0.22781753 -0.31653464 -0.31169813 -0.23729676]
CI: -0.25 (+/- 0.12)


In [11]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
lgb_clf = LGBMClassifier(**lgb_params)
clf = ensembled_model([xgb_clf, lgb_clf])
cross_val(clf, cleaned_array, cleaned_class, k=5)

[0.16466450050906706, 0.3061835083987987, 0.28603570731555933, 0.2650673608142777, 0.2290475233797025]
CI: 0.25 (+/- 0.10)


# CV + Early Stopping + Averaging

In [12]:
def balanced_log_loss_metric(y_pred, y_true):
    y_true = y_true.get_label()
    y_true = y_true.astype(np.int64)
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    nc = np.bincount(y_true)
    balanced_log_loss_score = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / 2
    
    return [('score', balanced_log_loss_score)]

In [22]:
xgb_params = {
    'objective': 'binary:logistic',
    'scale_pos_weight': 14,
    #'weights': [1, 14],
    'learning_rate': 0.1,
    'n_estimators': 200,
    'max_depth': 3,
    'subsample': 0.6,
    'colsample_bytree': 0.5,
    'min_child_weight': 1,
    'gamma': 0.1,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'seed': 344,
    'early_stopping_rounds': 20,
    #'is_unbalance':True, 
    'feval': balanced_log_loss_metric,
    }

kf = KFold(n_splits=5, shuffle=True, random_state=14).split(cleaned_array)
#kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=34).split(cleaned_array, cleaned_greeks)
cv_loss = []
for train_index, test_index in kf:
    
    w_train = cleaned_class[train_index]
    w_train[w_train==0] = 1.214
    w_train[w_train==1] = 5.667
    w_test = cleaned_class[test_index]
    w_test[w_test==0] = 1.214
    w_test[w_test==1] = 5.667

    
    classifier = xgb.XGBClassifier(**xgb_params)
    eval_set = [(cleaned_array[test_index], cleaned_class[test_index])]
    
    classifier.fit(cleaned_array[train_index], cleaned_class[train_index],
                  eval_metric=balanced_log_loss_metric, eval_set=eval_set, verbose=10)
    predict = classifier.predict_proba(cleaned_array[test_index])[:, 1]
    cv_loss.append(balanced_log_loss(cleaned_class[test_index].astype(np.int64), predict))
print(cv_loss)
print("CI: %0.2f (+/- %0.2f)" % (np.mean(cv_loss), np.std(cv_loss) * 2)) 

[0]	validation_0-logloss:0.64965	validation_0-score:0.64106
[10]	validation_0-logloss:0.44646	validation_0-score:0.43997
[20]	validation_0-logloss:0.34521	validation_0-score:0.35804
[30]	validation_0-logloss:0.30576	validation_0-score:0.34478
[40]	validation_0-logloss:0.28351	validation_0-score:0.33777
[50]	validation_0-logloss:0.26621	validation_0-score:0.33049
[60]	validation_0-logloss:0.24972	validation_0-score:0.32068
[70]	validation_0-logloss:0.23756	validation_0-score:0.31568
[80]	validation_0-logloss:0.23914	validation_0-score:0.32651
[85]	validation_0-logloss:0.23182	validation_0-score:0.31874
[0]	validation_0-logloss:0.64945	validation_0-score:0.65037
[10]	validation_0-logloss:0.50940	validation_0-score:0.49254
[20]	validation_0-logloss:0.42152	validation_0-score:0.40336
[30]	validation_0-logloss:0.33503	validation_0-score:0.34393
[40]	validation_0-logloss:0.28126	validation_0-score:0.28654
[50]	validation_0-logloss:0.25297	validation_0-score:0.26668
[60]	validation_0-logloss:

In [14]:
from lightgbm import LGBMClassifier

lgb_params = {
    'objective': 'binary', # 'binary', 'multiclass'
    'metric': None, # 'auc', 'multi_logloss'
    'boosting_type': 'goss',
    'learning_rate': 0.1,
    'num_leaves': 5,
    'colsample_bytree': 0.8,
    'subsample': 0.1,
    'reg_alpha': 0, 
    'reg_lambda': 0.8,
    'n_jobs': -1,
    'is_unbalance':True, 
    'verbose': -1,
    'seed': 344,
    'n_estimators': 200,
    'early_stopping_round': 20,
    'gpu_use_dp': True,
    }

# lgb_params = {
#     'objective': 'binary', 
#     'metric': None, 
#     'boosting': 'goss',
#     'learning_rate': 0.0883447499631696,
#     'num_leaves': 4,
#     'colsample_bytree': 0.5014338346504184,
#     'subsample': 0.8486891010640193,
#     'reg_alpha': 3.264832774300416e-06, 
#     'reg_lambda': 8.605058359426325e-07,
#     'n_jobs': -1,
#     'is_unbalance':True, 
#     'verbose': -1,
#     'seed': 42,
#     'n_estimators': 500,
#     'early_stopping_round': 30,
# }

kf = KFold(n_splits=5)
cv_loss = []
for train_index, test_index in kf.split(cleaned_array):
    classifier = LGBMClassifier(**lgb_params)
    eval_set = [(cleaned_array[test_index], cleaned_class[test_index])]
    
    classifier.fit(cleaned_array[train_index], cleaned_class[train_index],
                   eval_set=eval_set, verbose=50)
    predict = classifier.predict_proba(cleaned_array[test_index])[:, 1]
    cv_loss.append(balanced_log_loss(cleaned_class[test_index].astype(np.int64), predict))
print(cv_loss)
print("CI: %0.2f (+/- %0.2f)" % (np.mean(cv_loss), np.std(cv_loss) * 2)) 

[50]	valid_0's binary_logloss: 0.169712
[100]	valid_0's binary_logloss: 0.112497
[150]	valid_0's binary_logloss: 0.0954708
[200]	valid_0's binary_logloss: 0.0807367
[50]	valid_0's binary_logloss: 0.273396
[100]	valid_0's binary_logloss: 0.233042




[150]	valid_0's binary_logloss: 0.200957
[50]	valid_0's binary_logloss: 0.26447
[50]	valid_0's binary_logloss: 0.247577
[100]	valid_0's binary_logloss: 0.1943
[50]	valid_0's binary_logloss: 0.223111
[100]	valid_0's binary_logloss: 0.159906




[150]	valid_0's binary_logloss: 0.13507
[200]	valid_0's binary_logloss: 0.132815
[0.14494029146164644, 0.3577257724840697, 0.3167148225846717, 0.27172811533511804, 0.23282767383441738]
CI: 0.26 (+/- 0.15)


In [15]:
import lightgbm as lgb

def balance_logloss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    y_pred / np.sum(y_pred, axis=1)[:, None]
    nc = np.bincount(y_true.astype(np.int64))
    
    logloss = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(y_pred[:,0]))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred[:,1])))) / 2
    
    return logloss

def calc_log_loss_weight(y_true):
    nc = np.bincount(y_true.astype(np.int64))
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    return w0, w1

inal_valid_predictions = {}
final_test_predictions = []
bs = []


kf = StratifiedKFold(n_splits=5).split(cleaned_array, cleaned_class)
cv_loss = []
for train_index, test_index in kf:
    
    w_train = cleaned_class[train_index]
    w_train[w_train==0] = 1.214
    w_train[w_train==1] = 5.667
    w_test = cleaned_class[test_index]
    w_test[w_test==0] = 1.214
    w_test[w_test==1] = 5.667
    
    train_dataset = lgb.Dataset(cleaned_array[train_index], cleaned_class[train_index], weight=w_train)
    eval_dataset  = lgb.Dataset(cleaned_array[test_index], cleaned_class[test_index],weight=w_test)
#     lgb_params = {
#         'objective': 'binary', 
#         'metric': None, 
#         'boosting': 'goss',
#         'learning_rate': 0.0883447499631696,
#         'num_leaves': 4,
#         'feature_fraction': 0.5014338346504184,
#         'bagging_fraction': 0.8486891010640193,
#         'lambda_l1': 3.264832774300416e-06, 
#         'lambda_l2': 8.605058359426325e-07,
#         'n_jobs': -1,
#         'is_unbalance':True, 
#         'verbose': -1,
#         'seed': 42,
#     }

    lgb_params = {
        'objective': 'binary', # 'binary', 'multiclass'
        'metric': None, # 'auc', 'multi_logloss'
        'boosting_type': 'goss',
        'learning_rate': 0.1,
        'num_leaves': 5,
        'colsample_bytree': 0.8,
        'subsample': 0.1,
        'reg_alpha': 0, 
        'reg_lambda': 0.8,
        'n_jobs': -1,
        'is_unbalance':True, 
        'verbose': -1,
        'seed': 344,
        'n_estimators': 200,
        'early_stopping_round': 20,
        'gpu_use_dp': True,
        }

    model = lgb.train(
                params = lgb_params,
                train_set = train_dataset,
                num_boost_round = 500,
                valid_sets = [train_dataset, eval_dataset],
                early_stopping_rounds = 30,
                verbose_eval = 10000,
            )

    preds_valid = model.predict(cleaned_array[test_index])
    #preds_test  = model.predict(Test[features])
    preds_valid = np.vstack([1 - preds_valid, preds_valid]).T
    #preds_test  = np.vstack([1 - preds_test, preds_test]).T
    
    #final_test_predictions.append(preds_test)
    #final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    blogloss = balance_logloss(cleaned_class[test_index], preds_valid)

    bs.append(blogloss)
    print(k, blogloss)
print('Balance Log loss:')
print(bs)
print(np.mean(bs), np.std(bs))

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[110]	training's binary_logloss: 0.050798	valid_1's binary_logloss: 0.127372




NameError: name 'k' is not defined