The approach contains the following components:
- Use an ensemble of TabPFN and (default) XGBoost
- Reweight the probabilites to match the balanced log loss used in this competition
- Use median nan imputation
- Use the time column from the training data, and use a time (max time in training) + 1 for test
- Use all four classes provided in greeks.Alpha and aggregate probabilites for the latter three classes, as they all correspond to different illnesses

In [1]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/d/stajdi/pip-packages-icr/pip-packages
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/d/stajdi/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/d/stajdi/pip-packages-icr/pip-packages
Processing /kaggle/input/d/stajdi/pip-packages-icr/pip-packages/tabpfn-0.1.9-py3-none-any.whl
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9
[0m

In [2]:
import numpy as np
import pandas as pd
import json

import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import random
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings("ignore")



In [3]:
SEED = 344

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    print('Finish seeding with seed {}'.format(seed))
    
seed_everything(SEED)

Finish seeding with seed 344


In [4]:
# LOAD THE DATA

BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions'
# Import data directly as H2O frame
maindf = pd.read_csv(f'{BASE_DIR}/train.csv')
greeksdf = pd.read_csv(f'{BASE_DIR}/greeks.csv')
testdf = pd.read_csv(f'{BASE_DIR}/test.csv')

print(maindf.EJ.unique())
first_cat = maindf.EJ.unique()[0]
maindf.EJ = maindf.EJ.eq(first_cat).astype('int')
testdf.EJ = testdf.EJ.eq(first_cat).astype('int')

['B' 'A']


In [5]:
PATH_TRAIN = '/kaggle/input/icr-identify-age-related-conditions/train.csv'
PATH_TEST = '/kaggle/input/icr-identify-age-related-conditions/test.csv'
PATH_GREEKS = '/kaggle/input/icr-identify-age-related-conditions/greeks.csv'

train_list = pd.read_csv(PATH_TRAIN)
test_list = pd.read_csv(PATH_TEST)
greeks_list = pd.read_csv(PATH_GREEKS)

cleaned_list = train_list.copy()
cleaned_list.loc[cleaned_list.loc[:, "EJ"] == 'B', 'EJ'] = 1.0
cleaned_list.loc[cleaned_list.loc[:, "EJ"] == 'A', 'EJ'] = 0.0
#cleaned_list = cleaned_list.fillna(0)

greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'A', "Alpha"] = 0
greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'B', "Alpha"] = 1
greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'D', "Alpha"] = 2
greeks_list.loc[greeks_list.loc[:, "Alpha"] == 'G', "Alpha"] = 3
cleaned_greeks = np.array(greeks_list.loc[:, "Alpha"].values, np.int64)


id_array = cleaned_list.iloc[:,0].values
cleaned_array = np.array(cleaned_list.iloc[:,1:-1].values, np.float32)
cleaned_class = np.array(cleaned_list.iloc[:,-1].values, np.float32)

#cleaned_array = cleaned_list.drop(['Id', 'Class'], axis=1).astype('float32')
#cleaned_class = cleaned_list['Class'].astype('float32')

cleaned_array.shape, cleaned_class.shape

((617, 56), (617,))

In [6]:
cleaned_test_list = test_list.copy()
cleaned_test_list.loc[cleaned_test_list.loc[:, "EJ"] == 'B', 'EJ'] = 1.0
cleaned_test_list.loc[cleaned_test_list.loc[:, "EJ"] == 'A', 'EJ'] = 0.0
cleaned_test_list = cleaned_test_list.fillna(0)

test_id_array = cleaned_test_list.iloc[:,0].values
cleaned_test_array = np.array(cleaned_test_list.iloc[:,1:].values, np.float32)
cleaned_test_array.shape

(5, 56)

In [7]:
# Greeks contains time information that we can use, we just need to parse it to int / nan.

from datetime import date, datetime
times = greeksdf.Epsilon.copy()
times[greeksdf.Epsilon != 'Unknown'] = greeksdf.Epsilon[greeksdf.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeksdf.Epsilon == 'Unknown'] = np.nan

In [8]:
# Set predictor and target columns
target = 'Class'
predictors = [n for n in maindf.columns if n != target and n != 'Id']

# Model

In [9]:
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset
import torch.nn.functional as F

momentum = 0.4
track_running_stats = True
affine = True

class inputFC(nn.Module):
    def __init__(self, channels_in=56):
        super().__init__()

        self.seq = nn.Sequential(
            nn.Linear(channels_in, 16*16),
        )
        
    def forward(self, x):
        x = self.seq(x)
        return x.view(-1, 16, 16)

class CNN1D(nn.Module):
    def __init__(self, channels_out=4):
        super().__init__()

        self.seq = nn.Sequential(
            nn.Conv1d(16, 32, 3, padding=0),
            nn.BatchNorm1d(32, momentum=momentum, track_running_stats=track_running_stats, affine=affine),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2),
            
            nn.Conv1d(32, 64, 3, padding=0),
            nn.BatchNorm1d(64, momentum=momentum, track_running_stats=track_running_stats, affine=affine),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2),
            
#             nn.Conv1d(256, 1024, 3, padding=0),
#             nn.BatchNorm1d(1024, momentum=momentum, track_running_stats=track_running_stats, affine=affine),
#             nn.ReLU(inplace=True),
#             nn.MaxPool1d(2),

            #nn.Conv1d(256, 64, 1, padding=0),
            #nn.BatchNorm1d(64, momentum=momentum, track_running_stats=track_running_stats, affine=affine),
            #nn.ReLU(inplace=True),
            nn.Flatten(),
            nn.Linear(128, channels_out)
        )
        
    def forward(self, x):
        x = self.seq(x)
        return x
    
class NetCNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.seq = nn.Sequential(
            inputFC(),
            CNN1D()
        )
        
    def forward(self, x):
        return self.seq(x)

In [10]:
from sklearn.base import BaseEstimator
from torch.utils.data import TensorDataset, DataLoader

class CNNClassifier(BaseEstimator):
    def __init__(self, batchsize = 16, epochs=50, device = 'cuda'):
        self.clf = NetCNN().to(device)
        self.bs = batchsize
        self.n_eopchs = epochs
        self.device = device
        
    def fit(self, X, y):
        X = torch.tensor(X).to(self.device)
        y = torch.tensor(y).to(self.device)
        
        train_dataset = TensorDataset(X,y)
        loader = DataLoader(dataset=train_dataset,batch_size=self.bs,shuffle=True,num_workers=0,)
        
        optimizer = optim.Adam(self.clf.parameters(), lr=1e-3)
        loss_fn = nn.CrossEntropyLoss()
        
        for epoch in range(0, self.n_eopchs):
            for X, y in loader:
                X = X.to(self.device)
                y = y.to(self.device)

                outputs = self.clf(X)
                #outputs = model(imgs.unsqueeze(1))
                loss = loss_fn(outputs, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
    def predict_proba(self, X):
        X = torch.tensor(X).to(self.device)
        
        test_dataset = TensorDataset(X)
        loader = DataLoader(dataset=test_dataset,batch_size=self.bs, shuffle=False,num_workers=0,)
    
        p = []
        
        for X in loader:
            X = X[0].to(self.device)
            
            with torch.no_grad():
                outputs = self.clf(X)
                p.append(torch.nn.functional.softmax(outputs, dim = 1).to('cpu').numpy())
                
                
        
        p = np.concatenate([x for x in p], axis=0)
        return p
        class_0_est_instances = p[:,0].sum()
        others_est_instances = p[:,1:].sum()
        # we reweight the probs, since the loss is also balanced like this
        # our models out of the box optimize CE
        # with these changes they optimize balanced CE
        new_p = p * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(p.shape[1])]])
        return new_p / np.sum(new_p,axis=1,keepdims=1)

In [11]:
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from catboost import Pool, CatBoostClassifier
import xgboost
from tabpfn import TabPFNClassifier
#import tabpfn

xgb_params = {
    #'objective': 'multi:softprob',
    #'scale_pos_weight': 14,
    #'num_class': 4,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 7,
    'subsample': 0.5,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'gamma': 0,
    'reg_alpha': 0.5,
    'reg_lambda': 0.1,
    'seed': 344,
    }

class WeightedEns(BaseEstimator):
    def __init__(self):
        self.classifiers = [xgboost.XGBClassifier(**xgb_params), xgboost.XGBClassifier(), 
                            TabPFNClassifier(N_ensemble_configurations=64, device='cuda'), 
                            TabPFNClassifier(N_ensemble_configurations=24, device='cuda'),
                            CNNClassifier(batchsize = 16, epochs=100),]
        self.imp = SimpleImputer(missing_values=np.nan, strategy='median')
        #self.imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
    
    def fit(self, X, y):
        cls, y = np.unique(y, return_inverse=True)
        self.classes_ = cls
        X = self.imp.fit_transform(X)
        for cl in self.classifiers:
            if cl==self.classifiers[2] or cl==self.classifiers[3]:
                cl.fit(X,y, overwrite_warning=True)
            else:
                cl.fit(X,y)
    
    def predict_proba(self, X):
        X = self.imp.transform(X)
        ps = np.stack([cl.predict_proba(X) for cl in self.classifiers])
        p = np.mean(ps,axis=0)
        class_0_est_instances = p[:,0].sum()
        others_est_instances = p[:,1:].sum()
        # we reweight the probs, since the loss is also balanced like this
        # our models out of the box optimize CE
        # with these changes they optimize balanced CE
        new_p = p * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(p.shape[1])]])
        return new_p / np.sum(new_p,axis=1,keepdims=1)
        #return p

In [12]:
pred_and_time = pd.concat((maindf[predictors], times), 1)

In [13]:
test_predictors = np.array(testdf[predictors])
test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors),1)) + pred_and_time.Epsilon.max()+1),1)

In [14]:
# m = WeightedEns()
# m.fit(np.array(pred_and_time),np.array(greeksdf['Alpha']))
# p = m.predict_proba(test_pred_and_time)
# assert (m.classes_[0] == 'A')
# p = np.concatenate((p[:,:1],np.sum(p[:,1:],1,keepdims=True)), 1)
# result_df = pd.concat((testdf['Id'],pd.DataFrame(p, columns=('class_0', 'class_1'))),axis=1)
# result_df.to_csv('submission.csv',index=False)

In [15]:
ros = RandomOverSampler(random_state=344)
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

train_ros, target_ros = ros.fit_resample(cleaned_array, greeks.Alpha)
print('Original dataset shape')
print(greeks.Alpha.value_counts())
print('Resample dataset shape')
print(target_ros.value_counts())

num_targets_ros = np.array([0 if x == 'A' else 1 for x in target_ros])

Original dataset shape
A    509
B     61
G     29
D     18
Name: Alpha, dtype: int64
Resample dataset shape
B    509
A    509
D    509
G    509
Name: Alpha, dtype: int64


# CV training

In [16]:
from sklearn.model_selection import KFold,StratifiedKFold
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib

def balanced_log_loss(y_true, y_pred):
    y_true = y_true.astype(np.int64)
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    nc = np.bincount(y_true)
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    balanced_log_loss_score = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / 2
    #balanced_log_loss_score = (-w0/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - w1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / (w0+w1)
    
    return balanced_log_loss_score

features = np.array(cleaned_array)
targets = np.array(greeksdf['Alpha'])
kf = StratifiedKFold(n_splits=5).split(cleaned_array, cleaned_greeks)
num_targets = np.array([0 if x == 'A' else 1 for x in targets])

# features = np.array(train_ros)
# targets = np.array(target_ros)
# kf = StratifiedKFold(n_splits=5).split(features, targets)
# num_targets = num_targets_ros

cv_loss = []
test_preds = []
classifier = WeightedEns()

for train_index, test_index in kf:
    
    classifier.fit(features[train_index], targets[train_index])

    p = classifier.predict_proba(features[test_index])
    p = np.sum(p[:,1:],1)
    
    p[p < 0.02] = 0.0
    p[p > 0.98] = 1.0
    
#     print(pd.DataFrame(p, num_targets[test_index].astype(np.int64)))
#     print(max(abs(p - num_targets[test_index].astype(np.int64))))
    
#     for p_index in range(0, len(p)):
#         if p[p_index] > 0.7 and p[p_index] <= 0.9:
#             p[p_index] = 0.9
#         elif p[p_index] > 0.9 and p[p_index] <= 0.98:
#             p[p_index] = 0.98
#         elif p[p_index] > 0.98:
#             p[p_index] = 1.0
#         elif p[p_index] < 0.3 and p[p_index] >= 0.1:
#             p[p_index] = 0.1
#         elif p[p_index] < 0.1 and p[p_index] >= 0.02:
#             p[p_index] = 0.02
#         elif p[p_index] < 0.02:
#             p[p_index] = 0.0
            
                


    cv_loss.append(balanced_log_loss(num_targets[test_index].astype(np.int64), p))
    
    p_test = classifier.predict_proba(cleaned_test_array)
    p_test = np.sum(p_test[:,1:],1)
    test_preds.append(p_test)

print(cv_loss)
print("CI: %0.2f (+/- %0.2f)" % (np.mean(cv_loss), np.std(cv_loss) * 2)) 

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
[0.20012646925828848, 0.2755579831710057, 0.1567009218868457, 0.1815986285754826, 0.1394537584864489]
CI: 0.19 (+/- 0.09)


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(np.array(cleaned_array), np.array(greeksdf['Alpha']), test_size=0.2, random_state=42)
classifier.fit(X_train, y_train)
p = classifier.predict_proba(X_test)
p = np.sum(p[:,1:],1)
num_targets = np.array([0 if x == 'A' else 1 for x in y_test])
balanced_log_loss(num_targets.astype(np.int64), p)

0.156000810332322

In [18]:
# from sklearn.metrics import make_scorer

# def balanced_log_loss_inverse(y_true, y_pred):
#     y_true = np.array([0 if x == 'A' else 1 for x in y_true])
#     y_true = y_true.astype(np.int64)
#     y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
#     y_pred = np.sum(y_pred[:,1:],1)
#     nc = np.bincount(y_true)
#     w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
#     balanced_log_loss_score = (-w0/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - w1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / (w0+w1)
#     #balanced_log_loss_score = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / 2
    
    
#     return balanced_log_loss_score


# balanced_log_loss_scorer = make_scorer(balanced_log_loss_inverse, greater_is_better=False, needs_proba=True)

In [19]:
# from sklearn.model_selection import cross_val_score



# cv_scores = - cross_val_score(classifier, cleaned_array, np.array(greeksdf['Alpha']), cv = 5, scoring = balanced_log_loss_scorer)

# print(cv_scores)
# print(np.mean(cv_scores))

In [20]:
predict = np.mean(test_preds, axis=0)

predict[predict < 0.02] = 0.0
predict[predict > 0.98] = 1.0

# for p_index in range(0, len(predict)):
#     if predict[p_index] > 0.7 and predict[p_index] <= 0.9:
#             predict[p_index] = 0.9
#     elif predict[p_index] > 0.9 and predict[p_index] <= 0.98:
#             predict[p_index] = 0.98
#     elif predict[p_index] > 0.98:
#             predict[p_index] = 1.0
#     elif predict[p_index] < 0.3 and predict[p_index] >= 0.1:
#             predict[p_index] = 0.1
#     elif predict[p_index] < 0.1 and predict[p_index] >= 0.02:
#             predict[p_index] = 0.02
#     elif predict[p_index] < 0.02:
#             predict[p_index] = 0.0

submission = pd.DataFrame({"Id":test_id_array,
                          "class_0":1-predict,
                          "class_1": predict})
submission.to_csv("submission.csv", index=None)

In [21]:
pd.read_csv('submission.csv')

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5
