## Importing libraries

In [8]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages
[0m

In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer
import imblearn
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler,ADASYN,SMOTE
from imblearn.under_sampling import RandomUnderSampler,EditedNearestNeighbours,RepeatedEditedNearestNeighbours,AllKNN
import xgboost
from tabpfn import TabPFNClassifier
from tqdm.notebook import tqdm

In [10]:
import warnings
warnings.filterwarnings("ignore")

## Data Reading and pre-processing

In [30]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

In [31]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      617 non-null    object 
 1   AB      617 non-null    float64
 2   AF      617 non-null    float64
 3   AH      617 non-null    float64
 4   AM      617 non-null    float64
 5   AR      617 non-null    float64
 6   AX      617 non-null    float64
 7   AY      617 non-null    float64
 8   AZ      617 non-null    float64
 9   BC      617 non-null    float64
 10  BD      617 non-null    float64
 11  BN      617 non-null    float64
 12  BP      617 non-null    float64
 13  BQ      557 non-null    float64
 14  BR      617 non-null    float64
 15  BZ      617 non-null    float64
 16  CB      615 non-null    float64
 17  CC      614 non-null    float64
 18  CD      617 non-null    float64
 19  CF      617 non-null    float64
 20  CH      617 non-null    float64
 21  CL      617 non-null    float64
 22  CR

In [12]:
first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')
test.EJ = test.EJ.eq(first_category).astype('int')

In [13]:
predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']
x= train[predictor_columns]
y = train['Class']

In [14]:
from sklearn.model_selection import StratifiedKFold as sKF, KFold as KF, GridSearchCV
cv_outer = sKF(n_splits = 10, shuffle=True, random_state=42)
cv_med = sKF(n_splits = 7, shuffle=True, random_state=42)
cv_inner = KF(n_splits = 5, shuffle=True, random_state=42)

## metric for the task

In [15]:
def balanced_log_loss(y_true, y_pred):
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # Implements the Evaluation equation with w_0 = w_1 = 1.
    # Calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # Calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    # Calculate the average log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1
    # return the (not further weighted) average of the averages
    return (log_loss_0 + log_loss_1)/2

## model

In [16]:
class Ensemble():
    def __init__(self):
        self.classifiers =[xgboost.XGBClassifier(),TabPFNClassifier(N_ensemble_configurations=64, device = 'cuda:0')]
    
    def fit(self,X,y):
        y = y.values
        unique_classes, y = np.unique(y, return_inverse=True)
        self.classes_ = unique_classes
        for classifier in self.classifiers:
            if classifier==self.classifiers[1]:
                classifier.fit(X,y,overwrite_warning =True)
            else :
                classifier.fit(X, y)
     
    def predict_proba(self, x):
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()
        # Weighted probabilities based on class imbalance
        new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(averaged_probabilities.shape[1])]])
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) 

In [17]:
def training(x,y,y_ros):
    outer_results = list()
    best_loss = np.inf
    fold = 0
    folds = 10
    for train_idx,val_idx in tqdm(cv_outer.split(x,y_ros), total = folds):
        fold+=1
        x_train, x_val = x.iloc[train_idx],x.iloc[val_idx]
        y_train, y_val = y_ros.iloc[train_idx], y.iloc[val_idx]
        
        model = Ensemble()
        model.fit(x_train, y_train)
        y_pred = model.predict_proba(x_val)
        probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
        p0 = probabilities[:,:1]
        y_p = np.empty((y_pred.shape[0],))
        for i in range(y_pred.shape[0]):
            if p0[i]>=0.5:
                y_p[i]= False
            else :
                y_p[i]=True
        y_p = y_p.astype(int)
        loss = balanced_log_loss(y_val,y_p)

        if loss<best_loss:
            best_model = model
            best_loss = loss
            print('best_model_saved')
        outer_results.append(loss)
        print('>val_loss=%.5f, fold = %.1f' % (loss,fold))
    print('LOSS: %.5f' % (np.mean(outer_results)))
    return np.mean(outer_results), best_model
    

In [18]:
from datetime import datetime
times = greeks.Epsilon.copy()
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks.Epsilon == 'Unknown'] = np.nan

In [19]:
# greeks.Alpha = greeks.Alpha.map({'A' :0, 'B':1,'G' :2, 'D' :3})

In [20]:
train_pred_and_time = pd.concat((train[predictor_columns], times), axis=1)
train_pred_and_time['Class'] = train.Class
test_predictors = test[predictor_columns]
first_category = test_predictors.EJ.unique()[0]
test_predictors.EJ = test_predictors.EJ.eq(first_category).astype('int')
test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1), axis=1)

## sampling to balance the skewed dataset

In [21]:
ros = ADASYN(random_state=42)
smot = SMOTE(random_state=42)
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()
allknn = AllKNN()


si = SimpleImputer(missing_values=np.nan, strategy='mean')
train_pred_and_time11 = si.fit_transform(train_pred_and_time)
train_ros, y_ros = ros.fit_resample(train_pred_and_time11, greeks.Alpha)
print('Original dataset shape')
print( greeks.Alpha.value_counts(normalize=True))
print("")
print('Resample dataset shape')
print( y_ros.value_counts(normalize=True))

Original dataset shape
A    0.824959
B    0.098865
G    0.047002
D    0.029173
Name: Alpha, dtype: float64

Resample dataset shape
G    0.253077
A    0.250615
D    0.249631
B    0.246677
Name: Alpha, dtype: float64


In [22]:
df = pd.DataFrame(train_ros,columns =train_pred_and_time.columns)

In [23]:
x_ros = df.drop(['Class'],axis=1)
y_ = df.Class

## training

In [24]:
loss,m = training(x_ros,y_,y_ros)

  0%|          | 0/10 [00:00<?, ?it/s]

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
best_model_saved
>val_loss=0.00000, fold = 1.0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
>val_loss=0.11361, fold = 2.0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
>val_loss=0.00000, fold = 3.0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
>val_loss=0.00000, fold = 4.0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
>val_loss=0.00000, fold = 5.0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
>val_loss=0.00000, fold = 6.0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
>val_loss=0.11361, fold = 7.0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
>val_loss=0.00000, fo

In [25]:
m.fit(df.drop(['Class'],axis=1),y_ros)

## predictions

In [26]:
y_pred = m.predict_proba(test_pred_and_time)
probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
p0 = probabilities[:,:1]
p0[p0 > 0.86] = 1
p0[p0 < 0.14] = 0

In [27]:
submission = pd.DataFrame(test["Id"], columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1 - p0
submission.to_csv('submission.csv', index=False)

In [28]:
submission_df = pd.read_csv('submission.csv')
submission_df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5
