In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputClassifier

import os
import warnings
warnings.filterwarnings('ignore')

# Framing as a binary classification problem

In this notebook I create a baseline model using XGBoost, framing the problem as *n* independent binary classification problems (where *n* = 206 is the total number of classes). I make use of the [MultiOutputClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html#sklearn.multioutput.MultiOutputClassifier) wrapper in sklearn.

This has the advantages that:
- You can use models capable only of binary classification
- It is easy to implement

But has the disadvantages that:
- You lose any correlation between labels, which could be useful to the model
- You need to train *n* models, so training is slow




In [None]:
# Notebook-wide knobs: the RNG seed and the number of cross-validation folds.
SEED, NFOLDS = 42, 5

# Seed NumPy's global random generator for this session.
np.random.seed(SEED)

In [None]:
# Competition inputs: training features and scored targets, the test
# features, and the sample submission used later as the output template.
train = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
test = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
targets = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
sub = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

# Strip the leading id column from each frame and keep plain NumPy arrays.
X, X_test, y = (
    frame.iloc[:, 1:].to_numpy() for frame in (train, test, targets)
)

In [None]:
# Two-step pipeline: a CountEncoder applied to columns 0 and 2, followed by
# a MultiOutputClassifier wrapping an XGBoost classifier ('gpu_hist' trees).
encode_step = ('encode', CountEncoder(cols=[0, 2]))
classify_step = (
    'classify',
    MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist')),
)
clf = Pipeline(steps=[encode_step, classify_step])

In [None]:
# Settings for the XGBoost estimator inside the multi-output wrapper.
# The 'classify__estimator__' prefix routes each value through the pipeline
# step and the wrapper down to the underlying estimator.
xgb_params = {
    'classify__estimator__colsample_bytree': 0.652231655518253,
    'classify__estimator__gamma': 3.6975211709521023,
    'classify__estimator__learning_rate': 0.05033414197773552,
    'classify__estimator__max_delta_step': 2.070593162427692,
    'classify__estimator__max_depth': 10,
    'classify__estimator__min_child_weight': 31.579959348704868,
    'classify__estimator__n_estimators': 166,
    'classify__estimator__subsample': 0.8638628715886625,
}

# Setting for the 'encode' (CountEncoder) step.
encoder_params = {'encode__min_group_size': 0.4160029192647806}

# Same keys and order as a single flat dict: classifier first, encoder last.
params = {**xgb_params, **encoder_params}

clf.set_params(**params)


## Train the model

Framing this problem as a set of binary classification problems has the disadvantage that you need to train as many models as you have classes. Here that means training 206 models per fold, which may take a long time given the large number of features in this dataset.

In [None]:
def _positive_class_proba(model, features):
    """Return positive-class probabilities as an (n_samples, n_labels) array.

    ``model.predict_proba`` on the multi-output pipeline returns a list with
    one (n_samples, n_classes) array per label. Column 1 of each array is
    kept and the result is transposed so that rows are samples.
    """
    per_label = model.predict_proba(features)  # list of preds per class
    return np.array(per_label)[:, :, 1].T  # take the positive class


# Out-of-fold predictions for every training row, and the fold-averaged
# predictions for the test rows (sized from the array actually predicted on).
oof_preds = np.zeros(y.shape)
test_preds = np.zeros((X_test.shape[0], y.shape[1]))

# Shuffle before splitting so the folds do not depend on the row order of the
# input file; SEED keeps the fold assignment reproducible between runs.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold: ', fn)
    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]
    clf.fit(X_train, y_train)

    # Each training row is predicted exactly once, by the fold that held it out.
    val_preds = _positive_class_proba(clf, X_val)
    oof_preds[val_idx] = val_preds

    # Every fold's model contributes an equal share to the test prediction.
    preds = _positive_class_proba(clf, X_test)
    test_preds += preds / NFOLDS

In [None]:
# Score the out-of-fold predictions: flatten targets and predictions so the
# log loss is taken over every (sample, label) cell at once.
flat_truth, flat_oof = y.ravel(), oof_preds.ravel()
print('OOF log loss: ', log_loss(flat_truth, flat_oof))

## Analysis of OOF preds


In [None]:
# Build the submission: overwrite every column after the first in the
# sample-submission frame with the averaged test predictions, then save it
# without the DataFrame index.
prediction_slots = (slice(None), slice(1, None))  # same selection as [:, 1:]
sub.iloc[prediction_slots] = test_preds
sub.to_csv('submission.csv', index=False)