In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputClassifier

import os
import warnings
warnings.filterwarnings('ignore')

# Framing as a binary classification problem

In this notebook I create a baseline model using XGBoost and framing the problem as a n-binary classification problems (where n=206 and is the total number of classes). I make use of the [MultiOutputClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html#sklearn.multioutput.MultiOutputClassifier) wrapper in sklearn.

This has the advantages that :
- You can use models capable only of binary classification
- It is easy to implement

But has the disadvantages that:
- You lose any correlation between labels which could be useful to the model
- You need to train *n* models and is therefore slow



Updates (started version 9)
- v9: 
    - dropped ctl_vehicle instances in-fold, kept in validation 


In [2]:
SEED = 42
NFOLDS = 5
DATA_DIR = '/kaggle/input/lish-moa/'
np.random.seed(SEED)

In [None]:
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')

test = pd.read_csv(DATA_DIR + 'test_features.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

# drop id col
X = train.iloc[:,1:].to_numpy()
X_test = test.iloc[:,1:].to_numpy()
y = targets.iloc[:,1:].to_numpy() 

In [54]:
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist', colsample_bytree = 0.6522,
          gamma = 3.6975,
          learning_rate = 0.0503,
          max_delta_step = 2.0706,
          max_depth = 10,
          min_child_weight = 31.5800,
          n_estimators = 166,
          subsample = 0.8639))

clf = Pipeline([('encode', CountEncoder(cols=[0, 2]))
               ])

In [55]:
classifier

MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.6522,
                                              gamma=3.6975, gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=None,
                                              learning_rate=0.0503,
                                              max_delta_step=2.0706,
                                              max_depth=10,
                                              min_child_weight=31.58,
                                              missing=nan,
                                              monotone_constraints=None,
                                              n_estimators=166, n_jobs=None,
                  


## Train the model

Framing this problem as a binary classification problem has the disadvantage that you need to train as many models as you have classes. For this problem this means training 206 models per fold, for the large number of features included in this dataset this may take a long time...

In [57]:
oof_preds = np.zeros(y.shape)
test_preds = np.zeros((test.shape[0], y.shape[1]))
oof_losses = []
kf = KFold(n_splits=NFOLDS)
for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print('Starting fold: ', fn)
    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]
    
    # drop where cp_type==ctl_vehicle (baseline)
#     ctl_mask = X_train[:,0]=='ctl_vehicle'
#     X_train = X_train[~ctl_mask,:]
#     y_train = y_train[~ctl_mask]
    X_train = clf.fit_transform(X_train)
    X_val = clf.fit_transform(X_val)
    classifier.fit(X_train, y_train)
    val_preds = classifier.predict_proba(X_val) # list of preds per class
    val_preds = np.array(val_preds)[:,:,1].T # take the positive class
    oof_preds[val_idx] = val_preds
    
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_losses.append(loss)
    preds = classifier.predict_proba(clf.fit_transform(X_test))
    preds = np.array(preds)[:,:,1].T # take the positive class
    test_preds += preds / NFOLDS
    
print(oof_losses)
print('Mean OOF loss across folds', np.mean(oof_losses))
print('STD OOF loss across folds', np.std(oof_losses))

Starting fold:  0
Starting fold:  1
Starting fold:  2
Starting fold:  3
Starting fold:  4
[0.016903024263565266, 0.01696713847699173, 0.016811488247029233, 0.016938618775522805, 0.017121382490998215]
Mean OOF loss across folds 0.01694833045082145
STD OOF loss across folds 0.00010116670048713858


In [58]:
# set control train preds to 0
# control_mask = train['cp_type']=='ctl_vehicle'
# oof_preds[control_mask] = 0

print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(oof_preds)))

OOF log loss:  0.016948323017206987


## Analysis of OOF preds


In [59]:
# create the submission file
sub.iloc[:,1:] = test_preds
# sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001889,0.001874,0.001886,0.011166,0.017545,0.003605,0.002132,0.006043,0.00188,...,0.001866,0.001789,0.002069,0.002376,0.002842,0.001869,0.002267,0.001894,0.001643,0.00191
1,id_001897cda,0.001889,0.001874,0.001886,0.004713,0.004849,0.003193,0.001888,0.005145,0.00188,...,0.001866,0.001917,0.001921,0.001568,0.004852,0.001869,0.006512,0.001887,0.002217,0.001927
2,id_002429b5b,0.001889,0.001874,0.001886,0.010373,0.015547,0.002896,0.002872,0.005845,0.00188,...,0.001866,0.00178,0.002111,0.00432,0.002789,0.001869,0.001654,0.001913,0.00215,0.001901
3,id_00276f245,0.001889,0.001874,0.001886,0.008343,0.008715,0.002856,0.001989,0.00379,0.00188,...,0.001866,0.001943,0.002016,0.004372,0.002627,0.001869,0.002118,0.001896,0.002058,0.001928
4,id_0027f1083,0.001889,0.001874,0.001886,0.011782,0.015249,0.002632,0.003039,0.004161,0.00188,...,0.001866,0.00178,0.002164,0.003184,0.002535,0.001869,0.00204,0.001903,0.00166,0.001871


## Save the model

In [60]:
import pickle
pickle.dump((clf), open('countEncoder.pkl', 'wb'))

In [62]:
import joblib
joblib.dump(classifier, 'xgboost_model.joblib')

['xgboost_model.joblib']

In [None]:
model = joblib.load('/kaggle/working/xgboost_model.joblib')
encode = pickle.load(open('/kaggle/working/countEncoder.pkl', 'rb'))

X_test = encode.fit_transform(X_test)
preds = model.predict_proba(X_test)
preds = np.array(preds)[:,:,1].T
preds

In [65]:
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
sub.iloc[:,1:] = preds
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001884,0.001847,0.001893,0.009748,0.015314,0.003216,0.002201,0.003452,0.001902,...,0.001888,0.001879,0.002035,0.002729,0.003264,0.001866,0.001942,0.001882,0.001688,0.001881
1,id_001897cda,0.001884,0.001847,0.001893,0.004502,0.004936,0.003001,0.002025,0.00386,0.001902,...,0.001888,0.001879,0.001999,0.001211,0.007435,0.001866,0.005304,0.001882,0.002681,0.001969
2,id_002429b5b,0.001884,0.001847,0.001893,0.005779,0.017868,0.002473,0.002575,0.004883,0.001902,...,0.001888,0.001879,0.001998,0.004085,0.002261,0.001866,0.001421,0.001882,0.001895,0.001833
3,id_00276f245,0.001884,0.001847,0.001893,0.008968,0.007837,0.002767,0.001745,0.002843,0.001902,...,0.001888,0.001879,0.002157,0.004555,0.002139,0.001866,0.002643,0.001882,0.001893,0.001969
4,id_0027f1083,0.001884,0.001847,0.001893,0.008037,0.017379,0.002476,0.002485,0.003389,0.001902,...,0.001888,0.001879,0.002097,0.00323,0.002419,0.001866,0.001799,0.001882,0.001844,0.001833
