In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier 
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
import logging
import time

In [19]:
## configuration of logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')

file_handler = logging.FileHandler('submission_1.log')
file_handler.setFormatter(formatter)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

logger.info(f'start @ {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
start = time.perf_counter()

2021-12-03 10:45:17,817:INFO:start @ 2021-12-03 10:45:17
2021-12-03 10:45:17,817:INFO:start @ 2021-12-03 10:45:17
2021-12-03 10:45:17,817:INFO:start @ 2021-12-03 10:45:17
2021-12-03 10:45:17,817:INFO:start @ 2021-12-03 10:45:17


In [20]:
## PREPROCESSING
X_data = pd.read_csv('../input/lish-moa/train_features.csv')
y_data = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
X_test = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

# delete all unneeded columns
X_data.drop(columns=['sig_id', 'cp_type'], inplace=True)
y_data.drop(columns=['sig_id'], inplace=True)
X_test.drop(columns=['sig_id', 'cp_type'], inplace=True)

X_train = X_data.copy()
y_train = y_data.copy()

In [21]:
# Onehot encoding cp_type
one_hot = OneHotEncoder(drop='if_binary')

preproc_transformer = ColumnTransformer([
    ('onehot', one_hot, ['cp_dose'])], 
    remainder='passthrough')

In [22]:
## XGBoost MODEL
multioutputclassifier = OneVsRestClassifier(xgb.XGBClassifier(max_depth=6, 
                                                              tree_method='hist', 
                                                              scale_pos_weight=2,
                                                              min_child_weight=1,
                                                              max_delta_step=8,
                                                              eta=0.1,
                                                              objective='binary:logistic',
                                                              eval_metric='logloss', 
                                                              subsample = 0.75,
                                                              use_label_encoder=False))


In [23]:
pipe = Pipeline([
    ('preproc', preproc_transformer),
    ('xgb_model', multioutputclassifier)
])


In [24]:
logger.info('Training model')
pipe.fit(X_train, y_train)
logging.info('Training model finished')

2021-12-03 10:45:23,141:INFO:Training model
2021-12-03 10:45:23,141:INFO:Training model
2021-12-03 10:45:23,141:INFO:Training model
2021-12-03 10:45:23,141:INFO:Training model


In [30]:
y_train_pred = pipe.predict_proba(X_train)
y_test_pred = pipe.predict_proba(X_test)

In [37]:
logger.info(f'training log loss: {log_loss(y_train, y_train_pred)}')
end = time.perf_counter()
logger.info(f'runtime: {round((end-start)/60,1)} m')

y_test_pred = pd.DataFrame(y_test_pred, columns=sample_submission.columns[1:])
y_test_pred = pd.concat([sample_submission['sig_id'], y_test_pred], axis=1)
y_test_pred.to_csv('submission.csv', index=False)

2021-12-03 11:56:26,891:INFO:training log loss: 0.2532252875661431
2021-12-03 11:56:26,891:INFO:training log loss: 0.2532252875661431
2021-12-03 11:56:26,891:INFO:training log loss: 0.2532252875661431
2021-12-03 11:56:26,891:INFO:training log loss: 0.2532252875661431
2021-12-03 11:56:26,897:INFO:runtime: 71.2 m
2021-12-03 11:56:26,897:INFO:runtime: 71.2 m
2021-12-03 11:56:26,897:INFO:runtime: 71.2 m
2021-12-03 11:56:26,897:INFO:runtime: 71.2 m
