In [1]:
!pip install xgboost sklearn 



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV #RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
#from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
# from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
import logging
import time
from tqdm import tqdm


In [3]:
## configuration of logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')

file_handler = logging.FileHandler('hp_search_cl_kbest_18.log')
file_handler.setFormatter(formatter)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

logger.info(f'start @ {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
start = time.perf_counter()

2021-12-08 11:24:16,864:INFO:start @ 2021-12-08 11:24:16


In [4]:
X_data = pd.read_csv('train_features.csv')
y_data = pd.read_csv('train_targets_scored.csv')

In [5]:
# some labels are extremly uncommon. as such the samples belonging to these features will be oversampled
def oversampling(X_data, y_data):
    # new_y = X_data.copy()
    # new_x = y_data.copy()
    oversampling = list()
    for label in tqdm(y_data.columns[1:]):
        if y_data[label].sum()<20:
            oversampling.extend(list(y_data.loc[y_data[label]==1, 'sig_id']))
    oversampling_x = X_data[X_data['sig_id'].isin(oversampling)]
    oversampling_y = y_data[y_data['sig_id'].isin(oversampling)]
    new_x = pd.concat([X_data, oversampling_x, oversampling_x, oversampling_x, oversampling_x]).reset_index(drop=True)
    new_y = pd.concat([y_data, oversampling_y, oversampling_y, oversampling_y, oversampling_y]).reset_index(drop=True)
    return new_x, new_y

# print(f'length data before oversampling: {len(X_data)}/{len(y_data)}')
# X_data, y_data = oversampling(X_data, y_data)
# print(f'length data after oversampling: {len(X_data)}/{len(y_data)}')

In [6]:
# delete all unneeded columns
X_data.drop(columns=['sig_id', 'cp_type'], inplace=True) 
y_data.drop(columns=['sig_id'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, 
                                                    test_size=0.2, 
                                                    random_state=174)

In [7]:
# Onehot encoding cp_type
one_hot = OneHotEncoder()

In [8]:
kBest = SelectKBest(k=400)

In [9]:
preproc_transformer = ColumnTransformer([
    ('onehot', one_hot, ['cp_dose']), 
    #('pca_genes', pca_genes, genes),
    #('pca_cells', pca_cells, cell_lines)],
], remainder='passthrough')


In [10]:
## XGBoost MODEL
model = xgb.XGBClassifier(tree_method='gpu_hist', 
                            objective='binary:logistic',
                            eval_metric='logloss',
                            eta=0.01,
                            colsample_bytree= 0.55, 
                            subsample= 0.95,
                            scale_pos_weight=1,
                            max_depth=8,
                            min_child_weight=1,
                            reg_alpha=1e-05,
                            reg_lambda=0.5,
                            gamma=0.25,
                            n_estimators=2000,
                            use_label_encoder=False)

logger.info("xgb.XGBClassifier(tree_method='gpu_hist', objective='binary:logistic', eval_metric='logloss', eta=0.1," \
            "colsample_bytree= 0.55, subsample= 0.95, scale_pos_weight=1, max_depth=7, min_child_weight=1, reg_alpha=1e-05," \
            "reg_lambda=0.5, gamma=0.25, n_estimators=2000, use_label_encoder=False)")

2021-12-08 11:24:20,403:INFO:xgb.XGBClassifier(tree_method='gpu_hist', objective='binary:logistic', eval_metric='logloss', eta=0.1,colsample_bytree= 0.55, subsample= 0.95, scale_pos_weight=1, max_depth=7, min_child_weight=1, reg_alpha=1e-05,reg_lambda=0.5, gamma=0.25, n_estimators=2000, use_label_encoder=False)


In [11]:
cache_dir = '.' # cache transformers in the current directory
preproc_pipe = Pipeline([
    ('preproc', preproc_transformer),
], memory=cache_dir)

kBest_pipe = Pipeline([
    ('kbest', kBest)
], memory=cache_dir)

In [12]:
multi_pipe = OneVsRestClassifier(Pipeline([
    ('kbest', kBest_pipe),
    ('model', model)
]))


final_pipe = Pipeline([
    ('preproc', preproc_pipe),
    ('multi', multi_pipe)
])


In [13]:

params = {#'multi__estimator__model__subsample' : [0.6, 0.75],
          # 'multi__estimator__model__max_depth' : [6,7,8], #range(3,10,2)
          # 'multi__estimator__model__min_child_weight' : [1,2,3], range(1,6,2)
          # 'multi__estimator__model__max_delta_step' : [1, 8, 10], #might help in logistic regression when class is extremely imbalanced.
          # 'multi__estimator__model__gamma' : [0.1, 0.15, 0.2, 0.25, 0.3], #[i/10.0 for i in range(0,5)]
         # 'multi__estimator__model__subsample' : [i/100.0 for i in range(85,100,5)],#[i/10.0 for i in range(6,10)],
         # 'multi__estimator__model__colsample_bytree' : [i/100.0 for i in range(55,80,5)], #[i/10.0 for i in range(6,10)],
         'multi__estimator__model__reg_alpha':[5e-6, 1e-5, 5e-5, 1e-4], #[1e-5, 1e-2, 0.1, 1, 100],
         'multi__estimator__model__reg_lambda':[0.01,0.05,0.1, 0.5],#[1e-5, 1e-2, 0.1, 1, 100],
}


In [14]:
# logger.info('Training model')
# search = GridSearchCV(final_pipe, 
#                             params, 
#                             cv=4, 
#                             #n_iter=30, 
#                             scoring='neg_log_loss', 
#                             verbose=4, 
#                             n_jobs=-1)
# search.fit(X_train, y_train)

In [15]:
logger.info('Training model')
# search = GridSearchCV(final_pipe, 
#                             params, 
#                             cv=4, 
#                             #n_iter=30, 
#                             scoring='neg_log_loss', 
#                             verbose=4, 
#                             n_jobs=-1)
final_pipe.fit(X_train, y_train)

2021-12-08 11:24:20,424:INFO:Training model


Pipeline(steps=[('preproc',
                 Pipeline(memory='.',
                          steps=[('preproc',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('onehot',
                                                                   OneHotEncoder(),
                                                                   ['cp_dose'])]))])),
                ('multi',
                 OneVsRestClassifier(estimator=Pipeline(steps=[('kbest',
                                                                Pipeline(memory='.',
                                                                         steps=[('kbest',
                                                                                 SelectKBest(k=400))])),
                                                               ('model',
                                                                XGBClassifier(base_score=None,
                 

In [16]:
# for k,i in params.items():
#     logger.info(f'tested hyperparameters: {k}: {i}')

# logger.info('Training model finished')

In [17]:
y_train_pred = final_pipe.predict_proba(X_train)
y_test_pred = final_pipe.predict_proba(X_test)

In [18]:
logger.info(f'training log loss: {log_loss(y_train, y_train_pred)}')
logger.info(f'test log loss : {log_loss(y_test, y_test_pred)}')
# logger.info(f'Best Params: {search.best_params_}')
# logger.info(f'CV results: {search.cv_results_}')
end = time.perf_counter()
logger.info(f'runtime: {round((end-start)/60,1)} m')

2021-12-08 11:49:14,088:INFO:training log loss: 0.17792004848370957
2021-12-08 11:49:14,191:INFO:test log loss : 2.4962081165381416
2021-12-08 11:49:14,192:INFO:runtime: 25.0 m
