# Mechanisms-of-Action-Prediction-targetencoding-featureselection-lightGBM

## debug

In [31]:
debug = True

## import libraries

In [32]:
import pandas as pd
import numpy as np
import copy
from sklearn.metrics import log_loss

import lightgbm as lgb
#import pickle

## load data

In [33]:
train_features = pd.read_csv("../input/lish-moa/train_features.csv")
train_targets_scored = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
train_targets_nonscored = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
test_features = pd.read_csv("../input/lish-moa/test_features.csv")
submission = pd.read_csv("../input/lish-moa/sample_submission.csv")

## preprocess

In [34]:
prep_df = pd.concat([train_features, test_features])

In [35]:
import category_encoders as ce

object_cols = ["cp_type", "cp_dose"]
ce_oe = ce.OrdinalEncoder(cols=object_cols, handle_unknown='impute')
prep_df = ce_oe.fit_transform(prep_df)
train_features = prep_df.iloc[:len(train_features),:]
test_features = prep_df.iloc[len(train_features):,:]

In [36]:
rename_dict = {
    'fungal_1,3-beta-d-glucan_synthase_inhibitor': 'fungal_1_3-beta_d_glucan_synthase_inhibitor',
    'glutathione_reductase_(nadph)_activators': 'glutathione_reductase_nadph_activators',
    'h+_k+-atpase_inhibitor': 'h_k_atpase_inhibitor',
    'indoleamine_2,3-dioxygenase_inhibitor': 'indoleamine_2_3_dioxygenase_inhibitor',
    'mitochondrial_na+_ca2+_exchanger_antagonist': 'mitochondrial_na_ca2_exchanger_antagonist',
    'nociceptin_orphanin_fq_(nop)_receptor_antagonist': 'nociceptin_orphanin_fq_nop_receptor_antagonist',
    'sars_coronavirus_3c-like_protease_inhibitor': 'sars_coronavirus_3c_like_protease_inhibitor',
    'selective_estrogen_receptor_modulator_(serm)': 'selective_estrogen_receptor_modulator_serm',
    'selective_serotonin_reuptake_inhibitor_(ssri)': 'selective_serotonin_reuptake_inhibitor_ssri',
    'sterol_regulatory_element_binding_protein_(srebp)_inhibitor': 'sterol_regulatory_element_binding_protein_srebp_inhibitor'
}

In [37]:
train_targets_nonscored.rename(columns=rename_dict, inplace=True)

In [38]:
def target_encoder(
        train_X: pd.DataFrame,
        train_y: pd.DataFrame,
        test: pd.DataFrame,
        column_list: list
) -> pd.DataFrame:
        train_X_te = copy.deepcopy(train_X)
        test_X_te = copy.deepcopy(test)
        if len(column_list) > 0:
            print("categorical features:"+ str(column_list))
        else:
            print("No categorical features")
        for c in column_list:
            data_tmp = pd.DataFrame({c: train_X_te[c], "target": train_y})
            target_mean = data_tmp.groupby(c)["target"].mean()
            """ print(target_mean)"""
            test_X_te[c] = test_X_te[c].map(target_mean).astype(float)

            tmp = np.repeat(np.nan, train_X.shape[0])
            kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72)
            for idx_1, idx_2 in kf_encoding.split(train_X_te):
                target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()

                tmp[idx_2] = train_X_te[c].iloc[idx_2].map(target_mean)

            train_X_te[c] = tmp

        return train_X_te, test_X_te

## train

In [39]:
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import KFold
FOLD_NUM = 5
kf = KFold(n_splits=5, shuffle=True, random_state=0)
target_cols_scored = [c for c in train_targets_scored.columns if c != "sig_id"]
target_cols_nonscored = [c for c in train_targets_nonscored.columns if c != "sig_id"]
oof_scored = train_targets_scored.copy()
test_targets_nonscored = pd.DataFrame()
test_targets_nonscored["sig_id"] = test_features["sig_id"]
num_round = 10000

In [40]:
params = {
#'boosting_type': 'gbdt',
'objective': 'binary',
#'lambda_l1': 0.001, 
#'lambda_l2': 0.001,
'num_leaves': 24, 
'max_depth': 5,
#'feature_fraction': 0.4,
#'subsample': 0.4, 
#'min_child_samples': 10,
'learning_rate': 0.01,
#'num_iterations': 100, #700
#'early_stopping_rounds': 100,
'random_state': 42}

In [41]:
def run_lgbm(
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    X_test: pd.DataFrame,
    target_col: str):
    
    X_train = X_train.drop(["sig_id"], axis=1)
    y_train = y_train[target_col]
    X_test = X_test.drop(["sig_id"], axis=1)

    y_preds = []
    models = []
    oof_train = np.zeros((len(X_train),))

    for fold_id, (train_index, valid_index) in enumerate(kf.split(X_train)):
        X_tr = X_train.loc[train_index, :]
        X_val = X_train.loc[valid_index, :]
        y_tr = y_train[train_index]
        y_val = y_train[valid_index]

        lgb_train = lgb.Dataset(X_tr,
                                y_tr,
                                categorical_feature=object_cols)

        lgb_eval = lgb.Dataset(X_val,
                               y_val,
                               reference=lgb_train,
                               categorical_feature=object_cols)

        model = lgb.train(params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_eval],
                          verbose_eval=10,
                          num_boost_round=1000,
                          early_stopping_rounds=10)


        oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
        
        #y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        #y_preds.append(y_pred)
        #models.append(model)
        
        if debug == False:
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)
            y_preds.append(y_pred)
            models.append(model)
            #file = "trained_lgbm" + str(fold_id) + ".pkl"
            #pickle.dump(model, open(file, "wb"))
            model.save_model("trained_lgbm" + str(fold_id) + ".tc")
            del model

            return oof_train, sum(y_preds) / len(y_preds), y_preds
        
        else:
            return oof_train

## train nonscored

In [42]:
if debug == False:
    for target_col in target_cols_nonscored:
        train_X = copy.deepcopy(train_features)
        test_X = copy.deepcopy(test_features)
        train_X, test_X = target_encoder(train_X, train_targets_nonscored[target_col], test_X, object_cols)
    
        _, _, _preds  = run_lgbm(train_X, test_X, train_targets_nonscored, target_col)
        test_targets_nonscored[target_col] = _preds

## merge df

In [43]:
train_features = pd.concat([train_features, train_targets_nonscored.drop(["sig_id"], axis=1)], axis=1)
test_features = pd.concat([train_features, test_targets_nonscored.drop(["sig_id"], axis=1)], axis=1)

In [44]:
train_features

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_calcium_channel_ligand,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,wnt_agonist,xanthine_oxidase_inhibitor,xiap_inhibitor
0,id_000644bb2,1,24,1,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,1,72,1,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,1,48,1,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,1,48,1,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,1,72,2,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,id_fffb1ceed,1,24,2,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,...,0,0,0,0,0,0,0,0,0,0
23810,id_fffb70c0c,1,24,2,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,...,0,0,0,0,0,0,0,0,0,0
23811,id_fffc1c3f4,2,48,2,0.3942,0.3756,0.3109,-0.7389,0.5505,-0.0159,...,0,0,0,0,0,0,0,0,0,0
23812,id_fffcb9e7c,1,24,1,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,...,0,0,0,0,0,0,0,0,0,0


## train scored

In [46]:
for target_col in target_cols_scored:
    train_X = copy.deepcopy(train_features)
    test_X = copy.deepcopy(test_features)
    train_X, test_X = target_encoder(train_X, train_targets_scored[target_col], test_X, object_cols)
    if debug == False:
        _oof, _preds, _ = run_lgbm(train_X, train_targets_scored, test_X, target_col)
        oof_scored[target_col] = _oof
        submission[target_col] = _preds
    else:
        _oof = run_lgbm(train_X, train_targets_scored, test_X, target_col)
        oof_scored[target_col] = _oof

categorical features:['cp_type', 'cp_dose']




[LightGBM] [Info] Number of positive: 13, number of negative: 19038
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 222460
[LightGBM] [Info] Number of data points in the train set: 19051, number of used features: 922




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000682 -> initscore=-7.289243
[LightGBM] [Info] Start training from score -7.289243
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.00236524	valid_1's binary_logloss: 0.00686287
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.00482727	valid_1's binary_logloss: 0.00681477
categorical features:['cp_type', 'cp_dose']
[LightGBM] [Info] Number of positive: 15, number of negative: 19036
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 222460
[LightGBM] [Info] Number of data points in the train set: 19051, number of used features: 922
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000787 -> initscore=-7.146037
[LightGBM] [Info] Start training from score -7.146037
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.00364074	valid_1's binary_logloss: 0.00394242
[20]	training's binary_logloss: 0.00

[10]	training's binary_logloss: 0.0447976	valid_1's binary_logloss: 0.0464188


KeyboardInterrupt: 

In [None]:
scores = []
for target_col in target_cols_scored:
    scores.append(log_loss(train_targets_scored[target_col], oof_scored[target_col]))
print(np.mean(scores))

## submit

In [None]:
if debug == False:
    submission.to_csv("submission.csv", index=False)
    submission.head()