# Mechanisms-of-Action-Prediction-targetencoding-lightGBM

## debug

In [5]:
debug = True

## import libraries

In [6]:
import pandas as pd
import numpy as np
import copy
from sklearn.metrics import log_loss

import lightgbm as lgb

## load data

In [7]:
train_features = pd.read_csv("../input/lish-moa/train_features.csv")
train_targets_scored = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
train_targets_nonscored = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
test_features = pd.read_csv("../input/lish-moa/test_features.csv")
submission = pd.read_csv("../input/lish-moa/sample_submission.csv")

## preprocess

In [8]:
prep_df = pd.concat([train_features, test_features])

In [9]:
import category_encoders as ce

object_cols = ["cp_type", "cp_dose"]
ce_oe = ce.OrdinalEncoder(cols=object_cols, handle_unknown='impute')
prep_df = ce_oe.fit_transform(prep_df)
train_features = prep_df.iloc[:len(train_features),:]
test_features = prep_df.iloc[len(train_features):,:]

In [10]:
def target_encoder(
        train_X: pd.DataFrame,
        train_y: pd.DataFrame,
        test: pd.DataFrame,
        column_list: list
) -> pd.DataFrame:
        train_X_te = copy.deepcopy(train_X)
        test_X_te = copy.deepcopy(test)
        if len(column_list) > 0:
            print("categorical features:"+ str(column_list))
        else:
            print("No categorical features")
        for c in column_list:
            data_tmp = pd.DataFrame({c: train_X_te[c], "target": train_y})
            target_mean = data_tmp.groupby(c)["target"].mean()
            """ print(target_mean)"""
            test_X_te[c] = test_X_te[c].map(target_mean).astype(float)

            tmp = np.repeat(np.nan, train_X.shape[0])
            kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72)
            for idx_1, idx_2 in kf_encoding.split(train_X_te):
                target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()

                tmp[idx_2] = train_X_te[c].iloc[idx_2].map(target_mean)

            train_X_te[c] = tmp

        return train_X_te, test_X_te

## train models and prediction

In [11]:
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import KFold
FOLD_NUM = 5
kf = KFold(n_splits=5, shuffle=True, random_state=0)
target_cols = [c for c in train_targets_scored.columns if c != "sig_id"]
oof = train_targets_scored.copy()
num_round = 10000

In [12]:
params = {
#'boosting_type': 'gbdt',
'objective': 'binary',
#'lambda_l1': 0.001, 
#'lambda_l2': 0.001,
'num_leaves': 50, 
'max_depth': 6,
#'feature_fraction': 0.4,
#'subsample': 0.4, 
#'min_child_samples': 10,
'learning_rate': 0.01,
#'num_iterations': 100, #700
'early_stopping_rounds': 100,
'random_state': 42}

In [13]:
def run_lgbm(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    target_col: str):
    
    X_train = X_train.drop(["sig_id"], axis=1)
    y_train = train_targets_scored[target_col]
    X_test = X_test.drop(["sig_id"], axis=1)

    y_preds = []
    models = []
    oof_train = np.zeros((len(X_train),))

    for fold_id, (train_index, valid_index) in enumerate(kf.split(X_train)):
        X_tr = X_train.loc[train_index, :]
        X_val = X_train.loc[valid_index, :]
        y_tr = y_train[train_index]
        y_val = y_train[valid_index]

        lgb_train = lgb.Dataset(X_tr,
                                y_tr,
                                categorical_feature=object_cols)

        lgb_eval = lgb.Dataset(X_val,
                               y_val,
                               reference=lgb_train,
                               categorical_feature=object_cols)

        model = lgb.train(params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_eval],
                          verbose_eval=10,
                          num_boost_round=1000,
                          early_stopping_rounds=10)


        oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
        
        y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        y_preds.append(y_pred)
        models.append(model)

    return oof_train, sum(y_preds) / len(y_preds)

In [15]:
for target_col in target_cols:
    train_X = copy.deepcopy(train_features)
    test_X = copy.deepcopy(test_features)
    train_X, test_X = target_encoder(train_X, train_targets_scored[target_col], test_X, object_cols)
    _oof, _preds = run_lgbm(train_X, test_X, target_col)
    oof[target_col] = _oof
    submission[target_col] = _preds

categorical features:['cp_type', 'cp_dose']




Training until validation scores don't improve for 100 rounds
[10]	training's binary_logloss: 0.00236783	valid_1's binary_logloss: 0.0068585
[20]	training's binary_logloss: 0.00181572	valid_1's binary_logloss: 0.00688485
[30]	training's binary_logloss: 0.00150247	valid_1's binary_logloss: 0.00691576
[40]	training's binary_logloss: 0.00128277	valid_1's binary_logloss: 0.00695037
[50]	training's binary_logloss: 0.00111277	valid_1's binary_logloss: 0.00698997
[60]	training's binary_logloss: 0.000976128	valid_1's binary_logloss: 0.00702704
[70]	training's binary_logloss: 0.000863108	valid_1's binary_logloss: 0.00702334
[80]	training's binary_logloss: 0.00076729	valid_1's binary_logloss: 0.00703847
[90]	training's binary_logloss: 0.000684898	valid_1's binary_logloss: 0.00706433
[100]	training's binary_logloss: 0.000613379	valid_1's binary_logloss: 0.0070939
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.00479453	valid_1's binary_logloss: 0.00681599
Training until valida

[20]	training's binary_logloss: 0.00179818	valid_1's binary_logloss: 0.0100447
[30]	training's binary_logloss: 0.00148971	valid_1's binary_logloss: 0.0100949
[40]	training's binary_logloss: 0.0012776	valid_1's binary_logloss: 0.0101784
[50]	training's binary_logloss: 0.00111616	valid_1's binary_logloss: 0.0102491
[60]	training's binary_logloss: 0.000984331	valid_1's binary_logloss: 0.010325
[70]	training's binary_logloss: 0.00087552	valid_1's binary_logloss: 0.0103921
[80]	training's binary_logloss: 0.000782081	valid_1's binary_logloss: 0.0104816
[90]	training's binary_logloss: 0.000700959	valid_1's binary_logloss: 0.0105739
[100]	training's binary_logloss: 0.000629586	valid_1's binary_logloss: 0.0106743
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.0049334	valid_1's binary_logloss: 0.00992439
Training until validation scores don't improve for 100 rounds
[10]	training's binary_logloss: 0.0030805	valid_1's binary_logloss: 0.00686536
[20]	training's binary_logloss: 

Training until validation scores don't improve for 100 rounds
[10]	training's binary_logloss: 0.0029529	valid_1's binary_logloss: 0.0144428
[20]	training's binary_logloss: 0.00221003	valid_1's binary_logloss: 0.0145691
[30]	training's binary_logloss: 0.00181431	valid_1's binary_logloss: 0.0146968
[40]	training's binary_logloss: 0.00154587	valid_1's binary_logloss: 0.0148294
[50]	training's binary_logloss: 0.0013411	valid_1's binary_logloss: 0.0149672
[60]	training's binary_logloss: 0.0011779	valid_1's binary_logloss: 0.0151094
[70]	training's binary_logloss: 0.00104274	valid_1's binary_logloss: 0.0152559
[80]	training's binary_logloss: 0.000928288	valid_1's binary_logloss: 0.0154063
[90]	training's binary_logloss: 0.000827992	valid_1's binary_logloss: 0.0155596
[100]	training's binary_logloss: 0.000742805	valid_1's binary_logloss: 0.0157166
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.00536892	valid_1's binary_logloss: 0.0143093
Training until validation scores d

[50]	training's binary_logloss: 0.0305286	valid_1's binary_logloss: 0.0464546
[60]	training's binary_logloss: 0.0283965	valid_1's binary_logloss: 0.0465035
[70]	training's binary_logloss: 0.0265709	valid_1's binary_logloss: 0.0464856
[80]	training's binary_logloss: 0.0248943	valid_1's binary_logloss: 0.0464635
[90]	training's binary_logloss: 0.0232547	valid_1's binary_logloss: 0.0465283
[100]	training's binary_logloss: 0.0218308	valid_1's binary_logloss: 0.0466323
[110]	training's binary_logloss: 0.0206083	valid_1's binary_logloss: 0.0467206
[120]	training's binary_logloss: 0.0194806	valid_1's binary_logloss: 0.0467379
[130]	training's binary_logloss: 0.0185161	valid_1's binary_logloss: 0.0468061
Early stopping, best iteration is:
[38]	training's binary_logloss: 0.03359	valid_1's binary_logloss: 0.0463203
Training until validation scores don't improve for 100 rounds
[10]	training's binary_logloss: 0.0446387	valid_1's binary_logloss: 0.0434546
[20]	training's binary_logloss: 0.0408885	v

KeyboardInterrupt: 

In [None]:
scores = []
for target_col in target_cols:
    scores.append(log_loss(train_targets_scored[target_col], oof[target_col]))
print(np.mean(scores))

In [None]:
if debug == False:
    submission.to_csv("submission.csv", index=False)
    submission.head()

## optuna

In [None]:
def objective(trial):
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        #'metric': 'rmse',
        'verbosity': -1,
        "seed":42,
        "learning_rate":trial.suggest_loguniform('learning_rate', 0.005, 0.03),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }
    FOLD_NUM = 5
    mskf = MultilabelStratifiedKFold(n_splits=FOLD_NUM, shuffle=True, random_state=42)
    scores = []
    feature_importance_df = pd.DataFrame()

    pred_cv = np.zeros(len(test.index))
    num_round = 10000

    


    for i, (tdx, vdx) in enumerate(kf.split(train_X[selected], train_y)):
        print(f'Fold : {i}')
        X_train, X_valid, y_train, y_valid = train_X[selected].iloc[tdx], train_X[selected].iloc[vdx], train_y.values[tdx], train_y.values[vdx]
        lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=object_cols)
        lgb_valid = lgb.Dataset(X_valid, y_valid, categorical_feature=object_cols)
        model = lgb.train(params, lgb_train, num_boost_round=num_round,
                      valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                      early_stopping_rounds=100, verbose_eval=10000)

        va_pred = model.predict(X_valid)
        va_pred[va_pred<0] = 0
        score_ = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(va_pred)))
        scores.append(score_)

    return np.mean(scores)

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

In [None]:
# 結果の確認
print('Best trial:')
light_trial = study.best_trial

print('  Value: {}'.format(light_trial.value))

print('  Params: ')

In [None]:
with open("lightgbmparams.txt", "w") as file:
    for key, value in light_trial.params.items():
       print('    "{}": {},'.format(key, value))
       file.write('"{}": {},'.format(key, value))