<a href="https://colab.research.google.com/github/Tyanakai/medical_paper_classification/blob/main/medical_bert_tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## config etc

In [None]:
! pip install -q transformers
! pip install -q tensorflow-addons

import datetime
import json
import logging
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import minimize
from scipy.optimize import minimize_scalar
from sklearn.metrics import fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import class_weight
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
import transformers

[K     |████████████████████████████████| 2.9 MB 4.6 MB/s 
[K     |████████████████████████████████| 636 kB 43.8 MB/s 
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
[K     |████████████████████████████████| 895 kB 78.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 49.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 5.3 MB/s 
[?25h

In [None]:
class Config:
    # Central experiment settings, read as class attributes (Config.xxx) by the
    # functions in this notebook. The trailing "#@param" comments are Colab form
    # annotations and are left exactly as written.

    # Hugging Face model name, and whether to load it from PyTorch weights
    # (both are passed to from_pretrained in build_encoder).
    model = "dmis-lab/biobert-base-cased-v1.2" #@param
    from_pt = True #@param {"type":"boolean"}
    # Selects the encoder variant (build_encoder) and the head built on top of it
    # (neural_networks).
    # NOTE(review): this annotation uses {"type","string"} (comma) where the other
    # fields use {"type":"string"} (colon) -- confirm the form renders as intended.
    encode_type = "cls_cat" #@param {"type","string"} ["cls","cls_cat","pooler","logits", "last_hidden_state_cnn", "last_hidden_state_lstm"]

    # Tokenizer padding/truncation length and the model input length.
    max_length = 512 #@param {"type":"integer"}
    # AdamW learning rate and weight decay (build_model).
    lr = 0.00002
    weight_decay = 1e-5
    # Which scipy optimizer opt_fbeta_threshold uses to tune the decision threshold.
    opt = "minimize"  #@param {"type":"string"} ["minimize_scalar","minimize"]
    # Number of stratified folds and the maximum number of epochs per fold.
    n_fold = 5 #@param
    epochs = 15 #@param {"type":"slider"}
    # EarlyStopping patience (train_fn).
    patience =  4#@param
    # Metric and mode watched by ModelCheckpoint (train_fn).
    check_monitor = "val_fbeta_score" #@param {"type":"string"} ["val_loss","val_fbeta_score","val_auc"]
    check_mode = "max" #@param {"type":"string"} ["auto", "max"]
    
    # Batch sizes for the train / valid / test tf.data pipelines (get_dataset).
    train_batch_size = 64 #@param {"type":"raw"} [4,8,16,32,64]
    valid_batch_size = 64 #@param {"type":"raw"} [4,8,16,32,64]
    test_batch_size = 64 #@param {"type":"raw"} [4,8,16,32,64]
    # Passed to model.fit as steps_per_epoch; when not None the training dataset
    # is repeated (get_dataset).
    steps_per_epochs = None #(27145 * (n_fold - 1) / n_fold) // train_batch_size
    # CSV file names, resolved against INPUT by get_data.
    train_file = "ps_train.csv" #@param
    test_file = "ps_test.csv" #@param
    # Label column and text column names.
    target_col = "judgement"
    text_col = "text"
    # One full cross-validation run is executed per seed (main).
    seeds = [21]

    # Key into the loss dictionary in build_model.
    loss_fn = "bce" #@param {"type":"string"} ["bce", "weighted_bce", "focal"]
    # [weight for label 0, weight for label 1] used by weighted_binary_crossentropy.
    loss_weight = [1, 50] #@param
    # "balanced", None, or an explicit {label: weight} dict (get_class_weight).
    class_weight = "balanced" #@param {"type":"raw"} 
    # NOTE(review): not referenced by any code visible in this notebook.
    sample_weight = None #@param    
    # Label smoothing amount handed to weighted_binary_crossentropy.
    label_smoothing = 0 #@param
    
    # Whether main() writes submission files.
    submit = True #@param {"type":"boolean"}
    # Debug mode: fewer epochs/folds and a 256-row data sample.
    debug = False  #@param {"type":"boolean"}
    # Fixed threshold used for the training-time F-beta metric and for the
    # "temporary" score in get_result.
    temp_thre = 0.1 #@param

# In debug mode, shrink the run to 2 epochs and 2 folds so the whole pipeline
# can be exercised quickly.
if Config.debug:
    Config.epochs = 2
    Config.n_fold = 2

# Run timestamp in Japan Standard Time (UTC+9), used in log/model/output file
# names. A timezone-aware "now" is used so the value is correct regardless of
# the host's local timezone (adding 9 hours to a naive now() is only right when
# the host clock is UTC).
time_jp = datetime.datetime.now(
    datetime.timezone(datetime.timedelta(hours=9))
).strftime('%Y%m%d_%H%M')
# time_jp = '20210926_1749' #@param
time_jp

'20211004_2013'

In [None]:
def build_encoder():
    """
    Return the pretrained encoder that matches Config.encode_type.

    - "logits": a sequence-classification model with a single label.
    - "cls_cat": the base model, loaded with a config that enables
      output_hidden_states.
    - anything else: the base model with its default config.
    """
    load_kwargs = {"from_pt": Config.from_pt}

    if Config.encode_type == "logits":
        return transformers.TFAutoModelForSequenceClassification.from_pretrained(
            Config.model, num_labels=1, **load_kwargs)

    if Config.encode_type == "cls_cat":
        # The hidden states of every layer are needed to concatenate CLS vectors.
        hidden_state_config = transformers.AutoConfig.from_pretrained(
            Config.model, output_hidden_states=True)
        return transformers.TFAutoModel.from_pretrained(
            Config.model, config=hidden_state_config, **load_kwargs)

    return transformers.TFAutoModel.from_pretrained(Config.model, **load_kwargs)

def neural_networks(x):
    """
    Build the classification head on top of the encoder output.

    Args:
        x: the output object returned by calling the encoder built by
            build_encoder().

    Returns:
        A Keras tensor holding one sigmoid probability per example.

    Raises:
        ValueError: if Config.encode_type is not one of the supported values.
    """

    if Config.encode_type == "cls":
        x = x[0][:, 0, :]  # cls token
        output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    elif Config.encode_type == "cls_cat":
        # Concatenate the CLS-token vectors of the last four hidden states.
        x = tf.concat([x["hidden_states"][-i][:,0,:] for i in range(1,5)], axis=-1)
        x = tf.keras.layers.Dropout(0.2)(x)
        output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    elif Config.encode_type == "pooler":
        x = x["pooler_output"]
        output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    elif Config.encode_type == "logits":
        x = x.logits
        output = tf.keras.layers.Activation("sigmoid")(x)

    elif Config.encode_type == "last_hidden_state_cnn":
        # Run the final hidden state through 1-D convolutions.
        x = x.last_hidden_state
        x = tf.keras.layers.Conv1D(
            256, kernel_size=2, padding="same", activation="relu")(x)
        x = tf.keras.layers.Conv1D(
            1, kernel_size=2, padding="same")(x)
        # The layer is spelled GlobalMaxPooling1D; the previous
        # "GlobalMaxpooling1D" raised AttributeError.
        x = tf.keras.layers.GlobalMaxPooling1D()(x)
        output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    elif Config.encode_type == "last_hidden_state_lstm":
        # Run the final hidden state through a bidirectional LSTM.
        x = x.last_hidden_state
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(728))(x)
        output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `output`.
        raise ValueError(
            f"Unsupported Config.encode_type: {Config.encode_type!r}")

    return output


def build_model():
    """
    Build and compile the full Keras model (encoder + classification head).

    Returns:
        A compiled tf.keras Model taking [input_ids, attention_mask] and
        producing the output of neural_networks().

    Raises:
        KeyError: if Config.loss_fn is not a key of the loss dictionary below.
    """
    # encoder
    encoder = build_encoder()

    # Inputs: fixed-length token ids and attention mask.
    input_ids = tf.keras.layers.Input(shape=(Config.max_length, ),
                                      dtype=tf.int32,
                                      name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(Config.max_length, ),
                                           dtype=tf.int32,
                                           name='attention_mask')

    # Overall network: encoder followed by the head chosen by Config.encode_type.
    x = encoder(input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True)
    output = neural_networks(x)

    # Wrap as a Keras model.
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask],
                                  outputs=[output])

    # Optimizer and loss. `learning_rate` is the supported keyword; `lr` is
    # only a deprecated alias.
    optimizer = tfa.optimizers.AdamW(learning_rate=Config.lr,
                                     weight_decay=Config.weight_decay)
    loss = {"bce": tf.keras.losses.BinaryCrossentropy(),
            "weighted_bce": weighted_binary_crossentropy(Config.loss_weight, Config.label_smoothing),
            "focal": tfa.losses.SigmoidFocalCrossEntropy(alpha=0.98, gamma=2.0),
            "mse": tf.keras.losses.MeanSquaredError()}

    # Metrics monitored during training: F-beta (beta=7) at the temporary
    # threshold, and the area under the precision-recall curve.
    metrics = [tfa.metrics.FBetaScore(num_classes=1,
                                      beta=7.0,
                                      threshold=Config.temp_thre),
               tf.keras.metrics.AUC(num_thresholds=200, curve='PR',
                                    multi_label=False, label_weights=None)]

    model.compile(optimizer=optimizer,
                  loss=loss[Config.loss_fn],
                  metrics=metrics)
    # model.summary()
    return model

In [None]:
# Path configuration. Everything lives under one Google Drive project folder.
DRIVE = "/content/drive/MyDrive/signate/medical_paper"
INPUT = os.path.join(DRIVE, "input")
OUTPUT = os.path.join(DRIVE, "output")
LOG = os.path.join(OUTPUT, f"{Config.model.replace('/','-')}_tf")
MODEL = os.path.join(DRIVE, "model", f"{Config.model.replace('/','-')}_tf")
SUBMIT = os.path.join(DRIVE, "submit")
PROB = os.path.join(DRIVE, "prob")

# Create every directory this notebook writes into. PROB and SUBMIT are
# included so that saving predictions/submissions cannot fail on a missing
# directory after training has already finished. INPUT is only read from.
os.makedirs(MODEL, exist_ok=True)
os.makedirs(LOG, exist_ok=True)
os.makedirs(PROB, exist_ok=True)
os.makedirs(SUBMIT, exist_ok=True)

# warnings.filterwarnings("ignore")

In [None]:
# Logger setup
class Logger:
    """Thin wrapper around a module-level `logging` logger that writes to a file.

    Messages are written to ``Experiment{time_jp}.log`` inside ``path`` and are
    prefixed with a Japan Standard Time timestamp.
    """
    def __init__(self, path):
        """Attach a file handler under ``path`` unless the logger already has one.

        The underlying logger is shared per module name, so constructing this
        class again (e.g. re-running the cell) must not add a second handler.
        The FileHandler is created only when it is actually attached, so repeated
        construction no longer opens a log file that is then discarded.
        """
        self.general_logger = logging.getLogger(__name__)
        if not self.general_logger.handlers:
            file_general_handler = logging.FileHandler(
                os.path.join(path, f'Experiment{time_jp}.log'))
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current JST time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current time in UTC+9 as 'YYYY-MM-DD HH:MM:SS'."""
        # Timezone-aware so the result does not depend on the host's timezone.
        jst = datetime.timezone(datetime.timedelta(hours=9))
        return datetime.datetime.now(jst).strftime('%Y-%m-%d %H:%M:%S')

# Shared logger instance used throughout the notebook; its log file is created
# inside the LOG directory.
logger = Logger(LOG)

Running on TPU  ['10.49.54.90:8470']


In [None]:
# TPU setup
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', TPU.cluster_spec().as_dict()['worker'])
except ValueError:
    # A ValueError here is treated as "no TPU runtime". TPU is set to None so
    # that get_model_and_tokenizer() builds the model without a TPU strategy.
    TPU = None
    print('INFO: Not connected to a TPU runtime')

In [None]:
# Data preparation

def get_data(file_name):
    """
    Load a CSV from INPUT and add the text column fed to the tokenizer.

    Args:
        file_name: CSV file name relative to INPUT. The file must contain
            "title" and "abstract" columns.

    Returns:
        The DataFrame with an extra Config.text_col column holding
        "<title> <abstract>". In debug mode only a random sample of at most
        256 rows is returned, with a fresh 0..n-1 index.
    """
    df = pd.read_csv(os.path.join(INPUT, file_name))
    if Config.debug:
        # Cap the sample size so small files do not make sample() raise.
        n_rows = min(256, len(df))
        df = df.sample(n_rows, random_state=Config.seeds[0]).reset_index(drop=True)

    # Preprocess: missing titles/abstracts become empty strings so one NaN
    # cannot turn the whole text into NaN (which the tokenizer cannot handle).
    df[Config.text_col] = df["title"].fillna("") + " " + df["abstract"].fillna("")
    return df

def skf(train, n_splits, random_state):
    """
    Return stratified K-fold splits as a list of (train_idx, valid_idx) pairs.

    When n_splits is 1 or less no split is made and train.index is returned.
    NOTE(review): that return value is not a list of index pairs, so a caller
    that unpacks (train_idx, valid_idx) per fold cannot consume it -- confirm
    the intended behaviour for a single fold.
    """
    if n_splits <= 1:
        return train.index

    splitter = StratifiedKFold(
        n_splits=n_splits, random_state=random_state, shuffle=True)
    labels = train[Config.target_col]
    return list(splitter.split(train, labels))


def tokenize_texts(texts, tokenizer, max_length):
    """
    Tokenize texts in one batch and return the result as a plain dict.

    Every text is padded/truncated to max_length and token type ids are not
    requested, so with a Hugging Face tokenizer the keys are "input_ids" and
    "attention_mask".
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_token_type_ids": False,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    return dict(encoded)


def get_dataset(X, y=None, dataset="test"):
    """
    Convert the data into a batched, prefetched tf.data.Dataset.

    dataset="train": (X, y) pairs, repeated when Config.steps_per_epochs is
        set, shuffled with a 2048-element buffer.
    dataset="valid": (X, y) pairs in their original order.
    dataset="test":  X only, in its original order.
    Any other value yields None.
    """
    if dataset == "train":
        pipeline = tf.data.Dataset.from_tensor_slices((X, y))
        if Config.steps_per_epochs is not None:
            pipeline = pipeline.repeat()
        return (pipeline
                .shuffle(2048)
                .batch(Config.train_batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE))

    if dataset == "valid":
        return (tf.data.Dataset.from_tensor_slices((X, y))
                .batch(Config.valid_batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE))

    if dataset == "test":
        return (tf.data.Dataset.from_tensor_slices(X)
                .batch(Config.test_batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE))

    # Unknown dataset kind: nothing is built.
    return None




In [None]:
def weighted_binary_crossentropy(weight, label_smoothing):
    """
    Build a binary cross-entropy loss with a separate weight per label.

    Args:
        weight: two values, [weight for label 0, weight for label 1].
        label_smoothing: smoothing amount; 0 (or any falsy value) disables it.

    Returns:
        A function (target, output) -> per-element weighted cross-entropy,
        flattened to one dimension.
    """
    weight = tf.convert_to_tensor(weight, dtype=tf.float32)

    def _weighted_binary_crossentropy(target, output):
        """
        Cross-entropy with optional label smoothing, modelled on the Keras
        implementation.
        """
        # Convert to float before smoothing so the arithmetic below always
        # runs on float32 values.
        target = tf.convert_to_tensor(target, dtype=tf.float32)
        # Use the argument captured by the closure; the previous code read the
        # global Config.label_smoothing and ignored the value passed in.
        if label_smoothing:
            target = target * (1.0 - label_smoothing) + 0.5 * label_smoothing
        target = tf.reshape(target, [-1])

        output = tf.convert_to_tensor(output, dtype=tf.float32)
        output = tf.reshape(output, [-1])
        # Keep probabilities away from 0 and 1 so the logs stay finite.
        epsilon_ = K.epsilon()
        output = tf.clip_by_value(output, epsilon_, 1. - epsilon_)

        bce = weight[1] * target * tf.math.log(output + K.epsilon())
        bce += weight[0] * (1 - target) * tf.math.log(1 - output + K.epsilon())
        return -bce

    return _weighted_binary_crossentropy


def opt_fbeta_threshold(y_true, y_pred):
    """
    Find the decision threshold that maximises the F-beta score (beta=7).

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted probabilities (array-like supporting ``>=``).

    Returns:
        The optimised threshold as a Python float.

    Raises:
        ValueError: if Config.opt is neither "minimize" nor "minimize_scalar".
    """
    def opt_(x):
        # scipy minimises, so negate the score to maximise it.
        return -fbeta_score(y_true, y_pred>=x, beta=7)

    if Config.opt == "minimize":
        result = minimize(opt_, x0=np.array([0.1]), method="Powell")
    elif Config.opt == "minimize_scalar":
        result = minimize_scalar(opt_, bounds=(0.001, 0.85), method='bounded')
    else:
        # Previously this fell through to an UnboundLocalError on `result`.
        raise ValueError(f"Unsupported Config.opt: {Config.opt!r}")

    opted_threshold = result['x'].item()
    return opted_threshold


def metrics(y_true, y_pred):
    """
    Optimise the threshold and return (F-beta score at that threshold, threshold).

    The optimised threshold is also printed.
    """
    best_threshold = opt_fbeta_threshold(y_true, y_pred)
    print(f"opted threshold : {best_threshold}")
    best_score = fbeta_score(y_true, y_pred >= best_threshold, beta=7)
    return best_score, best_threshold






def get_model_and_tokenizer():
    """
    Build a fresh model (inside the TPU strategy scope when a TPU was detected)
    and load the matching tokenizer. Returns (model, tokenizer).
    """
    if not TPU:
        model = build_model()
    else:
        # Connect to and initialise the TPU, then create the model under its
        # distribution strategy.
        tf.config.experimental_connect_to_cluster(TPU)
        tf.tpu.experimental.initialize_tpu_system(TPU)
        strategy = tf.distribute.TPUStrategy(TPU)
        with strategy.scope():
            model = build_model()

    return model, transformers.AutoTokenizer.from_pretrained(Config.model)


def get_class_weight(target, weight):
    """
    Resolve the class-weight setting into the value handed to model.fit.

    weight=None       -> None (no class weighting)
    weight="balanced" -> {0: w0, 1: w1, ...} computed from `target` by sklearn
    anything else     -> returned unchanged, e.g. {0: 0.2, 1: 0.98}
    """
    if weight is None:
        return None

    if weight == "balanced":
        balanced = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(target),
            y=target)
        return {position: value for position, value in enumerate(balanced)}

    return weight


In [None]:
# Display of training results
def visualize_confusion_matrix(
        y_true,
        pred_label,
        height=.6,
        labels=None):
    """
    Draw the row-normalised confusion matrix as a heatmap, show it, and return
    the matplotlib figure.

    `height` scales the figure size per class; `labels`, when given, replaces
    the tick labels on both axes.
    """
    matrix = confusion_matrix(
        y_true=y_true, y_pred=pred_label, normalize='true')

    # Figure size grows with the number of classes.
    scale = len(matrix) * height
    fig, ax = plt.subplots(figsize=(scale * 4, scale * 3))

    sns.heatmap(matrix, cmap='Blues', ax=ax, annot=True, fmt='.2f')
    ax.set_ylabel('Label')
    ax.set_xlabel('Predict')

    if labels is not None:
        ax.set_yticklabels(labels)
        ax.set_xticklabels(labels)
        ax.tick_params('y', labelrotation=0)
        ax.tick_params('x', labelrotation=90)

    plt.show()
    return fig



def get_result(y_true, y_pred, thresholds=None):
    """
    Score the predictions at several candidate thresholds.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted probabilities.
        thresholds: optional list/tuple/array of thresholds (e.g. one per
            fold). When non-empty, their min, max, mean and median are scored
            as well.

    Returns:
        A DataFrame with columns "threshold" and "score", indexed by
        "opted" and "temporary", plus "min", "max", "mean" and "median" when
        thresholds were supplied.
    """
    naive_score = fbeta_score(y_true, y_pred >= Config.temp_thre, beta=7.0)
    opted_score, opted_thre = metrics(y_true, y_pred)
    type_list = ["opted", "temporary"]
    thre_list = [opted_thre, Config.temp_thre]
    score_list = [opted_score, naive_score]

    # Accept any common sequence type (not only an exact `list`) and skip the
    # summary rows for an empty sequence, where min()/max() would raise.
    if isinstance(thresholds, (list, tuple, np.ndarray)) and len(thresholds) > 0:
        thresholds = np.asarray(thresholds)
        summary = [
            ("min", thresholds.min()),
            ("max", thresholds.max()),
            ("mean", thresholds.mean()),
            ("median", np.median(thresholds)),
        ]
        for label, threshold in summary:
            type_list.append(label)
            thre_list.append(threshold)
            score_list.append(
                fbeta_score(y_true, y_pred >= threshold, beta=7.0))

    thre_df = pd.DataFrame(
        {"threshold": thre_list, "score": score_list}, index=type_list)

    return thre_df

In [None]:
def train_fn(train_df, valid_df, model, tokenizer, filepath):
    """
    Train `model` on one fold and checkpoint the best weights to `filepath`.

    Args:
        train_df: training rows (needs Config.text_col and Config.target_col).
        valid_df: validation rows with the same columns.
        model: compiled Keras model.
        tokenizer: tokenizer matching the model.
        filepath: where ModelCheckpoint saves the best weights.

    Returns:
        The Keras History object returned by model.fit.
    """
    # Data preparation
    tr_text = tokenize_texts(texts=train_df[Config.text_col].tolist(), tokenizer=tokenizer, max_length=Config.max_length)
    val_text = tokenize_texts(texts=valid_df[Config.text_col].tolist(), tokenizer=tokenizer, max_length=Config.max_length)

    tr_dataset = get_dataset(X=tr_text, y=train_df[Config.target_col].values, dataset="train")
    # Fixed: this previously referenced the undefined name `va_text`.
    val_dataset = get_dataset(X=val_text, y=valid_df[Config.target_col].values, dataset="valid")

    # Callbacks: keep only the best weights according to Config.check_monitor.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath,
        monitor=Config.check_monitor,
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode=Config.check_mode
        )

    # NOTE(review): early stopping watches "val_loss" while the checkpoint
    # watches Config.check_monitor -- confirm this difference is intended.
    earlystop = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=Config.patience
        )

    # fit
    history = model.fit(
        tr_dataset,
        epochs=Config.epochs,
        verbose=1,
        callbacks=[checkpoint, earlystop],
        validation_data=val_dataset,
        steps_per_epoch=Config.steps_per_epochs,
        class_weight=get_class_weight(train_df[Config.target_col], weight=Config.class_weight)
        )

    return history


def inference_fn(test_df, model, tokenizer, filepath):
    """
    Load the weights stored at `filepath` into `model` and predict on `test_df`.

    Returns the predictions flattened to a 1-D array.
    """
    model.load_weights(filepath)

    encoded = tokenize_texts(
        texts=test_df[Config.text_col].tolist(),
        tokenizer=tokenizer,
        max_length=Config.max_length)
    batches = get_dataset(X=encoded, y=None, dataset="test")

    return model.predict(batches).reshape(-1)


def train_cv(train, cv, metrics, name, dir):
    """
    Run cross-validated training and collect out-of-fold (oof) predictions.

    Args:
        train: full training DataFrame.
        cv: iterable of (train_idx, valid_idx) positional index pairs, one per fold.
        metrics: not referenced in this function body.
        name: prefix for the per-fold weight files.
        dir: directory where the per-fold weight files are stored.

    Returns:
        (oof, threshold_list): the out-of-fold predicted probabilities aligned
        with `train`, and the optimised threshold of each fold.
    """
    # Holds the oof predicted probabilities and the optimised thresholds.
    oof = np.zeros(len(train))
    threshold_list = []

    # fold training
    for i_fold, (tr_idx, val_idx) in enumerate(cv):
        # Reset Keras global state before building this fold's model.
        K.clear_session()
        print(f"\n===== FOLD {i_fold+1} training =====")
        filepath = os.path.join(dir, f"{name}_fold{i_fold+1}.h5")

        # Prepare the model, tokenizer and data.
        model, tokenizer = get_model_and_tokenizer()
        tr_df, val_df = train.iloc[tr_idx].reset_index(), train.iloc[val_idx].reset_index()
        

        if not os.path.isfile(filepath):  # skip training when saved weights already exist
            history = train_fn(tr_df, val_df, model, tokenizer, filepath)
            pd.DataFrame(history.history).to_csv(
                os.path.join(LOG, f"history{time_jp}_{i_fold+1}.csv"), 
                index=False)

        # Compute the oof predicted probabilities, then get the optimised
        # threshold and scores as a pd.DataFrame.
        preds = inference_fn(val_df, model, tokenizer, filepath)
        thre_df = get_result(val_df[Config.target_col], preds)

        # Optimised threshold for this fold.
        opted_thre = thre_df[thre_df.index=="opted"].threshold.values[0]

        logger.info(f"===== fold {i_fold+1} result =====")
        logger.info(f">>> {thre_df.to_dict()}")

        # Store the oof probabilities and the optimised threshold.
        oof[val_idx] = preds
        threshold_list.append(opted_thre)
    
    # Score the whole oof set with its own optimal threshold and with the
    # thresholds derived from threshold_list.
    thre_df = get_result(train[Config.target_col], oof, threshold_list)
    logger.info(f"===== total result =====")
    logger.info(f">>> threshold:{thre_df.to_dict()['threshold']}")
    logger.info(f">>> score:{thre_df.to_dict()['score']}")
    return oof, threshold_list


def predict_cv(test, name, dir):
    """
    Predict on `test` with every fold's saved weights and average the results.

    A fresh model is built per fold and loaded from
    `<dir>/<name>_fold<k>.h5`; the element-wise mean over folds is returned.
    """
    fold_predictions = []
    for fold_no in range(1, Config.n_fold + 1):
        weights_path = os.path.join(dir, f"{name}_fold{fold_no}.h5")
        model, tokenizer = get_model_and_tokenizer()

        fold_predictions.append(
            inference_fn(test, model, tokenizer, weights_path))

        logger.info(f"===== fold{fold_no} inference =====")

    return np.mean(fold_predictions, axis=0)

In [None]:
###

def submit_with_thresholds(type_list, threshold_list):
    """
    Write one submission CSV per (type, threshold) pair.

    The saved test probabilities (first seed's column) are binarised at each
    threshold and written to SUBMIT as `<model>_<time>_<type>.csv`, without
    header or index.
    """
    name = f"{Config.model.replace('/','-')}_{time_jp}"
    prob_df = pd.read_csv(os.path.join(PROB, f"prob_{name}.csv"))
    submit_df = pd.read_csv(
        os.path.join(INPUT, "sample_submit.csv"),
        header=None,
        names=["id", "judgement"])
    if Config.debug:
        submit_df = submit_df.iloc[get_data(Config.test_file).index.values]

    # One submission file per threshold.
    for key, threshold in zip(type_list, threshold_list):
        probabilities = prob_df[f"{name}_seed{Config.seeds[0]}"].values
        out_name = f"{name}_{key}.csv"
        submit_df["judgement"] = (probabilities >= threshold) * 1
        submit_df.to_csv(
            os.path.join(SUBMIT, out_name), index=False, header=False)
        logger.info(f"saved file : {out_name}")

実行　main()

In [None]:
def main():
    """
    Run the whole experiment: cross-validated training for every seed,
    out-of-fold evaluation, test-set prediction and (optionally) submission.

    Returns:
        The threshold/score DataFrame computed on the seed-averaged
        out-of-fold predictions.
    """
    model_name = f"{Config.model.replace('/','-')}_{time_jp}"
    logger.info(f"{model_name} TRAINING")
    # load data
    train_df = get_data(Config.train_file)
    test_df = get_data(Config.test_file)

    # training: one cross-validation run per seed
    oof_df = pd.DataFrame()
    threshold_list = []
    for seed in Config.seeds:
        name = f"{model_name}_seed{seed}"
        logger.info(f"***** SEED{seed} *****")
        oof, seed_thre_list = train_cv(
            train_df,
            cv=skf(train_df, n_splits=Config.n_fold, random_state=seed),
            metrics=metrics,
            name=name,
            dir=MODEL)
        oof_df[name] = oof
        threshold_list += seed_thre_list

    # save oof
    oof_df.to_csv(os.path.join(PROB, f"oof_{model_name}.csv"), index=False)
    logger.info(f"saved file : oof_{model_name}.csv")

    # record the score of the seed-averaged oof predictions
    y_true = train_df[Config.target_col].values
    y_pred = oof_df.mean(axis=1).values
    thre_df = get_result(y_true, y_pred, threshold_list)

    logger.info(f"***** seeds total result *****")
    logger.info(f">>> threshold:{thre_df.to_dict()['threshold']}")
    logger.info(f">>> score:{thre_df.to_dict()['score']}")

    # confusion matrix at the best threshold
    opted_thre = thre_df[thre_df.index=="opted"].threshold.values[0]
    fig = visualize_confusion_matrix(y_true, y_pred>=opted_thre)
    fig.savefig(os.path.join(LOG, f"cm_{time_jp}.png"), dpi=300)

    # save the test-set predicted probabilities, one column per seed
    preds_df = pd.DataFrame()
    for seed in Config.seeds:
        name = f"{model_name}_seed{seed}"
        preds = predict_cv(test_df, name, dir=MODEL)
        preds_df[name] = preds

    preds_df.to_csv(os.path.join(PROB, f"prob_{model_name}.csv"), index=False)
    # Log the name of the file actually written (the message previously used
    # the last seed's `name` instead of `model_name`).
    logger.info(f"saved file : prob_{model_name}.csv")

    # submit
    if Config.submit:
        submit_with_thresholds(thre_df.index, thre_df.threshold)

    return thre_df

## execute

In [None]:
# Run the full pipeline; main() returns the threshold/score table.
thre_df = main()

In [None]:
# Record the source of notebook cells 2 and 3 in the experiment log so the
# settings used for this run are kept alongside its results.
code_text = ""
# Notebook files are JSON; read them explicitly as UTF-8.
with open(os.path.join(DRIVE, "medical_bert_tf.ipynb"), mode="r", encoding="utf-8") as f:
    code = f.read()

# Parse the notebook once rather than once per cell.
notebook = json.loads(code)
for i in [2,3]:
    code_text += "".join(notebook["cells"][i]["source"])+"\n\n"

logger.info(code_text)

INFO:__main__:[2021-10-04 22:22:41] - class Config:
    model = "dmis-lab/biobert-base-cased-v1.2" #@param
    from_pt = True #@param {"type":"boolean"}
    encode_type = "cls_cat" #@param {"type","string"} ["cls","cls_cat","pooler","logits", "last_hidden_state_cnn", "last_hidden_state_lstm"]

    max_length = 512 #@param {"type":"integer"}
    lr = 0.00002
    weight_decay = 1e-5
    opt = "minimize"  #@param {"type":"string"} ["minimize_scalar","minimize"]
    n_fold = 5 #@param
    epochs = 15 #@param {"type":"slider"}
    patience =  4#@param
    check_monitor = "val_fbeta_score" #@param {"type":"string"} ["val_loss","val_fbeta_score","val_auc"]
    check_mode = "max" #@param {"type":"string"} ["auto", "max"]
    
    train_batch_size = 64 #@param {"type":"raw"} [4,8,16,32,64]
    valid_batch_size = 64 #@param {"type":"raw"} [4,8,16,32,64]
    test_batch_size = 64 #@param {"type":"raw"} [4,8,16,32,64]
    steps_per_epochs = None #(27145 * (n_fold - 1) / n_fold) // train_batch_size


## submit with thresholds

In [None]:
# Write the submission files again from the saved probability CSV, using the
# threshold types and values held in thre_df.
submit_with_thresholds(thre_df.index, thre_df.threshold)

INFO:__main__:[2021-09-20 22:13:55] - saved file : cambridgeltl-BioRedditBERT-uncased_20210920_1742_opted.csv
INFO:__main__:[2021-09-20 22:13:55] - saved file : cambridgeltl-BioRedditBERT-uncased_20210920_1742_temporary.csv
INFO:__main__:[2021-09-20 22:13:55] - saved file : cambridgeltl-BioRedditBERT-uncased_20210920_1742_min.csv
INFO:__main__:[2021-09-20 22:13:55] - saved file : cambridgeltl-BioRedditBERT-uncased_20210920_1742_max.csv
INFO:__main__:[2021-09-20 22:13:55] - saved file : cambridgeltl-BioRedditBERT-uncased_20210920_1742_mean.csv
