<a href="https://colab.research.google.com/github/Tyanakai/medical_paper_classification/blob/main/medical_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>医学論文の自動仕分けチャレンジ　アンサンブル</h1>


# １. はじめに
本ノートブックは、[medical_bert_tf.ipynb](https://github.com/Tyanakai/medical_paper_classification/blob/main/medical_bert_tf.ipynb)で得た数種類の予測値(probability)を元に、アンサンブル手法を評価し、提出ファイルを作成するものです。<br>
試行するアンサンブル手法は以下の通りです。


*   random forest
*   decision tree
*   svm
*   logistic regression

尚、Colaboratory上での実行を想定しています。


# ２. 事前に完了していること
- [medical_EDA.ipynb](https://github.com/Tyanakai/medical_paper_classification/blob/main/medical_EDA.ipynb)を実行
- [medical_bert_tf.ipynb](https://github.com/Tyanakai/medical_paper_classification/blob/main/medical_bert_tf.ipynb)を実行し、数種類のモデルでout of foldデータとtestデータに対する予測値(probability)を予測、保存

# ３. 環境準備
実行環境を構築します。

## 3.1 ライブラリ

In [None]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import minimize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

## 3.2 Google Driveマウント

In [None]:
# Mount Google Drive at /content/drive so the paths configured below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

## 3.3 config

In [None]:
class Config:
    """Notebook-wide settings read by the data-loading code."""

    # File names of the train / test tables; get_data() joins them onto INPUT.
    train_file = "p_train.csv"
    test_file = "p_test.csv"
    # Column name configured as the target.
    target_col = "judgement"
    # get_data() reads Config.debug and Config.seeds. They were not defined,
    # so the first call raised AttributeError. False keeps the full-data run.
    debug = False
    # seeds[0] seeds the debug subsample in get_data().
    # NOTE(review): 42 is a placeholder — align with the seeds used by the
    # notebook that produced the prediction files.
    seeds = (42,)

## 3.4 pathの設定

In [None]:
# Project root on the mounted Google Drive; every other directory hangs off it.
DRIVE = "/content/drive/MyDrive/signate/medical_paper"

# One constant per sub-directory of the project root.
INPUT, OUTPUT, SUBMIT, PROB = (
    os.path.join(DRIVE, sub_dir)
    for sub_dir in ("input", "output", "submit", "prob")
)

# ４. データ準備
[medical_bert_tf.ipynb](https://github.com/Tyanakai/medical_paper_classification/blob/main/medical_bert_tf.ipynb)で得た
oofとtestデータに対する予測値(probability)をそれぞれtrainデータ、testデータに結合します。
またアンサンブルで使用する線形モデル用に、標準化しておきます。

In [None]:
# Prediction files to ensemble.
# Every run contributes one out-of-fold file ("oof_<run>.csv") and one
# test-prediction file ("prob_<run>.csv"); both lists are built from the same
# run identifiers so they always stay in the same order.
_run_ids = [
    "microsoft-BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_20211004_1012",
    "microsoft-BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_20210918_0958",
    "microsoft-BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_20211001_1452",
    "cambridgeltl-SapBERT-from-PubMedBERT-fulltext_20211002_1505",
    "cambridgeltl-SapBERT-from-PubMedBERT-fulltext_20210920_1742",
    "cambridgeltl-SapBERT-from-PubMedBERT-fulltext_20210924_1715",
    "kamalkraj-bioelectra-base-discriminator-pubmed-pmc_20211004_1357",
    "dmis-lab-biobert-base-cased-v1.2_20211004_2013",
]
oof_file_names = [f"oof_{run_id}.csv" for run_id in _run_ids]
pred_file_names = [f"prob_{run_id}.csv" for run_id in _run_ids]

In [None]:
def get_data(file_name):
    """
    Load a CSV from the INPUT directory and add an abstract-length feature.

    Missing abstracts are replaced with "" and the character count of every
    abstract is stored in a new "abstract_len" column.

    Parameters
    ----------
    file_name : str
        File name, joined onto INPUT.

    Returns
    -------
    pandas.DataFrame
        The loaded table with "abstract" filled and "abstract_len" added.
        When Config.debug is truthy the table is a random subsample of at
        most 256 rows with a fresh 0..n-1 index.
    """
    df = pd.read_csv(os.path.join(INPUT, file_name))

    # Config may not define debug / seeds; treat a missing flag as
    # "not debugging" instead of failing with AttributeError.
    if getattr(Config, "debug", False):
        seeds = getattr(Config, "seeds", None)
        seed = seeds[0] if seeds else 0
        # Cap the sample size so tables shorter than 256 rows do not make
        # sample() raise.
        # NOTE(review): the index is reset here, so a subsampled table no
        # longer lines up row-for-row with full-length prediction files.
        df = df.sample(min(256, len(df)), random_state=seed).reset_index(drop=True)

    # preprocess
    df["abstract"] = df["abstract"].fillna("")
    df["abstract_len"] = df["abstract"].str.len()
    return df

In [None]:
# Load the train / test tables.
train_df = get_data(Config.train_file)
test_df = get_data(Config.test_file)

# Remember how many columns each table has before predictions are appended,
# so the prediction columns can be selected without a hard-coded offset.
n_train_base_cols = train_df.shape[1]
n_test_base_cols = test_df.shape[1]

# Append every model's predictions column-wise (pandas matches rows on the index).
train_df = pd.concat(
    [train_df] + [pd.read_csv(os.path.join(PROB, oof_f)) for oof_f in oof_file_names],
    axis=1,
)
test_df = pd.concat(
    [test_df] + [pd.read_csv(os.path.join(PROB, pred_f)) for pred_f in pred_file_names],
    axis=1,
)

# Standardise the prediction columns: fit on train, apply the same scaling to test.
# The previous `iloc[:, i+5]` ran before any `i` existed (NameError) and would
# have selected a single column; the scaler needs the whole 2-D block.
scaler = StandardScaler()
std_train = scaler.fit_transform(train_df.iloc[:, n_train_base_cols:].values)
std_test = scaler.transform(test_df.iloc[:, n_test_base_cols:].values)

# ５. アンサンブル評価
各種アンサンブル手法に対しパラメータを探索し、提出ファイルを作成します。

## 5.1 基準スコア
アンサンブルを行わない状態でのスコアを算出し、基準とします。

In [None]:
def opt_fbeta_threshold(y_true, y_pred):
    """
    Search for the decision threshold that maximises the F-beta score (beta=7).

    Runs scipy's Powell minimiser on the negated score, starting from 0.1,
    and returns the threshold it ends at as a Python scalar.
    """
    def negative_fbeta(threshold):
        predicted_labels = y_pred >= threshold
        return -fbeta_score(y_true, predicted_labels, beta=7)

    search = minimize(negative_fbeta, x0=0.1, method='Powell')
    return search['x'].item()


def metrics(y_true, y_pred):
    """
    F-beta (beta=7) score of `y_pred` binarised at its optimised threshold.

    The threshold comes from opt_fbeta_threshold, called on the same
    `y_true` / `y_pred` that are then scored.
    """
    threshold = opt_fbeta_threshold(y_true, y_pred)
    return fbeta_score(y_true, y_pred >= threshold, beta=7)

In [None]:
# Score each base model on its own: one threshold-optimised F-beta value per
# prediction file, read from the columns starting at position 5 of train_df.
base_score_list = []
labels = train_df["judgement"].values
for i, _ in enumerate(oof_file_names):
    single_model_prob = train_df.iloc[:, 5 + i].values
    base_score_list.append(metrics(labels, single_model_prob))

base_score_list

[0.9129741599735282,
 0.9081902245706738,
 0.9209888511875909,
 0.9116578054128742,
 0.91827781307676,
 0.904078728954233,
 0.9068252109419571,
 0.9170749193452024,
 0.9098552639481112]

## 5.2 random forest model

In [None]:
# Parameter space to search.
params = {
    "max_depth": np.arange(1, 10, 1),
    "n_estimators": np.arange(3, 100, 2),
}

# Scorer used to rank the candidates.
# NOTE(review): this uses beta=2 while opt_fbeta_threshold / metrics use beta=7 — confirm this is intended.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# Model.
rf_model = RandomForestClassifier(class_weight="balanced")

# Features are all prediction columns of train_df.
# The previous `iloc[:, i+5]` reused the loop index left over from the
# base-score cell and therefore passed one column as a 1-D array.
# Assumes the prediction columns start at position 5, as in the base-score cell — TODO confirm.
train_features = train_df.iloc[:, 5:].values

# Run the search.
gscv = GridSearchCV(rf_model, params, cv=5, verbose=1, scoring=ftwo_scorer)
gscv.fit(train_features, train_df.judgement.values)

# Show the result.
print(f"best params : {gscv.best_params_}, best score : {gscv.best_score_}")

In [None]:
# Predict the test labels with the best estimator found by the grid search.
best_model = gscv.best_estimator_
pred = best_model.predict(test_df.iloc[:, 4:])

# Fill the sample submission table with the predictions and save it.
sample_path = os.path.join(INPUT, "sample_submit.csv")
submit_df = pd.read_csv(sample_path, header=None, names=["id", "judgement"])
submit_df["judgement"] = pred.astype(np.int16)

submit_path = os.path.join(SUBMIT, "ensemble_1004_rf.csv")
submit_df.to_csv(submit_path, index=False, header=False)
submit_df

## 5.3 decision tree model

In [None]:
# Parameter space to search.
params = {
    "max_depth": np.arange(1, 10, 1),
}

# Scorer used to rank the candidates.
# NOTE(review): this uses beta=2 while opt_fbeta_threshold / metrics use beta=7 — confirm this is intended.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# Model. The max_depth given here is replaced by the grid values during the search.
dt_model = DecisionTreeClassifier(max_depth=2, class_weight="balanced")

# Features are all prediction columns of train_df.
# The previous `iloc[:, i+5]` reused the loop index left over from the
# base-score cell and therefore passed one column as a 1-D array.
# Assumes the prediction columns start at position 5, as in the base-score cell — TODO confirm.
train_features = train_df.iloc[:, 5:].values

# Run the search.
gscv = GridSearchCV(dt_model, params, cv=5, verbose=1, scoring=ftwo_scorer)
gscv.fit(train_features, train_df.judgement.values)

# Show the result.
print(f"best params : {gscv.best_params_}, best score : {gscv.best_score_}")

In [None]:
# Predict the test labels with the best estimator found by the grid search.
best_model = gscv.best_estimator_
test_features = test_df.iloc[:, 4:]
pred = best_model.predict(test_features)

# Load the sample submission, overwrite its label column, and write the file.
submit_df = pd.read_csv(
    os.path.join(INPUT, "sample_submit.csv"),
    names=["id", "judgement"],
    header=None,
)
submit_df["judgement"] = pred.astype(np.int16)
submit_df.to_csv(
    os.path.join(SUBMIT, "ensemble_1004_dt.csv"),
    header=False,
    index=False,
)
submit_df

Unnamed: 0,id,judgement
0,27145,0
1,27146,0
2,27147,0
3,27148,0
4,27149,0
...,...,...
40829,67974,0
40830,67975,0
40831,67976,0
40832,67977,0


## 5.4 svm model

In [None]:
# Parameter space to search, one grid per kernel.
# SVC ignores `degree` for every kernel except 'poly' and ignores `gamma` for
# 'linear', so crossing all four parameters refits many identical models.
# Splitting the grid keeps every distinct candidate and drops the duplicates.
c_values = [1, 10, 100]
gamma_values = np.linspace(0.01, 1.0, 50)
params = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
    {
        'kernel': ['poly'],
        'C': c_values,
        'gamma': gamma_values,
        'degree': np.arange(1, 6, 1),
    },
]

# Scorer used to rank the candidates.
# NOTE(review): this uses beta=2 while opt_fbeta_threshold / metrics use beta=7 — confirm this is intended.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# Model.
svm_model = SVC(class_weight="balanced")

# Run the search on the standardised features.
gscv = GridSearchCV(svm_model, params, cv=5, verbose=1, scoring=ftwo_scorer)
gscv.fit(std_train, train_df.judgement.values)

# Show the result.
print(f"best params : {gscv.best_params_}, best score : {gscv.best_score_}")

In [None]:
# Predict the test labels from the standardised features with the best estimator.
best_model = gscv.best_estimator_
pred = best_model.predict(std_test)

# Write the predictions into the sample submission layout and save the file.
sample_path = os.path.join(INPUT, "sample_submit.csv")
submit_path = os.path.join(SUBMIT, "ensemble_1004_svm.csv")

submit_df = pd.read_csv(sample_path, header=None, names=["id", "judgement"])
submit_df["judgement"] = pred.astype(np.int16)
submit_df.to_csv(submit_path, index=False, header=False)
submit_df

Unnamed: 0,id,judgement
0,27145,0
1,27146,0
2,27147,0
3,27148,0
4,27149,0
...,...,...
40829,67974,0
40830,67975,0
40831,67976,0
40832,67977,1


## 5.5 logistic regression

In [None]:
# Parameter space to search: C takes the integer values 1 through 9.
params = {'C': np.arange(1, 10)}

# Scorer used to rank the candidates (F-beta with beta=2).
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# Logistic regression with balanced class weights.
lr_model = LogisticRegression(class_weight="balanced")

# Grid search with cv=5 on the standardised features.
gscv = GridSearchCV(
    estimator=lr_model,
    param_grid=params,
    scoring=ftwo_scorer,
    cv=5,
    verbose=1,
)
gscv.fit(std_train, train_df.judgement.values)

# Report the winning parameters and their cross-validated score.
print(f"best params : {gscv.best_params_}, best score : {gscv.best_score_}")

In [None]:
# Predict the test labels from the standardised features with the best estimator.
best_model = gscv.best_estimator_
pred = best_model.predict(std_test)

# Load the sample submission, overwrite its label column, and write the file.
submit_df = pd.read_csv(
    os.path.join(INPUT, "sample_submit.csv"),
    names=["id", "judgement"],
    header=None,
)
submit_df["judgement"] = pred.astype(np.int16)
submit_df.to_csv(
    os.path.join(SUBMIT, "ensemble_1004_lr.csv"),
    header=False,
    index=False,
)
submit_df