# Persiapan (mount Google Drive, install Transformers, dan import package yang perlu)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 15.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 68.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('INFO')

# Load data dan preprocess data awal

In [4]:
train_doc = pd.read_csv('/content/drive/MyDrive/IR/nfcorpus/train.docs',
                        sep='\t', names=['doc_id', 'content'], index_col='doc_id')

train_query_nt = pd.read_csv('/content/drive/MyDrive/IR/nfcorpus/train.nontopic-titles.queries',
                        sep='\t', names=['query_id', 'nontopic_query'], index_col='query_id')
train_query_vdesc = pd.read_csv('/content/drive/MyDrive/IR/nfcorpus/train.vid-desc.queries',
                        sep='\t', names=['query_id', 'vid_desc_query'], index_col='query_id')
train_query_vid = pd.read_csv('/content/drive/MyDrive/IR/nfcorpus/train.vid-titles.queries',
                        sep='\t', names=['query_id', 'vid_titles_query'], index_col='query_id')
train_qrel = pd.read_csv('/content/drive/MyDrive/IR/nfcorpus/train.3-2-1.qrel',
                        sep='\t', names=['query_id','dump','doc_id', 'relevance'])

train_qrel = train_qrel.drop(columns=['dump'])

In [5]:
print(train_qrel.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139350 entries, 0 to 139349
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   query_id   139350 non-null  object
 1   doc_id     139350 non-null  object
 2   relevance  139350 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.2+ MB
None


## Menggabungkan query berjenis vid-titles, vid-desc, dan titles menjadi satu query

In [6]:
train_query_vid['vid_titles_query'] = train_query_vid['vid_titles_query'] \
                                      + train_query_vdesc['vid_desc_query']
train_query = train_query_nt.merge(train_query_vid, how='left', on='query_id')
train_query['query'] = train_query['nontopic_query'] \
                      + (' ' + train_query['vid_titles_query']).fillna('')
train_query = train_query.drop(columns=['nontopic_query', 'vid_titles_query'])
print(train_query)

                                                       query
query_id                                                    
PLAIN-10                 how contaminated are our children ?
PLAIN-100       cancer and the animal-to-plant protein ratio
PLAIN-103         how plant-based diets may extend our lives
PLAIN-104  a low methionine diet may help starve cancer c...
PLAIN-105  how animal proteins may trigger autoimmune dis...
...                                                      ...
PLAIN-94                    how to design a misleading study
PLAIN-95           how grapefruit affects prescription drugs
PLAIN-97   test to see if your diet is alkaline or acid f...
PLAIN-98            does animal protein cause osteoporosis ?
PLAIN-99                  quadrupling breast cancer survival

[1141 rows x 1 columns]


# Persiapan model BioClinicalBERT dan tokenizer-nya

In [7]:
from transformers import AutoTokenizer, TFBertModel

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
bert_model = TFBertModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", \
                                         output_attentions=True, \
                                         from_pt = True)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

# Preprocess data lanjutan

In [8]:
# relevance judgement tidak perlu one-hot encoding karena menggunakan
# SparseCategoricalCrossEntropy loss (dapat menerima labelnya langsung)

qd_train = []
labels = []
for q,d, r in zip(train_qrel['query_id'], train_qrel['doc_id'], train_qrel['relevance']):
  try:
    r = r-1
    qd_train.append((train_query.loc[q]['query'], train_doc.loc[d]['content']))
    labels.append(r)
  except KeyError:
    continue

In [9]:
def encode_texts(documents, max_length = None):
  if max_length == None:
    return tokenizer(documents, padding=True, truncation=True, return_tensors='tf')
  else:
    return tokenizer(documents, padding='max_length', max_length=max_length, truncation=True, return_tensors='tf')

In [10]:
# https://stackoverflow.com/questions/8290397/how-to-split-an-iterable-in-constant-size-chunks
# membuat batch agar tidak meledak RAM-nya
BATCH_SIZE = 4
def batch(qd_train, labels, batch_size=BATCH_SIZE):
    l = len(labels)
    for ndx in range(0, l, batch_size):
        yield qd_train[ndx:min(ndx + batch_size, l)], \
              labels[ndx:min(ndx + batch_size, l)]

In [11]:
batched_dataset = batch(qd_train, labels)

# Inisialisasi model BioClinicalBERT

In [12]:
from transformers import TFBertModel
from keras.layers import Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy

class MedlineModel(tf.keras.Model):

    def __init__(self, num_class=3,
                 model_name="emilyalsentzer/Bio_ClinicalBERT", dropout_prob=0.1):
        super().__init__(name="Medline_Model")
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.dropout = Dropout(dropout_prob)
        self.dense_classifier = Dense(num_class,name="dense_classifier")

    def call(self, inputs, **kwargs):
        # get pooler output for CLS embedding
        trained_bert = self.bert(inputs, **kwargs)
        cls_embed = trained_bert.pooler_output
        
        sequence_output = self.dropout(cls_embed,
                                       training=kwargs.get("training", False))
        output_logits = self.dense_classifier(sequence_output)

        return output_logits


In [13]:
model = MedlineModel()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

# Atur flag LOAD_WEIGHTS

In [14]:
# LOAD_WEIGHTS digunakan untuk melanjutkan training dari weight epoch sebelumnya.
# jika True, maka training akan dimulai dr weight sebelumnya.
# jika False, maka training dimulai dari weight random.

LOAD_WEIGHTS = True
if LOAD_WEIGHTS:
  model_shape = tokenizer('dummy', 'text', padding=True, truncation=True, max_length=300, return_tensors='tf')
  model(model_shape)
  model.load_weights('/content/drive/MyDrive/IR/model/weight_bioclinicalbert_epoch_10.h5')

# Definisikan loss function dan gradient descentnya secara manual

In [15]:
# Devlin et al. (penulis BERT) mengusulkan learning rate
# 3e-4, 1e-4, 5e-5, dan 3e-5 sebagai starting point, untuk Adam optimizer
LEARNING_RATE = 1e-4
RANDOM_STATE = 42
MAX_LENGTH = 300

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam(learning_rate = LEARNING_RATE)

In [16]:
# https://www.tensorflow.org/tutorials/customization/custom_training_walkthrough#train_the_model
# Untuk menghindari data yang terlalu besar dalam sekali batch
# Buat fungsi sendiri untuk loss function dan gradient descentnya

def loss(model, encoded_qd, y, training):
  # training=training is needed only if there are layers with different
  # behavior during training versus inference (e.g. Dropout).
  y_ = model(encoded_qd, training=training)

  return loss_object(y_true=y, y_pred=y_)

def grad(model, encoded_qd, targets):
  with tf.GradientTape() as tape:
    loss_value = loss(model, encoded_qd, targets, training = True)
  return loss_value, tape.gradient(loss_value, model.trainable_variables)

# Kode untuk training

In [17]:
from tqdm import tqdm

CURR_EPOCH = 11
EPOCHS = 1
for epoch in range(EPOCHS):
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

  for qd, y in tqdm(batched_dataset, total=int(len(labels)/BATCH_SIZE)):
    # Optimize the model
    encoded_qd = encode_texts(qd, MAX_LENGTH)
    y = np.array(y)
    loss_value, grads = grad(model, encoded_qd, y)
    # optimizer.apply_gradients([
    #       (grad, var) 
    #       for (grad, var) in zip(grads, model.trainable_variables)
    #       if grad is not None
    #     ])
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # # Track progress
    epoch_loss_avg.update_state(loss_value)  # Add current batch loss
    # # Compare predicted label to actual label
    # # training=True is needed only if there are layers with different
    # # behavior during training versus inference (e.g. Dropout).
    epoch_accuracy.update_state(y, model(encoded_qd, training=True))

  model.save_weights(f"/content/drive/MyDrive/IR/model/weight_bioclinicalbert_epoch_{CURR_EPOCH}.h5")
  print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(CURR_EPOCH,
                                                              epoch_loss_avg.result(),
                                                              epoch_accuracy.result()))

9346it [1:37:58,  1.59it/s]


Epoch 011: Loss: 0.732, Accuracy: 74.079%
