# Dependencies

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project folder on the mounted Drive. Anchored at the absolute '/content/drive'
# mount point so that it does not depend on the current working directory.
root_path = "/content/drive/MyDrive/NLP/Text Classification/"

In [None]:
! pip install Sastrawi
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Libraries

In [None]:
import numpy as np
import pandas as pd
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.model_selection import train_test_split
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange
from transformers import BertTokenizer, BertForSequenceClassification

# Data + Preprocessing

## Reading the Data

In [None]:
def tokenize(sent):
    """Split a sentence on whitespace and keep only tokens longer than one character.

    Note: a later cell defines another ``tokenize`` with a different signature,
    which replaces this function once that cell has been run.
    """
    return [word for word in sent.split() if len(word) > 1]

In [None]:
def split_dataframe(df):
    """Separate a dataframe into cleaned text features and labels.

    The ``text_a`` column is lower-cased and then passed, row by row, through
    Sastrawi's stop-word remover. The ``label`` column is returned unchanged.

    Returns:
        tuple: ``(features, labels)``.
    """
    lowered = df.loc[:, "text_a"].str.lower()

    remover = StopWordRemoverFactory().create_stop_word_remover()
    cleaned = lowered.apply(remover.remove)

    labels = df.loc[:, "label"]
    return cleaned, labels

### Training Data

In [None]:
# Load the training set and discard the "Unnamed: 0" column (presumably a saved index).
df_train = pd.read_csv(f"{root_path}train.csv").drop(columns="Unnamed: 0")

In [None]:
# Build the preprocessed training texts and their labels.
X_train, y_train = split_dataframe(df_train)
# Debugging aid: uncomment to work with only the first 100 rows.
# X_train, y_train = X_train[:100], y_train[:100]

### Test Data

In [None]:
# Load the test set.
# NOTE(review): unlike the training cell, no "Unnamed: 0" column is dropped here — confirm test.csv has none.
df_test = pd.read_csv(f"{root_path}test.csv")

In [None]:
# Build the test texts and labels with the same preprocessing function used for the training set.
X_test, y_test = split_dataframe(df_test)

## EDA

In [None]:
# Average number of whitespace-separated words per sentence in the raw training text
df_train["text_a"].apply(lambda x: len(x.split())).mean()

15.61811953150317

# BERT

## Labelling

In [None]:
# Mapping from the string labels to integer class ids.
label_to_code = {'no': 0, 'yes': 1}

# NOTE: y_train / y_test are rebound to their encoded versions, so re-running this cell
# without first re-running the split cells raises KeyError (the values are then already 0/1).
y_train = y_train.apply(lambda x : label_to_code[x])
y_test = y_test.apply(lambda x : label_to_code[x])

## Tokenize

In [None]:
def _encode_texts(tokenizer, texts, max_length):
  """Encode an iterable of strings into fixed-length (input_ids, attention_mask) tensors."""
  encoded = tokenizer(
      list(texts),
      add_special_tokens = True,
      max_length = max_length,
      padding = 'max_length',   # pad every sentence up to max_length
      truncation = True,        # cut sentences longer than max_length
      return_attention_mask = True,
      return_tensors = 'pt'
  )
  return encoded['input_ids'], encoded['attention_mask']


def tokenize(train_data, test_data, max_length, train_labels = None, test_labels = None, model_name = 'bert-base-uncased'):
  """Tokenize the train and test texts and convert their labels to tensors.

  Args:
    train_data: iterable of training sentences (strings).
    test_data: iterable of test sentences (strings).
    max_length: number of tokens each sentence is padded / truncated to.
    train_labels: integer labels for ``train_data``. When omitted, the
      notebook-level ``y_train`` is used (the original behaviour).
    test_labels: integer labels for ``test_data``. When omitted, the
      notebook-level ``y_test`` is used (the original behaviour).
    model_name: name of the pretrained tokenizer to load. It should match the
      checkpoint used by the classifier.

  Returns:
    tuple: ``(input_ids_train, attention_mask_train, y_tf_train,
    input_ids_test, attention_mask_test, y_tf_test)``.
  """
  # Backward-compatible fallback to the globals the original implementation read.
  if train_labels is None:
    train_labels = y_train
  if test_labels is None:
    test_labels = y_test

  tokenizer = BertTokenizer.from_pretrained(
    model_name,
    do_lower_case = True
  )

  input_ids_train, attention_mask_train = _encode_texts(tokenizer, train_data, max_length)
  input_ids_test, attention_mask_test = _encode_texts(tokenizer, test_data, max_length)

  # Go through a NumPy array so the conversion is positional and does not
  # depend on the index of a pandas Series.
  y_tf_train = torch.tensor(np.asarray(train_labels))
  y_tf_test = torch.tensor(np.asarray(test_labels))

  return input_ids_train, attention_mask_train, y_tf_train, input_ids_test, attention_mask_test, y_tf_test

## Data Load

In [None]:
def data_load(input_ids_train, attention_mask_train, y_tf_train, input_ids_test, attention_mask_test, y_tf_test, batch_size = 32) :
  """Wrap the encoded tensors in PyTorch DataLoaders.

  Args:
    input_ids_train, attention_mask_train, y_tf_train: tensors of the training
      split; they must share the same first dimension.
    input_ids_test, attention_mask_test, y_tf_test: tensors of the test split;
      they must share the same first dimension.
    batch_size: number of samples per batch for both loaders (default 32).

  Returns:
    tuple: ``(train_dl, test_dl)``. The training loader draws samples in random
    order; the test loader keeps the original order.
  """
  train_set = TensorDataset(input_ids_train, attention_mask_train, y_tf_train)
  test_set = TensorDataset(input_ids_test, attention_mask_test, y_tf_test)

  # Random order for training
  train_dl = DataLoader(
      train_set,
      sampler = RandomSampler(train_set),
      batch_size = batch_size
  )

  # Fixed order for evaluation
  test_dl = DataLoader(
      test_set,
      sampler = SequentialSampler(test_set),
      batch_size = batch_size
  )

  return train_dl, test_dl

## Evaluation Metrics

In [None]:
def calculate_tp(preds, labels):
  """Count true positives: pairs where the prediction equals the label and is 1."""
  count = 0
  for predicted, actual in zip(preds, labels):
    count += (predicted == actual and predicted == 1)
  return count

def calculate_fp(preds, labels):
  """Count false positives: pairs where the prediction is 1 but differs from the label."""
  count = 0
  for predicted, actual in zip(preds, labels):
    count += (predicted != actual and predicted == 1)
  return count

def calculate_tn(preds, labels):
  """Count true negatives: pairs where the prediction equals the label and is 0."""
  count = 0
  for predicted, actual in zip(preds, labels):
    count += (predicted == actual and predicted == 0)
  return count

def calculate_fn(preds, labels):
  """Count false negatives: pairs where the prediction is 0 but differs from the label."""
  count = 0
  for predicted, actual in zip(preds, labels):
    count += (predicted != actual and predicted == 0)
  return count

def metrics(preds, labels):
  '''
  Compute evaluation metrics from per-class scores.

  `preds` holds one row of class scores per sample (the predicted class is the
  argmax over axis 1); `labels` holds the true class ids.

  Returns the tuple (accuracy, precision, recall) where
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP), or the string 'nan' when TP + FP == 0
    - recall      = TP / (TP + FN), or the string 'nan' when TP + FN == 0
  '''
  predicted = np.argmax(preds, axis = 1).flatten()
  actual = labels.flatten()

  # Confusion-matrix counts
  tp = calculate_tp(predicted, actual)
  tn = calculate_tn(predicted, actual)
  fp = calculate_fp(predicted, actual)
  fn = calculate_fn(predicted, actual)

  accuracy = (tp + tn) / len(actual)

  predicted_positive = tp + fp
  if predicted_positive > 0:
    precision = tp / predicted_positive
  else:
    precision = 'nan'

  actual_positive = tp + fn
  if actual_positive > 0:
    recall = tp / actual_positive
  else:
    recall = 'nan'

  return accuracy, precision, recall

## Model

In [None]:
class bert_model :
  """Fine-tuning wrapper around a 2-label ``BertForSequenceClassification``.

  The pretrained checkpoint is fixed to ``'bert-base-uncased'``. The model is
  placed on the GPU when one is available and on the CPU otherwise.
  """

  def __init__(self, learning_rate, epochs):
    """Load the pretrained model and create its AdamW optimizer.

    Args:
      learning_rate: learning rate passed to AdamW.
      epochs: number of passes over the training loader made by ``train``.
    """
    self.learning_rate = learning_rate
    self.epochs = epochs

    self.model_name = 'bert-base-uncased'

    # Fall back to CPU so the notebook still runs without a GPU runtime.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    self.model = BertForSequenceClassification.from_pretrained(
        self.model_name,
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
    )
    self.model.to(self.device)

    self.optimizer = torch.optim.AdamW(self.model.parameters(), lr = self.learning_rate)

  @staticmethod
  def _format_metric(name, value):
    """Build one validation report line; string values (undefined metrics) print as NaN."""
    if isinstance(value, str):
      return '\t - Validation {}: NaN'.format(name)
    return '\t - Validation {}: {:.4f}'.format(name, value)

  def train(self, train_dl, test_dl):
    """Fine-tune the model and report validation metrics after every epoch.

    Args:
      train_dl: loader yielding ``(input_ids, attention_mask, labels)`` batches
        used for optimisation.
      test_dl: loader yielding the same kind of batches, used for validation.

    After each epoch the mean training loss is printed together with accuracy,
    precision and recall. The metrics are computed once over all validation
    samples (not averaged over batches), so unequal batch sizes and batches
    without positive samples do not distort them.
    """
    for _ in trange(self.epochs, desc = 'Epoch'):
      # ---- Training ----
      self.model.train()

      total_loss = 0.0
      steps = 0
      for batch in train_dl:
        batch_input_ids, batch_input_mask, batch_labels = tuple(t.to(self.device) for t in batch)
        self.optimizer.zero_grad()

        # Forward (passing labels makes the model return the loss)
        forward_out = self.model(batch_input_ids,
                            token_type_ids = None,
                            attention_mask = batch_input_mask,
                            labels = batch_labels)

        # Backward
        forward_out.loss.backward()
        self.optimizer.step()

        total_loss += forward_out.loss.item()
        steps += 1

      # ---- Validation ----
      self.model.eval()

      all_logits = []
      all_labels = []

      for batch in test_dl:
        batch_input_ids, batch_input_mask, batch_labels = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
          eval_out = self.model(batch_input_ids,
                            token_type_ids = None,
                            attention_mask = batch_input_mask)
        all_logits.append(eval_out.logits.detach().cpu().numpy())
        all_labels.append(batch_labels.to('cpu').numpy())

      if all_logits:
        # One metric computation over the whole validation set.
        accuracy, precision, recall = metrics(
            np.concatenate(all_logits, axis = 0),
            np.concatenate(all_labels, axis = 0)
        )
      else:
        # Empty validation loader: nothing to measure.
        accuracy = precision = recall = 'nan'

      mean_loss = total_loss / steps if steps > 0 else float('nan')

      print('\n\t - Train loss: {:.4f}'.format(mean_loss))
      print(self._format_metric('Accuracy', accuracy))
      print(self._format_metric('Precision', precision))
      print(self._format_metric('Recall', recall))

## Execute

### Data

In [None]:
# Encode both splits, padding / truncating every sentence to 32 tokens, then wrap them in DataLoaders.
input_ids_train, attention_mask_train, y_tf_train, input_ids_test, attention_mask_test, y_tf_test = tokenize(X_train, X_test, 32)
train_dl, test_dl = data_load(input_ids_train, attention_mask_train, y_tf_train, input_ids_test, attention_mask_test, y_tf_test)

### Models

In [None]:
# Hyper-parameter grid: every learning rate paired with every epoch count,
# in the order (5e-5, 2), (5e-5, 3), (2e-5, 2), (2e-5, 3).
models = [
    {'learning_rate' : learning_rate, 'epochs' : epochs}
    for learning_rate in (5e-5, 2e-5)
    for epochs in (2, 3)
]

### Execute Per Model

In [None]:
# Experiment 1: train a fresh model with the first hyper-parameter set.
print(f"Learning rate : {models[0]['learning_rate']}")
print(f"Epochs : {models[0]['epochs']}")
bert = bert_model(
    learning_rate = models[0]['learning_rate'],
    epochs = models[0]['epochs'],
)
bert.train(train_dl = train_dl, test_dl = test_dl)

Learning rate : 5e-05
Epochs : 2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


	 - Train loss: 0.3778
	 - Validation Accuracy: 0.8548
	 - Validation Precision: 0.6671
	 - Validation Recall: 0.7281


Epoch: 100%|██████████| 2/2 [04:40<00:00, 140.28s/it]


	 - Train loss: 0.2522
	 - Validation Accuracy: 0.8604
	 - Validation Precision: 0.7602
	 - Validation Recall: 0.5654





In [None]:
# Experiment 2: train a fresh model with the second hyper-parameter set.
print(f"Learning rate : {models[1]['learning_rate']}")
print(f"Epochs : {models[1]['epochs']}")
bert = bert_model(
    learning_rate = models[1]['learning_rate'],
    epochs = models[1]['epochs'],
)
bert.train(train_dl = train_dl, test_dl = test_dl)

Learning rate : 5e-05
Epochs : 3


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


	 - Train loss: 0.4212
	 - Validation Accuracy: 0.7457
	 - Validation Precision: NaN
	 - Validation Recall: 0.0000


Epoch:  67%|██████▋   | 2/3 [04:54<02:27, 147.47s/it]


	 - Train loss: 0.3178
	 - Validation Accuracy: 0.8434
	 - Validation Precision: 0.6306
	 - Validation Recall: 0.7502


Epoch: 100%|██████████| 3/3 [07:22<00:00, 147.47s/it]


	 - Train loss: 0.2481
	 - Validation Accuracy: 0.8391
	 - Validation Precision: 0.6200
	 - Validation Recall: 0.7697





In [None]:
# Experiment 3: train a fresh model with the third hyper-parameter set.
print(f"Learning rate : {models[2]['learning_rate']}")
print(f"Epochs : {models[2]['epochs']}")
bert = bert_model(
    learning_rate = models[2]['learning_rate'],
    epochs = models[2]['epochs'],
)
bert.train(train_dl = train_dl, test_dl = test_dl)

Learning rate : 2e-05
Epochs : 2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


	 - Train loss: 0.3907
	 - Validation Accuracy: 0.8587
	 - Validation Precision: 0.7410
	 - Validation Recall: 0.6126


Epoch: 100%|██████████| 2/2 [04:55<00:00, 147.66s/it]


	 - Train loss: 0.2701
	 - Validation Accuracy: 0.8548
	 - Validation Precision: 0.6916
	 - Validation Recall: 0.6698





In [None]:
# Experiment 4: train a fresh model with the fourth hyper-parameter set.
print(f"Learning rate : {models[3]['learning_rate']}")
print(f"Epochs : {models[3]['epochs']}")
bert = bert_model(
    learning_rate = models[3]['learning_rate'],
    epochs = models[3]['epochs'],
)
bert.train(train_dl = train_dl, test_dl = test_dl)

Learning rate : 2e-05
Epochs : 3


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


	 - Train loss: 0.3892
	 - Validation Accuracy: 0.8097
	 - Validation Precision: 0.5622
	 - Validation Recall: 0.8274


Epoch:  67%|██████▋   | 2/3 [04:55<02:27, 147.72s/it]


	 - Train loss: 0.2735
	 - Validation Accuracy: 0.8622
	 - Validation Precision: 0.7099
	 - Validation Recall: 0.6526


Epoch: 100%|██████████| 3/3 [07:23<00:00, 147.71s/it]


	 - Train loss: 0.2002
	 - Validation Accuracy: 0.8157
	 - Validation Precision: 0.5699
	 - Validation Recall: 0.8265





# Conclusion

Dari keempat eksperimen, dapat disimpulkan bahwa model terbaik adalah model dengan learning rate $5 \times 10^{-5}$ dan 3 epoch, dengan hasil metrik evaluasi pada epoch terakhir:
* Accuracy = 0.8391
* Precision = 0.6200
* Recall = 0.7697


# References

https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894