In [1]:
# to supress the output here
%%capture 

# install prerequisites
!pip install simpletransformers
!pip install sentence_transformers

In [2]:
# Mount Google Drive into the Colab filesystem; the datasets are read from, and the
# models/results written to, a folder below this mount point (see the `dir` setting).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd, numpy as np
import sklearn
import seaborn as sns

from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report, confusion_matrix

## Prerequisites:

The cell below allows the user to specify which transformer model to run and where the data is located. \
The user should also select which training data to use, for either in-domain or cross-domain testing.

In [9]:
# Hugging Face model to fine-tune. Should already be set correctly; choose from:
# "diptanu/fBERT", "bert-base-uncased".
model_name = "bert-base-uncased"
# Which training data to use: 'cross' trains on the HASOC data (cross-domain),
# 'in' trains on the OLID data (in-domain). The test data is always OLID.
domain = 'cross' # or select 'in'

# Folder that contains the `data` folder (which in turn contains all the required datasets).
# Keep the trailing '/': other cells build paths as f'{dir}data/...' and f'{dir}outputs/'.
# NOTE(review): this name shadows Python's builtin `dir()`; it is kept because other cells read it.
dir = '/content/drive/MyDrive/subjectivity_mining/'


In [10]:
# Map each supported `domain` setting to the file prefix of its training corpus.
# Every run is tested on the OLID test set, so training on HASOC is the cross-domain
# setting and training on OLID is the in-domain setting.
corpus_by_domain = {'cross': 'hasoc', 'in': 'olid'}
if domain not in corpus_by_domain:
    # Fail loudly instead of silently falling back to the in-domain data on a typo.
    raise ValueError(
        f"Unknown domain {domain!r}; expected one of {sorted(corpus_by_domain)}"
    )
corpus = corpus_by_domain[domain]

# Small train split + dev split are used for hyperparameter tuning,
# the full training set is used to train the final model.
train_df = pd.read_csv(f'{dir}data/{corpus}-train-small.csv')  # for fine-tuning
dev_df = pd.read_csv(f'{dir}data/{corpus}-dev.csv')  # for fine-tuning
full_train_df = pd.read_csv(f'{dir}data/{corpus}-train-all.csv')

# general test data (the same for both settings)
test_df = pd.read_csv(f'{dir}data/olid-test.csv')
# where to store the final model and intermediate results
save_dir = f'{dir}outputs/'

# Fine-tuning

The cell below runs the fine-tuning of the models.\
So far, only the learning rate is tuned. To avoid training for unnecessarily many epochs, we enable early stopping, which keeps the model from over-training when its performance is continuously decreasing.

A manual seed is set. This is done to avoid having to run many iterations to account for the stochastic nature of deep learning models. We are constrained by time, and a faster option is to use only one seed. The seed was not specifically selected for its performance.

During this setup, only the final model is saved (however not used).
The outcome is shown in the cell below using MCC scores. The highest-performing configuration is used for training the final model.

In [None]:
# Hyperparameter search over the learning rate.
# Run this cell first and the next cell once it has finished. Expect roughly 15-30 minutes.
learning_rates = [2e-5, 3e-5, 4e-5]

for lr in learning_rates:
    # Fresh configuration per candidate: only the learning rate and the output folder vary.
    model_args = dict(
        max_seq_length=64,
        train_batch_size=8,
        eval_batch_size=8,
        num_train_epochs=5,
        learning_rate=lr,
        adam_epsilon=1e-8,
        output_dir=f'{save_dir}/temp_{model_name}_{domain}_model_{lr}',
        overwrite_output_dir=True,
        manual_seed=123,
        use_early_stopping=True,
        evaluate_during_training=True,
        evaluate_during_training_verbose=True,
        save_model_every_epoch=False,
        save_steps=-1,
        no_save=True,
    )

    # Fine-tune on the small training split, scoring on the dev split while training
    model = ClassificationModel("bert", model_name, args=model_args, use_cuda=True)
    model.train_model(train_df, eval_df=dev_df)

    # Score the resulting model on the dev split
    result, model_outputs, wrong_predictions = model.eval_model(dev_df)


In [None]:
# Pick the best performing configuration for the final training (highest MCC score):
# collect the per-evaluation scores of every tuning run and plot MCC against epoch.
score_frames = []
for lr in [2e-5, 3e-5, 4e-5]:
    # Must be the same folder as the `output_dir` used by the tuning cell, i.e. it has
    # to include the domain; otherwise the scores file is not found.
    scores_path = f'{save_dir}/temp_{model_name}_{domain}_model_{lr}/training_progress_scores.csv'
    partial_result = pd.read_csv(scores_path)
    # Drop the row logged at global step 2000.
    # NOTE(review): presumably this is the extra mid-epoch evaluation triggered by
    # simpletransformers' default `evaluate_during_training_steps` (2000), removed so
    # that every remaining row corresponds to one epoch -- confirm.
    # `.copy()` makes the filtered frame independent, so the column assignments below
    # do not raise a SettingWithCopyWarning.
    partial_result = partial_result[partial_result.global_step != 2000].copy()
    partial_result['Learning Rate'] = str(lr)
    # 0-based position of each remaining evaluation row within its run
    partial_result['Epochs'] = np.arange(len(partial_result))
    score_frames.append(partial_result)

# Concatenate once (instead of growing a frame inside the loop); the old per-run row
# index is kept as an `index` column, as before.
full_results = pd.concat(score_frames).reset_index()

sns.set_theme(style="darkgrid")
# Plot the MCC score per epoch for the different learning rates
sns.relplot(x="Epochs", y="mcc", data=full_results, hue='Learning Rate',
            ci=None, marker='o', palette='bright', kind="line"
            )

## Final model training and storing

The final section of the code fine-tunes the model with the full training data and stores the resulting model on the drive.\
It also shows the classification report and confusion matrix using the final test data.

In [None]:
# Final fine-tuning run. Fill in the number of epochs and the learning rate of the
# highest performing configuration found above.
model_args = dict(
    max_seq_length=64,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=2,  # <------------- SET THIS CORRECTLY
    learning_rate=4e-5,  # <------------- SET THIS CORRECTLY
    adam_epsilon=1e-8,
    output_dir=f'{save_dir}/final_{model_name}_{domain}_model',
    overwrite_output_dir=True,
    manual_seed=123,
    save_model_every_epoch=False,
    save_steps=-1,
)

# Fine-tune on the complete training set
model = ClassificationModel("bert", model_name, args=model_args, use_cuda=True)
model.train_model(full_train_df)

# Score the trained model on the test set
result, model_outputs, wrong_predictions = model.eval_model(test_df)

# Predict a label for every test text and print the per-class report
predictions, raw_outputs = model.predict(test_df['text'].tolist())
print(classification_report(test_df['labels'], predictions))

In [None]:
# Confusion matrix on the test set: rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=test_df['labels'], y_pred=predictions)