## 1. Preparation

**Before you start this notebook,** you need to:
- Create a shortcut to the "CMCLS" folder in your Google Drive (so that the notebook can access the curated annotations)
- Verify that the notebook uses a GPU (in the menu, go to "Runtime" -> "Change runtime type" and select any available hardware accelerator with a GPU)
- Create an account at https://wandb.ai/site ("Signup" -> "Signup with Google", and select the "Academic/University" account type; you can immediately downgrade to a Free account); then log in and keep the window open

Once everything is ready, you can start preparing the machine learning procedure.  
**First step:** Install all libraries  
*(Note that you will have to authorize connecting the Notebook to your Google Drive)*

In [None]:
!pip install datasets
!pip install transformers[torch]
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoConfig
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import EvalPrediction
import pandas as pd
from scipy import stats
from statistics import mean
import numpy as np
from datasets import DatasetDict, Dataset, Features, ClassLabel, Value
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
import json
import os

from google.colab import drive
# Mount Google Drive under /content/drive; the dataset is later read from
# /content/drive/MyDrive/CMCLS/, so this must succeed before loading the data.
drive.mount('/content/drive')

**Second step:** Define functions

In [None]:
def tokenize_function(example):
    """Tokenize the "sentence" field of a dataset example (or batch).

    Uses the module-level `tokenizer`, which must be defined before this
    function is called. Every sequence is truncated and padded to exactly
    256 tokens.
    """
    sentences = example["sentence"]
    return tokenizer(
        sentences,
        truncation=True,
        max_length=256,
        padding="max_length",
    )

def compute_metrics(pred):
    """Return macro- and weighted-average F1 for a batch of predictions.

    `pred` must expose `label_ids` (the gold class ids) and `predictions`
    (per-class scores; the arg-max over the last axis is taken as the
    predicted class id).

    Returns a dict with the keys 'mf1' (macro F1) and 'wf1' (weighted F1).
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)
    # F1 scores come from sklearn's f1_score, one per averaging mode
    return {
        key: f1_score(gold, guessed, average=averaging)
        for key, averaging in (('mf1', 'macro'), ('wf1', 'weighted'))
    }

def predict_text_class(input_text, labels, model, tokenizer):
    """Classify a single text and return the name of the predicted label.

    Args:
        input_text: the text to classify (must tokenize to a one-example batch).
        labels: sequence of label names, indexed by class id.
        model: a torch module whose output exposes `.logits`
            (one row of per-class scores per example).
        tokenizer: callable that turns `input_text` into a mapping of
            PyTorch tensors accepted by `model` as keyword arguments.

    Returns:
        The entry of `labels` at the index of the highest logit.

    The model may live on any device: the tokenized inputs are moved to it.
    The model is switched to eval mode for the forward pass (so dropout is
    disabled) and its previous train/eval mode is restored afterwards.
    """
    encoded = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")

    # The tokenizer builds its tensors on the CPU; a model on the GPU would
    # otherwise fail with a device mismatch.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        encoded = {
            name: tensor.to(first_param.device) for name, tensor in encoded.items()
        }

    # Perform inference deterministically, without tracking gradients
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**encoded)
    finally:
        model.train(was_training)

    predicted_id = output.logits.argmax(dim=1)

    return labels[predicted_id.item()]

**Third step:** Read the dataset and show it

In [None]:
# read the dataset
df = pd.read_excel("/content/drive/MyDrive/CMCLS/curation.xlsx")

# list labels
# (sorted, so that every label gets the same class id in every session;
# the iteration order of a plain set can differ from one run to the next)
labels = sorted(df['curation'].dropna().unique())
# count every label in a single pass over the column
label_counts = df['curation'].value_counts()
for label in labels:
  print(f'curation = {label}, count = {label_counts[label]}')

# show data
# keep only the two columns used for training and rename 'curation' to the
# 'label' column name used by the training cells
df = df[['sentence', 'curation']].rename(columns={'curation': 'label'})
df['sentence'] = df['sentence'].astype('string')
df

## 2. Training and testing

### 2.1 Train base model

Let's start by training a basic model, [*google-bert/bert-base-uncased*](https://huggingface.co/google-bert/bert-base-uncased)

In [None]:
# choose the pretrained model
checkpoint = "google-bert/bert-base-uncased"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever name the installed version accepts
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# define all training arguments
training_args = TrainingArguments(
    "/content",
    learning_rate=1e-5,
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    overwrite_output_dir=True,
    save_strategy="epoch",
    metric_for_best_model='wf1',
    weight_decay=0.01,
    load_best_model_at_end=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
    # evaluate on the validation set at the end of every epoch
    **{eval_strategy_key: "epoch"},
)

# split dataset into train and test
# we select "Group A" (the first 98 rows) as test
train_set = df[98:]
test = df[:98]

# split train set into train and validation sets
train, val, y_train, y_val = train_test_split(train_set, train_set['label'], test_size=0.1, random_state=42)

# create datasets (the same schema is shared by both splits)
features = Features({"sentence": Value(dtype='string'), "label": ClassLabel(names=labels)})
dataset_train = Dataset.from_pandas(train, features=features, preserve_index=False)
dataset_val = Dataset.from_pandas(val, features=features, preserve_index=False)
dataset = DatasetDict({"train": dataset_train, "val": dataset_val})

# load model and tokenizer
# the size of the classification head follows the number of labels in the data;
# ignore_mismatched_sizes allows a checkpoint head of a different size to be replaced
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels), ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# use 'cuda' when available to leverage the GPU during the finetuning
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# tokenize the train and evaluation sets, and expose only the tensors the model needs
tokenized = dataset.map(tokenize_function, batched=True)
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_train = tokenized['train']
tokenized_val = tokenized['val']

# send the model to the device (cuda)
model.to(device)

# define the trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# train the model
trainer.train()

# run the per-sentence predictions on the cpu: the tokenizer creates its tensors
# on the cpu, and the model must sit on the same device as its inputs
model.to('cpu')
# make sure dropout is disabled while predicting
model.eval()

# make predictions
true_labels = test['label'].tolist()
predicted_labels = [
    predict_text_class(sentence, labels=labels, model=model, tokenizer=tokenizer)
    for sentence in test['sentence']
]

# print report
report = classification_report(true_labels, predicted_labels, digits=3)
print(report)

### 2.2 Finetune pretrained model

Then we can finetune a model already trained for sentiment analysis:  
[*cardiffnlp/twitter-roberta-base-sentiment-latest*](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest)

In [None]:
# choose the pretrained model
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever name the installed version accepts
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# define all training arguments
training_args = TrainingArguments(
    "/content",
    learning_rate=1e-5,
    num_train_epochs=3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    overwrite_output_dir=True,
    save_strategy="epoch",
    metric_for_best_model='wf1',
    weight_decay=0.01,
    load_best_model_at_end=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
    # evaluate on the validation set at the end of every epoch
    **{eval_strategy_key: "epoch"},
)

# split dataset into train and test
# we select "Group A" (the first 98 rows) as test
train_set = df[98:]
test = df[:98]

# split train set into train and validation sets
train, val, y_train, y_val = train_test_split(train_set, train_set['label'], test_size=0.1, random_state=42)

# create datasets (the same schema is shared by both splits)
features = Features({"sentence": Value(dtype='string'), "label": ClassLabel(names=labels)})
dataset_train = Dataset.from_pandas(train, features=features, preserve_index=False)
dataset_val = Dataset.from_pandas(val, features=features, preserve_index=False)
dataset = DatasetDict({"train": dataset_train, "val": dataset_val})

# load model and tokenizer
# the size of the classification head follows the number of labels in the data;
# ignore_mismatched_sizes allows a checkpoint head of a different size to be replaced
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels), ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# use 'cuda' when available to leverage the GPU during the finetuning
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# tokenize the train and evaluation sets, and expose only the tensors the model needs
tokenized = dataset.map(tokenize_function, batched=True)
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_train = tokenized['train']
tokenized_val = tokenized['val']

# send the model to the device (cuda)
model.to(device)

# define the trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# train the model
trainer.train()

# run the per-sentence predictions on the cpu: the tokenizer creates its tensors
# on the cpu, and the model must sit on the same device as its inputs
model.to('cpu')
# make sure dropout is disabled while predicting
model.eval()

# make predictions
true_labels = test['label'].tolist()
predicted_labels = [
    predict_text_class(sentence, labels=labels, model=model, tokenizer=tokenizer)
    for sentence in test['sentence']
]

# print report
report = classification_report(true_labels, predicted_labels, digits=3)
print(report)

### 2.3 Apply the model

Finally, we can try the model on a new sentence.

In [None]:
# free-text example to classify with the model trained in the last executed cell
sentence = "I liked this book"

# kept as the final expression of the cell so the notebook displays the predicted label
predict_text_class(
    input_text=sentence,
    labels=labels,
    model=model,
    tokenizer=tokenizer,
)