## Preparation

Install and import all necessary packages, then mount Google Drive.

In [None]:
!pip install datasets
!pip install transformers[torch]
# Data handling
import pandas as pd
import numpy as np
# Hugging Face Transformers: tokenizer, model classes and training utilities
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoConfig
import torch
# NOTE: `Dataset` is imported again from `datasets` further down; that later
# import rebinds the name, so after this cell `Dataset` is the `datasets` class.
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import EvalPrediction
# Duplicate of the pandas import above (harmless)
import pandas as pd
from scipy import stats
from statistics import mean
# Duplicate of the numpy import above (harmless)
import numpy as np
from datasets import DatasetDict, Dataset, Features, ClassLabel, Value
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
import json

# Mount Google Drive at /content/drive (Colab only); the data and model paths
# used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:

Define functions.

In [None]:
def label_maker(x):
  """Collapse a fine-grained label into the three-class tagset.

  Labels containing 'ind' become 'eval_individual'; 'generic_val', 'social'
  and 'aesthetic' become 'eval_generic'; any other label is returned as is.
  """
  generic_labels = ('generic_val', 'social', 'aesthetic')
  if 'ind' in x:
    return 'eval_individual'
  if x in generic_labels:
    return 'eval_generic'
  return x

def label_maker_binary(x):
  """Collapse a label into the binary tagset.

  'no_val' is returned unchanged; every other label becomes 'val'.
  """
  return x if x == 'no_val' else 'val'

def tokenize_function(example):
    """Tokenize the "sentence" field of `example` with the notebook-level `tokenizer`.

    Sequences are truncated and padded to a fixed length of 256 tokens.
    """
    encoded = tokenizer(
        example["sentence"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoded

def compute_metrics(pred):
    """Compute macro- and weighted-averaged F1 for a set of predictions.

    `pred` must expose `label_ids` (gold labels) and `predictions` (per-class
    scores); the predicted class is the argmax over the last axis.

    Returns a dict with the keys 'mf1' (macro F1) and 'wf1' (weighted F1).
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)
    # F1 scores are computed with sklearn's f1_score
    scores = {}
    for key, averaging in (('mf1', 'macro'), ('wf1', 'weighted')):
        scores[key] = f1_score(gold, guessed, average=averaging)
    return scores

def predict_text_class(input_text, labels, model, tokenizer):
    """Predict the class label of a single text.

    Args:
        input_text: the text to classify. Exactly one example is expected,
            because the predicted index is extracted with `.item()`.
        labels: sequence mapping a class index to its label name.
        model: classification model; called with the encoded inputs as keyword
            arguments, its output must expose `.logits`.
        tokenizer: callable returning a mapping of tensors when invoked with
            `return_tensors="pt"`.

    Returns:
        The entry of `labels` at the index of the highest logit.

    Note:
        `torch.no_grad()` only disables gradient tracking. Put the model in
        evaluation mode (`model.eval()`) beforehand so dropout is inactive.
    """
    encoded = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")

    # The tokenizer output is created on the CPU. Move it to the device the model
    # lives on so that inference also works while the model is on the GPU.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = None  # no parameters to inspect: leave the inputs where they are
    if device is not None:
        encoded = {
            name: (value.to(device) if hasattr(value, "to") else value)
            for name, value in encoded.items()
        }

    # Perform inference
    with torch.no_grad():
        output = model(**encoded)

    predicted_index = output.logits.argmax(dim=1).item()

    return labels[predicted_index]

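Simplify the tagset.  
Two strategies are available (set `my_strategy` in the cell below):
- `"binary"`: every label other than `no_val` becomes `val`
- `"three classes"`: labels are collapsed into `eval_individual` and `eval_generic`; all other labels are kept unchanged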

In [None]:
# selecting simplification strategy: "binary" or "three classes"
my_strategy = "binary"

# Load the curated sentences. All columns are kept (not only sentence/label)
# because the review-selection cell below also reads `rev_id` and `book_title`.
df = pd.read_excel('/content/drive/MyDrive/CHR2024/Curation/curation.xlsx')
missing_columns = {'sentence', 'label'} - set(df.columns)
if missing_columns:
  raise ValueError(f"curation.xlsx lacks required columns: {sorted(missing_columns)}")

# Map the raw labels onto the chosen tagset and fix the order of the label names.
# The position of a name in `labels` is the class index used for training and
# prediction, so it must be stable: list(set(...)) is not, because Python
# randomizes string hashing between sessions. The orders below match the ones
# hardcoded in the evaluation cell at the end of this notebook.
if my_strategy == "binary":
  df['label'] = df['label'].map(label_maker_binary)
  labels = ["val", "no_val"]
elif my_strategy in ("three classes", "three_classes"):
  # both spellings occur in this notebook, so accept either
  df['label'] = df['label'].map(label_maker)
  labels = ["no_val", "eval_generic", "eval_individual"]
else:
  raise ValueError(f"Unknown my_strategy: {my_strategy!r} (expected 'binary' or 'three classes')")

# Any label that is not part of the expected tagset is appended in a
# deterministic order, so every label present in the data keeps an index.
labels = labels + sorted(set(df['label']) - set(labels), key=str)

In [None]:
# Add ID to df for review selection
df['book_title'] = df['book_title'].fillna('')  # Replace NaN with empty string
df["ID"] = df["rev_id"].astype(str) + df["book_title"]

# Read the IDs of the reviews selected for the held-out test set (one ID per line)
# NOTE(review): unlike the other Drive paths in this notebook, this one has no
# "CHR2024" folder - confirm it is the intended location.
file_path = "/content/drive/MyDrive/Curation/selected_reviews_for_GPT.txt"
with open(file_path, "r") as file:
    lines = file.readlines()

random_elements = [line.strip() for line in lines]

# Rows whose ID was selected form the test set, all remaining rows the training set.
# The boolean mask is computed once and inverted with `~`.
is_test_row = df['ID'].isin(random_elements)
test = df[is_test_row][["sentence", "label"]]
train_set = df[~is_test_row][["sentence", "label"]]

# Fail early with a clear message instead of a ZeroDivisionError in the stats below
if len(test) == 0 or len(train_set) == 0:
    raise ValueError(
        f"Empty split (test={len(test)}, train={len(train_set)}): "
        "check that the selected IDs match the IDs built from rev_id + book_title"
    )

# show stats
print(f'Proportion test/train = {len(test)/len(train_set)}')
for my_set in [train_set, test]:
  print(len(my_set))
  set_labels = my_set["label"].tolist()
  for label in labels:
    count = set_labels.count(label)
    print(f'label = {label}, count = {count}, prop = {count/len(my_set)}')


Proportion test/train = 0.250051964248597
4811
label = val, count = 1397, prop = 0.290376221159842
label = no_val, count = 3414, prop = 0.7096237788401579
1203
label = val, count = 349, prop = 0.29010806317539484
label = no_val, count = 854, prop = 0.7098919368246052


## Train model

In [None]:
# Fine-tune the pretrained model on the training split, then evaluate it on the
# held-out test set.

#option 0: google-bert/bert-base-uncased
#option 1: LiYuan/amazon-review-sentiment-analysis
#option 2: JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books

checkpoint = "google-bert/bert-base-uncased"

# Define batch size
batch_size = 12

# Checkpoints are written to a strategy-specific folder on Drive, one per epoch.
# NOTE(review): newer Transformers releases rename `evaluation_strategy` to
# `eval_strategy` - adjust if the installed version rejects this argument.
training_args = TrainingArguments("/content/drive/MyDrive/CHR2024/Transformer_models/"+my_strategy,
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    overwrite_output_dir=True,
    save_strategy="epoch",
    metric_for_best_model='wf1',
    weight_decay=0.01,
    load_best_model_at_end=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

#split dataset into train and validation sets
train_ratio = 0.9
validation_ratio = 0.1

train, val, y_train, y_val = train_test_split(train_set, train_set['label'], test_size=1-train_ratio, random_state=42)

#create datasets; both splits share the same schema, and ClassLabel turns the
#label names into class indices following the order of `labels`
split_features = Features({"sentence": Value(dtype='string'), "label": ClassLabel(names=labels)})
dataset_train = Dataset.from_pandas(train, features=split_features, preserve_index=False)
dataset_val = Dataset.from_pandas(val, features=split_features, preserve_index=False)
dataset = DatasetDict([("train", dataset_train)])
dataset['val'] = dataset_val

# load model and tokenizer
# The classification head is newly initialized, so seed torch first to make runs repeatable.
torch.manual_seed(42)
# The head size follows the number of labels of the chosen strategy (it was
# hardcoded to 3 before, which is wrong for the binary tagset). The index/name
# mapping is stored in the model config so that it is saved with each checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label={idx: str(name) for idx, name in enumerate(labels)},
    label2id={str(name): idx for idx, name in enumerate(labels)},
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Not passed to the Trainer below: tokenize_function already pads every
# sequence to a fixed length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#move the model to 'cuda' to leverage GPU during the finetuning
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#tokenize the train and evaluation set
tokenized_train = dataset['train'].map(tokenize_function, batched=True)
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

tokenized_val = dataset['val'].map(tokenize_function, batched=True)
tokenized_val = tokenized_val.rename_column("label", "labels")
tokenized_val.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

model.to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

trainer.train()

# The tokenizer returns CPU tensors; keeping the model on the CPU guarantees that
# model and inputs share a device during prediction. eval() disables dropout.
model.to('cpu')
model.eval()

# make predictions on the held-out test set
true_labels = test["label"].tolist()
predicted_labels = [
    predict_text_class(sentence, labels=labels, model=model, tokenizer=tokenizer)
    for sentence in test["sentence"]
]

# print report
report = classification_report(true_labels,predicted_labels,digits=3)
print(report)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.340426,0.828054,0.855334
2,0.414700,0.344105,0.844947,0.869677


              precision    recall  f1-score   support

      no_val      0.898     0.937     0.917       854
         val      0.827     0.739     0.781       349

    accuracy                          0.879      1203
   macro avg      0.862     0.838     0.849      1203
weighted avg      0.877     0.879     0.877      1203



## Load and test trained model

Load the saved model.

In [None]:
# The tokenizer comes from the original base checkpoint on the Hub
checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The fine-tuned weights are read from the strategy-specific output folder on Drive
checkpoint_path = "/content/drive/MyDrive/CHR2024/Transformer_models/"+my_strategy+"/checkpoint-722"
model_new = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# Switch to evaluation mode; as the last expression of the cell, the returned
# module is also displayed as the cell output
model_new.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Test it on the selected test dataset.

In [None]:
# test the model

# Default order of the label names for each strategy (position = class index).
# Both spellings of the three-class strategy occur in this notebook, so accept either.
if my_strategy == "binary":
  labels = ["val", "no_val"]

if my_strategy in ("three_classes", "three classes"):
  labels = ["no_val", "eval_generic", "eval_individual"]

# If the loaded checkpoint stores an index -> name mapping that consists of
# exactly these label names, trust its order: it is the order the model was
# trained with. Otherwise (e.g. only placeholder names were saved) keep the
# default order above.
config_id2label = getattr(model_new.config, "id2label", None) or {}
config_labels = [config_id2label[idx] for idx in sorted(config_id2label)]
if len(config_labels) == len(labels) and set(config_labels) == set(labels):
  labels = config_labels

true_labels = test["label"].tolist()
predicted_labels = [
  predict_text_class(sentence, labels=labels, model=model_new, tokenizer=tokenizer)
  for sentence in test["sentence"]
]

# print report
report = classification_report(true_labels,predicted_labels,digits=3)
print(report)

              precision    recall  f1-score   support

      no_val      0.895     0.933     0.913       854
         val      0.817     0.731     0.772       349

    accuracy                          0.874      1203
   macro avg      0.856     0.832     0.843      1203
weighted avg      0.872     0.874     0.872      1203

