## Preparation

Install and import all the necessary packages, then mount Google Drive.

In [None]:
!pip install datasets
!pip install transformers[torch]
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoConfig
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import EvalPrediction
import pandas as pd
from scipy import stats
from statistics import mean
import numpy as np
from datasets import DatasetDict, Dataset, Features, ClassLabel, Value
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
import json

# Mount Google Drive at /content/drive (Colab only); the dataset and the result
# files used by the cells below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

Define functions.

In [None]:
def label_maker(x):
    """Collapse a fine-grained label into the three-class tagset.

    Any label containing the substring 'ind' becomes 'eval_individual';
    'generic_val', 'social' and 'aesthetic' become 'eval_generic';
    every other label is returned unchanged.
    """
    if 'ind' in x:
        return 'eval_individual'
    if x in ('generic_val', 'social', 'aesthetic'):
        return 'eval_generic'
    return x

def label_maker_binary(x):
    """Collapse a label into the binary tagset.

    'no_val' is returned as is; anything else is mapped to 'val'.
    """
    return 'val' if x != 'no_val' else x

def tokenize_function(example):
    """Tokenize the "sentence" field of *example* with the global ``tokenizer``.

    The tokenizer is called with truncation enabled, ``max_length=256`` and
    ``padding="max_length"``; its result is returned unchanged.
    """
    encoding_options = {"max_length": 256, "truncation": True, "padding": "max_length"}
    return tokenizer(example["sentence"], **encoding_options)

def compute_metrics(pred):
    """Compute macro- and weighted-averaged F1 for a batch of predictions.

    Args:
        pred: object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores); presumably a transformers
            ``EvalPrediction`` handed over by the ``Trainer``.

    Returns:
        dict with the keys 'mf1' (macro F1) and 'wf1' (weighted F1).
    """
    gold_ids = pred.label_ids
    # the predicted class is the arg-max over the last axis of the scores
    predicted_ids = pred.predictions.argmax(-1)
    return {
        'mf1': f1_score(gold_ids, predicted_ids, average='macro'),
        'wf1': f1_score(gold_ids, predicted_ids, average='weighted'),
    }

def predict_text_class(input_text, labels, model, tokenizer):
    """Predict the class label of a single text.

    Args:
        input_text: text to classify. It must encode to a one-row batch,
            because the prediction is extracted with ``.item()``.
        labels: sequence mapping a class index to its label name.
        model: callable accepting the encoded tensors as keyword arguments and
            returning an object with a ``logits`` tensor of shape
            (batch, classes). If it exposes a ``device`` attribute, the inputs
            are moved to that device first.
        tokenizer: callable invoked with ``truncation=True``, ``padding=True``
            and ``return_tensors="pt"``; must return a mapping of tensors.

    Returns:
        The entry of *labels* at the arg-max of the logits.

    NOTE(review): the model's train/eval mode is left untouched; callers are
    expected to put the model in eval mode themselves.
    """
    encoded = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")

    # The inputs have to live on the same device as the model, otherwise the
    # forward pass fails for a model that sits on a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        output = model(**encoded)

    predicted_index = output.logits.argmax(dim=1).item()
    return labels[predicted_index]

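Simplify the tagset.  
Two strategies are available:
- binary: every label other than `no_val` becomes `val`
- three classes: labels containing `ind` become `eval_individual`; `generic_val`, `social` and `aesthetic` become `eval_generic`; all other labels are kept unchanged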

In [None]:
# Tagset simplification strategy: "binary" or "three classes"
my_strategy = "three classes"

# Load the curated annotations, keeping only the sentence and label columns
df = pd.read_excel('/content/drive/MyDrive/CHR2024/Curation/curation.xlsx')[['sentence','label']]

# Pick the relabelling function for the chosen strategy; the labels are left
# untouched when the strategy name matches neither option
simplifier = {"binary": label_maker_binary, "three classes": label_maker}.get(my_strategy)
if simplifier is not None:
  df['label'] = df['label'].map(simplifier)

Show how often each label occurs.

In [None]:
# Sort the distinct labels so that their order (and therefore any class-id
# mapping derived from this list) is the same in every session; the iteration
# order of a plain set of strings is not guaranteed to be stable across runs.
labels = sorted(set(df['label']))

# Count every label in a single pass instead of rescanning the column per label
label_counts = df['label'].value_counts()
for label in labels:
  print(f'label = {label}, count = {label_counts[label]}')

label = eval_generic, count = 1302
label = no_val, count = 4268
label = eval_individual, count = 444


## Train model (with learning curve)

In [None]:
# Learning curve: for every size in data_splits the model is finetuned and
# evaluated with k-fold cross-validation on the first `size` rows of the
# shuffled dataset. Per-fold reports and mean scores are appended to a text
# file, and the per-fold scores to a JSON-lines file.

# here you define from which point to start the learning curve
start_point = 10 # here we perform just the last iteration

# prepare data splits: multiples of 600 rows, up to 10 * 600 = 6000
data_splits = [x*600 for x in range(start_point,11)]

# pretrained model to finetune

#option 0: google-bert/bert-base-uncased
#option 1: LiYuan/amazon-review-sentiment-analysis
#option 2: JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books

checkpoint = "google-bert/bert-base-uncased"

batch_size = 12

training_args = TrainingArguments("/content",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    overwrite_output_dir=True,
    save_strategy="epoch",
    metric_for_best_model='wf1',
    weight_decay=0.01,
    load_best_model_at_end=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

# Output files; both are opened in append mode so earlier results are kept
results_txt_path = '/content/drive/MyDrive/CHR2024/model_results_learning_curve.txt'
results_json_path = '/content/drive/MyDrive/CHR2024/model_results_learning_curve.json'

# Number of cross-validation folds
k = 5

# Share of each fold's training portion used for training vs. validation
train_ratio = 0.9
validation_ratio = 0.1

# Schema shared by the train and validation datasets; the class ids follow the
# order of `labels`
label_features = Features(
    {"sentence": Value(dtype='string'), "label": ClassLabel(names=labels)})

# The tokenizer does not change between folds, so it is loaded only once
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _tokenize_split(split):
    """Tokenize a dataset split and expose it as torch tensors for the Trainer."""
    tokenized = split.map(tokenize_function, batched=True)
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return tokenized


# randomize dataset; the fixed seed makes every session work on the same
# shuffle, so learning-curve points produced in separate runs (via start_point)
# are computed on nested subsets of one and the same ordering
df_random = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Start writing output file with simplification strategy
with open(results_txt_path, 'a', encoding='utf-8') as f:
    f.write('# Approach '+str(my_strategy)+"\n\n")

for my_split in data_splits:

    print("\n\n########### SPLIT\n########### SPLIT\n", my_split, "########### SPLIT\n########### SPLIT\n\n")

    # keep only the first my_split rows of the shuffled dataset
    df_split = df_random[:my_split]

    # Initialize KFold with the number of splits
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Per-fold scores for this split
    accuracy = []
    macro_f1 = []
    weighted_f1 = []

    # Start writing output file with name of split
    with open(results_txt_path, 'a', encoding='utf-8') as f:
        f.write('## Split '+str(my_split)+"\n\n")

    # Iterate through the folds
    for iteration, (train_index, test_index) in enumerate(kf.split(df_split)):

        print("\n\n###########\n###########\n", iteration, "###########\n###########\n\n")

        # train_index and test_index contain the row positions of the train and test sets of this fold
        train_set = df_split.iloc[train_index]
        test = df_split.iloc[test_index]

        # split the training portion into train and validation sets
        train, val, y_train, y_val = train_test_split(train_set, train_set['label'], test_size=1-train_ratio, random_state=42)

        # create datasets
        dataset_train = Dataset.from_pandas(train, features=label_features, preserve_index=False)
        dataset_val = Dataset.from_pandas(val, features=label_features, preserve_index=False)
        dataset = DatasetDict({"train": dataset_train, "val": dataset_val})

        # Load a fresh model for every fold. The size of the classification
        # head follows the tagset, so the "binary" strategy gets 2 outputs
        # instead of a hard-coded 3.
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels), ignore_mismatched_sizes=True)
        # move the model to 'cuda' (when available) to leverage the GPU during the finetuning
        model.to(device)

        # tokenize the train and validation sets
        tokenized_train = _tokenize_split(dataset['train'])
        tokenized_val = _tokenize_split(dataset['val'])

        # Finetuning
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Run the predictions on the CPU: predict_text_class is called with
        # freshly tokenized inputs, which are presumably CPU tensors, so the
        # model is moved to the same device before it is called.
        model.to('cpu')
        # make sure dropout is disabled while predicting
        model.eval()

        # make predictions (column 'sentence' is the input, 'label' the gold tag)
        true_labels = test['label'].tolist()
        predicted_labels = [
            predict_text_class(sentence, labels=labels, model=model, tokenizer=tokenizer)
            for sentence in test['sentence']
        ]

        # print and save report
        report = classification_report(true_labels,predicted_labels,digits=3)
        print(report)

        with open(results_txt_path, 'a', encoding='utf-8') as f:
            f.write('### Fold '+str(iteration)+'\n')
            f.write(report)
            f.write('\n\n')

        # Get the general stats straight from the predictions rather than by
        # parsing the formatted report text: this does not depend on the
        # report layout and keeps full precision (the report is rounded to
        # three decimals).
        accuracy.append(float(accuracy_score(true_labels, predicted_labels)))
        macro_f1.append(float(f1_score(true_labels, predicted_labels, average='macro')))
        weighted_f1.append(float(f1_score(true_labels, predicted_labels, average='weighted')))

    with open(results_txt_path, 'a', encoding='utf-8') as f:
        f.write('### Mean scores\n\n')
        f.write('Accuracy: '+str(round(sum(accuracy) / len(accuracy), 3))+'\n')
        f.write('F1-macro: '+str(round(sum(macro_f1) / len(macro_f1), 3))+'\n')
        f.write('F1-weighted: '+str(round(sum(weighted_f1) / len(weighted_f1), 3))+'\n\n\n')

    full_report = {'approach': my_strategy, 'split': my_split, 'accuracy': accuracy, 'F1-macro': macro_f1, 'F1-weighted': weighted_f1}

    # Append the scores of this split as one JSON object per line
    with open(results_json_path, 'a', encoding='utf-8') as file:
        file.write(json.dumps(full_report) + '\n')




########### SPLIT
########### SPLIT
 6000 ########### SPLIT
########### SPLIT




###########
###########
 0 ###########
###########




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.474826,0.542913,0.811012
2,0.571200,0.440207,0.574257,0.825965


                 precision    recall  f1-score   support

   eval_generic      0.601     0.827     0.696       248
eval_individual      1.000     0.013     0.026        76
         no_val      0.917     0.898     0.908       876

       accuracy                          0.828      1200
      macro avg      0.839     0.579     0.543      1200
   weighted avg      0.857     0.828     0.808      1200



###########
###########
 1 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.412928,0.516592,0.801978
2,0.569800,0.424698,0.518175,0.802883


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

   eval_generic      0.593     0.770     0.670       252
eval_individual      0.000     0.000     0.000        93
         no_val      0.899     0.918     0.909       855

       accuracy                          0.816      1200
      macro avg      0.497     0.563     0.526      1200
   weighted avg      0.765     0.816     0.788      1200



###########
###########
 2 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.551219,0.510499,0.756453
2,0.559800,0.557448,0.52051,0.767486


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

   eval_generic      0.629     0.808     0.708       292
eval_individual      0.000     0.000     0.000        81
         no_val      0.896     0.894     0.895       827

       accuracy                          0.812      1200
      macro avg      0.508     0.567     0.534      1200
   weighted avg      0.770     0.812     0.789      1200



###########
###########
 3 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.527758,0.507178,0.770465
2,0.574200,0.502756,0.519839,0.781376


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

   eval_generic      0.590     0.763     0.665       232
eval_individual      0.000     0.000     0.000        98
         no_val      0.893     0.924     0.908       870

       accuracy                          0.818      1200
      macro avg      0.494     0.562     0.525      1200
   weighted avg      0.762     0.818     0.787      1200



###########
###########
 4 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.423347,0.560539,0.836667
2,0.566000,0.411658,0.557279,0.831298


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

   eval_generic      0.599     0.757     0.669       272
eval_individual      0.000     0.000     0.000        94
         no_val      0.879     0.902     0.890       834

       accuracy                          0.798      1200
      macro avg      0.492     0.553     0.520      1200
   weighted avg      0.746     0.798     0.770      1200

