## Preparation

Install and import all necessary packages.

In [None]:
!pip install datasets
!pip install transformers[torch]
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoConfig
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import EvalPrediction
import pandas as pd
from scipy import stats
from statistics import mean
import numpy as np
from datasets import DatasetDict, Dataset, Features, ClassLabel, Value
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
import json

# Mount Google Drive: the dataset and the result file used below are read from
# and written to paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any

Define functions.

In [None]:
def label_maker(x):
  """Collapse a fine-grained tag into the coarser three-class scheme.

  Any tag containing the substring 'ind' becomes 'eval_individual'; the tags
  'generic_val', 'social' and 'aesthetic' become 'eval_generic'; every other
  tag is returned unchanged.
  """
  generic_tags = ('generic_val', 'social', 'aesthetic')
  if 'ind' in x:
    return 'eval_individual'
  if x in generic_tags:
    return 'eval_generic'
  return x

def label_maker_binary(x):
  """Collapse a tag into the binary scheme.

  'no_val' is returned unchanged; any other value becomes 'val'.
  """
  return 'val' if x != 'no_val' else x

def tokenize_function(example):
    """Tokenize the "sentence" field of a dataset example (or batch of examples).

    Uses the module-level `tokenizer`, so that name must be bound before this
    function is called. Every sequence is truncated and padded to 256 tokens.
    """
    encoding_options = {"max_length": 256, "truncation": True, "padding": "max_length"}
    return tokenizer(example["sentence"], **encoding_options)

def compute_metrics(pred):
    """Compute macro- and weighted-averaged F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (gold class ids) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class id).

    Returns:
        dict with keys 'mf1' (macro F1) and 'wf1' (weighted F1).
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    # F1 under both averaging schemes, computed with sklearn's f1_score
    averaging_by_key = {'mf1': 'macro', 'wf1': 'weighted'}
    return {
        key: f1_score(gold, predicted, average=averaging)
        for key, averaging in averaging_by_key.items()
    }

def predict_text_class(input_text, labels, model, tokenizer):
    """Predict the class label of a single text.

    Args:
        input_text: text to classify. It must produce a batch of exactly one
            example, because the predicted id is read with `.item()`.
        labels: sequence of label names, indexed by the model's class id.
        model: torch module whose output exposes a `.logits` tensor.
        tokenizer: callable returning a mapping of PyTorch tensors when called
            with `return_tensors="pt"`.

    Returns:
        The entry of `labels` at the arg-max class id.
    """
    encoded = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")

    # The tokenizer builds its tensors on the CPU; move them to wherever the
    # model lives so inference also works when the model sits on a GPU.
    device = next(model.parameters()).device
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Perform inference in eval mode (dropout disabled), then restore the mode
    # the caller left the model in so this function has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**encoded)
    finally:
        model.train(was_training)

    predicted_labels = output.logits.argmax(dim=1)

    return labels[predicted_labels.item()]

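Simplify the tagset by merging the original labels into coarser classes.  
Two strategies are available (selected with `my_strategy` in the next cell):
- `binary`: `no_val` is kept and every other label becomes `val`
- `three classes`: labels containing `ind` become `eval_individual`; `generic_val`, `social` and `aesthetic` become `eval_generic`; all other labels are kept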

In [None]:
# Select the simplification strategy: "binary" or "three classes"
my_strategy = "binary"

# Load the curated dataset, keeping only the text and its label
df = pd.read_excel('/content/drive/MyDrive/CHR2024/Curation/curation.xlsx')[['sentence','label']]

# Collapse the original tagset according to the chosen strategy. An unknown
# (e.g. mistyped) strategy is rejected instead of silently leaving the labels
# unsimplified.
if my_strategy == "binary":
  df['label'] = df['label'].map(label_maker_binary)
elif my_strategy == "three classes":
  df['label'] = df['label'].map(label_maker)
else:
  raise ValueError(
      f'Unknown simplification strategy {my_strategy!r}; expected "binary" or "three classes"')

Show how many sentences carry each label.

In [None]:
# Sort the label names so the label -> class-id mapping is the same on every
# run; the iteration order of a set of strings can change between kernel
# restarts.
labels = sorted(set(df['label']))

# Count every label in a single pass over the column
label_counts = df['label'].value_counts()
for label in labels:
  print(f'label = {label}, count = {label_counts[label]}')

label = val, count = 1746
label = no_val, count = 4268


## Train model

In [None]:
# Fine-tune each pretrained checkpoint with k-fold cross-validation and append
# the per-fold classification reports and the mean scores to a results file.

#option 0: google-bert/bert-base-uncased
#option 1: LiYuan/amazon-review-sentiment-analysis
#option 2: JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books

all_models = ["google-bert/bert-base-uncased", "LiYuan/amazon-review-sentiment-analysis", "JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books"]

# Define the number of splits (k)
k = 5

# Define batch size
batch_size = 12

# Share of each fold's training portion kept for training; the remainder is
# held out as validation set for model selection
train_ratio = 0.9
validation_ratio = 0.1

# Every result is appended to this file
results_path = '/content/drive/MyDrive/CHR2024/model_results_cv.txt'

training_args = TrainingArguments("/content",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    overwrite_output_dir=True,
    save_strategy="epoch",
    metric_for_best_model='wf1',
    weight_decay=0.01,
    load_best_model_at_end=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

# Initialize KFold with the number of splits
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Schema shared by the train and validation datasets; ClassLabel turns the
# label strings into class ids following the order of `labels`
dataset_features = Features(
    {"sentence": Value(dtype='string'), "label": ClassLabel(names=labels)})

# Start writing output file with simplification strategy
with open(results_path, 'a') as f:
  f.write('# Approach '+my_strategy+"\n\n")

for checkpoint in all_models:

  print("\n\n########### MODEL\n########### MODEL\n", checkpoint, "########### MODEL\n########### MODEL\n\n")

  # Per-fold scores of the current model
  accuracy = []
  macro_f1 = []
  weighted_f1 = []

  # Start writing output file with name of model
  with open(results_path, 'a') as f:
    f.write('## Model '+checkpoint+"\n\n")

  # The tokenizer only depends on the checkpoint, so it is loaded once per
  # model instead of once per fold. tokenize_function reads this global name.
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  # Iterate through the splits
  for iteration, (train_index, test_index) in enumerate(kf.split(df)):

    print("\n\n###########\n###########\n", iteration, "###########\n###########\n\n")

    # train_index and test_index contain the indices for train and test sets for each split
    train_set = df.iloc[train_index]
    test = df.iloc[test_index]

    # split the training portion into train and validation sets
    train, val = train_test_split(train_set, test_size=1-train_ratio, random_state=42)

    #create datasets
    dataset = DatasetDict([
        ("train", Dataset.from_pandas(train, features=dataset_features, preserve_index=False)),
        ("val", Dataset.from_pandas(val, features=dataset_features, preserve_index=False)),
    ])

    # Load a fresh model for every fold. The classification head is sized to
    # the actual label set: a hard-coded 3 leaves an untrained extra class in
    # the binary setting, and predicting it would index past the end of
    # `labels`. Heads of a different size in the checkpoint are re-initialized
    # (ignore_mismatched_sizes); a head of the same size is reused.
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=len(labels), ignore_mismatched_sizes=True)
    #move the model to 'cuda' to leverage GPU during the finetuning
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    #tokenize the train and evaluation set
    tokenized_train = dataset['train'].map(tokenize_function, batched=True)
    tokenized_train = tokenized_train.rename_column("label", "labels")
    tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    tokenized_val = dataset['val'].map(tokenize_function, batched=True)
    tokenized_val = tokenized_val.rename_column("label", "labels")
    tokenized_val.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # Finetuning
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Run the test-set inference on the CPU: the tokenizer used by
    # predict_text_class returns CPU tensors, so keeping the model on the CPU
    # guarantees that model and inputs are on the same device.
    model.to('cpu')

    # make predictions
    true_labels = test['label'].tolist()
    predicted_labels = [
        predict_text_class(sentence, labels=labels, model=model, tokenizer=tokenizer)
        for sentence in test['sentence']
    ]

    # print and save report
    report = classification_report(true_labels,predicted_labels,digits=3)
    print(report)

    with open(results_path, 'a') as f:
      f.write('### Fold '+str(iteration)+'\n')
      f.write(report)
      f.write('\n\n')

    # Compute the fold scores directly rather than scraping them from the
    # printed report, which depends on its exact layout and only exposes
    # values already rounded to three digits.
    accuracy.append(float(accuracy_score(true_labels, predicted_labels)))
    macro_f1.append(float(f1_score(true_labels, predicted_labels, average='macro')))
    weighted_f1.append(float(f1_score(true_labels, predicted_labels, average='weighted')))

  # Mean of the fold scores for the current model
  with open(results_path, 'a') as f:
    f.write('### Mean scores\n\n')
    f.write('Accuracy: '+str(round(sum(accuracy) / len(accuracy), 3))+'\n')
    f.write('F1-macro: '+str(round(sum(macro_f1) / len(macro_f1), 3))+'\n')
    f.write('F1-weighted: '+str(round(sum(weighted_f1) / len(weighted_f1), 3))+'\n\n\n')




########### MODEL
########### MODEL
 google-bert/bert-base-uncased ########### MODEL
########### MODEL




###########
###########
 0 ###########
###########




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.405491,0.757335,0.808107
2,0.404400,0.357812,0.829561,0.859325


              precision    recall  f1-score   support

      no_val      0.863     0.924     0.893       819
         val      0.810     0.688     0.744       384

    accuracy                          0.849      1203
   macro avg      0.836     0.806     0.818      1203
weighted avg      0.846     0.849     0.845      1203



###########
###########
 1 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.374389,0.805712,0.833717
2,0.406100,0.385466,0.831649,0.857234


              precision    recall  f1-score   support

      no_val      0.888     0.908     0.898       849
         val      0.767     0.726     0.746       354

    accuracy                          0.855      1203
   macro avg      0.828     0.817     0.822      1203
weighted avg      0.853     0.855     0.853      1203



###########
###########
 2 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.343856,0.801775,0.835628
2,0.432700,0.315314,0.835943,0.861133


              precision    recall  f1-score   support

      no_val      0.902     0.924     0.913       860
         val      0.798     0.749     0.773       343

    accuracy                          0.874      1203
   macro avg      0.850     0.837     0.843      1203
weighted avg      0.873     0.874     0.873      1203



###########
###########
 3 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.388837,0.790348,0.832159
2,0.427200,0.39681,0.804842,0.847497


              precision    recall  f1-score   support

      no_val      0.922     0.900     0.911       861
         val      0.762     0.807     0.784       342

    accuracy                          0.874      1203
   macro avg      0.842     0.854     0.847      1203
weighted avg      0.876     0.874     0.875      1203



###########
###########
 4 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4330 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.355036,0.806581,0.852154
2,0.431300,0.378793,0.795877,0.845018


              precision    recall  f1-score   support

      no_val      0.910     0.902     0.906       879
         val      0.740     0.759     0.749       323

    accuracy                          0.864      1202
   macro avg      0.825     0.830     0.828      1202
weighted avg      0.865     0.864     0.864      1202



########### MODEL
########### MODEL
 LiYuan/amazon-review-sentiment-analysis ########### MODEL
########### MODEL




###########
###########
 0 ###########
###########






config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at LiYuan/amazon-review-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.415521,0.756621,0.806926
2,0.408600,0.382523,0.815101,0.847079


              precision    recall  f1-score   support

      no_val      0.869     0.908     0.888       819
         val      0.784     0.708     0.744       384

    accuracy                          0.845      1203
   macro avg      0.827     0.808     0.816      1203
weighted avg      0.842     0.845     0.842      1203



###########
###########
 1 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at LiYuan/amazon-review-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.390274,0.798663,0.830524
2,0.407900,0.412905,0.816127,0.843351


              precision    recall  f1-score   support

      no_val      0.879     0.900     0.889       849
         val      0.746     0.703     0.724       354

    accuracy                          0.842      1203
   macro avg      0.812     0.802     0.807      1203
weighted avg      0.840     0.842     0.841      1203



###########
###########
 2 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at LiYuan/amazon-review-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.334651,0.83574,0.86006
2,0.414000,0.342855,0.825933,0.851706


              precision    recall  f1-score   support

      no_val      0.876     0.906     0.891       860
         val      0.742     0.679     0.709       343

    accuracy                          0.841      1203
   macro avg      0.809     0.793     0.800      1203
weighted avg      0.838     0.841     0.839      1203



###########
###########
 3 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at LiYuan/amazon-review-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.371338,0.792112,0.836293
2,0.423700,0.426449,0.786013,0.831921


              precision    recall  f1-score   support

      no_val      0.917     0.891     0.904       861
         val      0.744     0.798     0.770       342

    accuracy                          0.865      1203
   macro avg      0.831     0.845     0.837      1203
weighted avg      0.868     0.865     0.866      1203



###########
###########
 4 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at LiYuan/amazon-review-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4330 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.38571,0.771659,0.827566
2,0.418500,0.440973,0.783633,0.837273


              precision    recall  f1-score   support

      no_val      0.903     0.904     0.904       879
         val      0.739     0.737     0.738       323

    accuracy                          0.859      1202
   macro avg      0.821     0.821     0.821      1202
weighted avg      0.859     0.859     0.859      1202



########### MODEL
########### MODEL
 JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books ########### MODEL
########### MODEL




###########
###########
 0 ###########
###########






config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.382281,0.781466,0.825898
2,0.418000,0.342603,0.838245,0.866084


              precision    recall  f1-score   support

      no_val      0.871     0.915     0.892       819
         val      0.796     0.711     0.751       384

    accuracy                          0.850      1203
   macro avg      0.833     0.813     0.822      1203
weighted avg      0.847     0.850     0.847      1203



###########
###########
 1 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.353725,0.82457,0.851094
2,0.412600,0.37205,0.828831,0.854979


              precision    recall  f1-score   support

      no_val      0.892     0.902     0.897       849
         val      0.759     0.737     0.748       354

    accuracy                          0.854      1203
   macro avg      0.825     0.820     0.822      1203
weighted avg      0.853     0.854     0.853      1203



###########
###########
 2 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.314105,0.833792,0.859182
2,0.420100,0.306232,0.830837,0.855883


              precision    recall  f1-score   support

      no_val      0.887     0.921     0.904       860
         val      0.781     0.706     0.741       343

    accuracy                          0.860      1203
   macro avg      0.834     0.813     0.822      1203
weighted avg      0.857     0.860     0.857      1203



###########
###########
 3 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.416326,0.780957,0.823096
2,0.415700,0.43392,0.791462,0.838301


              precision    recall  f1-score   support

      no_val      0.914     0.905     0.910       861
         val      0.766     0.787     0.776       342

    accuracy                          0.871      1203
   macro avg      0.840     0.846     0.843      1203
weighted avg      0.872     0.871     0.872      1203



###########
###########
 4 ###########
###########




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at JoelVIU/bert-base-uncased-finetuned-amazon_reviews_books and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4330 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mf1,Wf1
1,No log,0.390393,0.768463,0.816376
2,0.424500,0.418757,0.799036,0.846181


              precision    recall  f1-score   support

      no_val      0.914     0.896     0.905       879
         val      0.732     0.771     0.751       323

    accuracy                          0.863      1202
   macro avg      0.823     0.834     0.828      1202
weighted avg      0.865     0.863     0.864      1202

