# Imports

In [1]:
!pip install safetensors --quiet
!pip install wandb --quiet 
!pip install optuna --quiet

[0m

In [2]:
!pip install pandas --quiet
!pip install matplotlib --quiet
!pip install transformers --quiet
!pip install scikit-learn --quiet
!pip install pyarrow --quiet
!pip install transformers[torch] --quiet
!pip install accelerate --quiet

[0m

In [1]:
import wandb
wandb.login()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshynkarov-pn[0m ([33mshynkarov-pn-ukrainian-catholic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
wandb.init(
    project="ukrainian-sentiment",  # Name your project
    name="roberta-ukrainian-sentiment",  # Optional run name
    tags=["roberta", "ukrainian", "sentiment"],  # Optional tags for filtering
)


In [3]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint
import optuna

from transformers import pipeline, BertConfig, BertForSequenceClassification, BertTokenizer
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.utils.class_weight import compute_class_weight

# Data

In [4]:
df = pd.read_parquet('final_17042025.parquet')

In [5]:
df

Unnamed: 0,response_id,document_id,user_id,annotator_sentiment,is_ck_annotation,response_timestamp,document_content,annotation_date,username,unique_document_id,language_wc,document_length,gpt_labels_v1,language_gpt,language_manual,language,stratification_label,df_set
0,1,1,277133851,neutral,1,2025-03-09T23:23:07.220881,‚ö°Ô∏è–£–∫—Ä–∞—ó–Ω—Å—å–∫–∞ –¥–µ–ª–µ–≥–∞—Ü—ñ—è –≤—ñ–¥–ø—Ä–∞–≤–∏–ª–∞—Å—è –Ω–∞ –ø–µ—Ä–µ–º–æ–≤...,2025-03-09,O,1_1,uk,67,neutral,Ukrainian,ukrainian,ua,neutral_ua,train
1,3,2,1065283664,neutral,1,2025-03-09T23:44:28.262307,"–í–∏–±—É—Ö–∏ –Ω–∞ –û–¥–µ—â–∏–Ω—ñ, –ø–æ–ø–µ—Ä–µ–¥–Ω—å–æ ‚Äî –ü–ü–û.",2025-03-09,A,2_1,uk,36,negative,Ukrainian,ukrainian,ua,neutral_ua,validation
2,4,3,1065283664,negative,1,2025-03-09T23:45:00.503098,"–ê —á—Ç–æ –¥–µ–ª–∞—Ç—å —Ç–µ–º ,–∫—Ç–æ –ª–∏—à–∏–ª—Å—è —Å–≤–æ–µ–≥–æ –∂–∏–ª—å—è ,–ø–æ...",2025-03-09,A,3_1,ru,177,negative,Code-mixed,russian,ru,negative_ru,test
3,5,4,1065283664,negative,1,2025-03-09T23:46:33.265766,–¢–æ–≥–¥–∞ —É—á–∏—Å—å –±—ã—Å—Ç—Ä–æ –±–µ–≥–∞—Ç—å. –î–ª—è –º–µ–Ω—è –≤–æ–ø—Ä–æ—Å —Å–ª–æ...,2025-03-09,A,4_1,ru,103,negative,Code-mixed,russian,ru,negative_ru,train
4,6,5,1065283664,neutral,1,2025-03-09T23:46:38.993496,–î–æ–±—Ä–∏–π –¥–µ–Ω—å,2025-03-09,A,5_1,uk,11,neutral,Ukrainian,russian,ua,neutral_ua,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12219,13028,8948,467130971,negative,0,2025-04-04T08:02:37.362562,"–ö—Ä–∞—â–µ ""–ø–æ–≤–∏–Ω–Ω–∞ –±—É—Ç–∏ –∑—Ä—É—á–Ω—ñ—à–æ—é, –Ω—ñ–∂ Uber —á–∏ Boo...",2025-04-04,D,8948_0,uk,51,positive,Code-mixed,ukrainian,ua,negative_ua,train
12220,13029,2094,467130971,mixed,0,2025-04-04T08:03:35.792932,–£–≤–∞–≥–∞! –ó –¥–µ—è–∫–∏—Ö —ñ–Ω—Ç–µ—Ä–Ω–µ—Ç –¥–∂–µ—Ä–µ–ª —à–∏—Ä–∏—Ç—å—Å—è —ñ–Ω—Ñ–æ—Ä...,2025-04-04,D,2094_0,uk,402,positive,Ukrainian,ukrainian,ua,mixed_ua,train
12221,13030,5013,467130971,neutral,0,2025-04-04T08:03:42.008533,"–ü–∏—Ç–∞–Ω–Ω—è, —Ü–µ–π —Å–µ—Ä—Ç–∏—Ñ—ñ–∫–∞—Ç –º–æ–∂–Ω–∞ –≤–∂–µ –≤–∏–∫–æ—Ä–∏—Å—Ç–æ–≤—É–≤...",2025-04-04,D,5013_0,uk,113,neutral,Ukrainian,ukrainian,ua,neutral_ua,train
12222,13031,4572,467130971,negative,0,2025-04-04T08:03:48.251166,–ù–∞ –í—É–≥–ª–µ–¥–∞—Ä—Å—å–∫–æ–º—É –Ω–∞–ø—Ä—è–º–∫—É –∑–∞–≥–∏–Ω—É–≤ –†–æ–º–∞ –Ü–≤–∞–Ω–µ–Ω...,2025-04-04,D,4572_0,uk,114,negative,Ukrainian,ukrainian,ua,negative_ua,train


In [6]:
# df = df.loc[df['annotator_sentiment'] != 'mixed']

In [7]:
df.shape

(12224, 18)

In [8]:
splits_df = {}

for sett in df.df_set.unique():
    splits_df[sett] = df.loc[df['df_set'] == sett].copy()

In [9]:
train_df = splits_df['train']
val_df = splits_df['validation']
test_df = splits_df['test']

In [10]:
# train_df = train_df.loc[:, ['document_content', 'annotator_sentiment']]

# Model

In [11]:
num_labels=df.annotator_sentiment.nunique()

In [12]:
num_labels

4

In [13]:
config = BertConfig.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=num_labels,
    hidden_dropout_prob=0.2,    # Increase from default (typically 0.1)
    attention_probs_dropout_prob=0.2
)

In [14]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", config=config)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
tokenizer("Hello world")['input_ids']

[101, 31178, 11356, 102]

In [16]:
len(tokenizer.tokenize("Hello world"))

2

In [17]:
token_lengths = []
for text in df['document_content']:
    tokens = tokenizer.tokenize(text)
    token_lengths.append(len(tokens))

print(f"Average tokens per document: {np.mean(token_lengths)}")
print(f"Median tokens per document: {np.median(token_lengths)}")
print(f"Max tokens per document: {np.max(token_lengths)}")
print(f"Documents exceeding 512 tokens: {sum(np.array(token_lengths) > 512)}")

Average tokens per document: 47.95983996758837
Median tokens per document: 35.0
Max tokens per document: 1894
Documents exceeding 512 tokens: 40


In [18]:
len(token_lengths)

39492

In [20]:
# df['token_lengths'] = token_lengths

In [21]:
# df.loc[df.document_length > 1000]

# Training inputs

In [17]:
# Define maximum sequence length (check max length for your specific model)
MAX_LENGTH = 512

In [18]:
# Function to create data loaders
def create_data_loaders(train_dataset, val_dataset, test_dataset, batch_size=16):
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    return train_loader, val_loader, test_loader

In [19]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512, strategy="truncate"):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.strategy = strategy

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Different strategies for handling long texts
        if self.strategy == "truncate":
            # Simple truncation from the beginning
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

        elif self.strategy == "head_tail":
            # Take first half tokens from beginning, second half from end
            tokens = self.tokenizer.tokenize(text)
            if len(tokens) > self.max_length - 2:  # Account for special tokens
                half_length = (self.max_length - 2) // 2
                tokens = tokens[:half_length] + tokens[-half_length:]

            encoding = self.tokenizer.encode_plus(
                self.tokenizer.convert_tokens_to_string(tokens),
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [20]:
# Function to process dataset with chosen strategy
def prepare_datasets(train_df, val_df, test_df, tokenizer, max_length=512, strategy="truncate"):
    # Encode the sentiment labels
    label_encoder = LabelEncoder()

    # Fit on the entire dataset to ensure all classes are included
    all_sentiments = pd.concat([
        train_df['annotator_sentiment'],
        val_df['annotator_sentiment'],
        test_df['annotator_sentiment']
    ])
    label_encoder.fit(all_sentiments)

    # Transform the labels
    train_labels = label_encoder.transform(train_df['annotator_sentiment'])
    val_labels = label_encoder.transform(val_df['annotator_sentiment'])
    test_labels = label_encoder.transform(test_df['annotator_sentiment'])

    # Create datasets
    train_dataset = SentimentDataset(
        train_df['document_content'].values,
        train_labels,
        tokenizer,
        max_length,
        strategy
    )

    val_dataset = SentimentDataset(
        val_df['document_content'].values,
        val_labels,
        tokenizer,
        max_length,
        strategy
    )

    test_dataset = SentimentDataset(
        test_df['document_content'].values,
        test_labels,
        tokenizer,
        max_length,
        strategy
    )

    return train_dataset, val_dataset, test_dataset, label_encoder

In [21]:
train_dataset, val_dataset, test_dataset, label_encoder = prepare_datasets(
    train_df, val_df, test_df, tokenizer, MAX_LENGTH, strategy="truncate" #head_tail
)

train_loader, val_loader, test_loader = create_data_loaders(
    train_dataset, val_dataset, test_dataset, batch_size=16
)

In [22]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training

In [23]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    learning_rate=1.1735182865186952e-05,             # Common starting point for BERT
    num_train_epochs=10,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    # per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    weight_decay=0.012916490115700903,
    save_total_limit=10,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="wandb",
    run_name="mbert_augmented"
)

## Cross entropy  loss

In [24]:
# Get class distribution
class_counts = np.bincount(label_encoder.transform(train_df['annotator_sentiment']))
print("Class distribution:", class_counts)

Class distribution: [ 487 3632 3761 1899]


In [25]:
# Calculate balanced weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(label_encoder.transform(train_df['annotator_sentiment'])),
    y=label_encoder.transform(train_df['annotator_sentiment'])
)

In [26]:
mixed_class_index = 0 

In [27]:
class_weights[mixed_class_index]

np.float64(5.020020533880904)

In [28]:
# class_weights[mixed_class_index] *= 1.5  # Additional boost

In [29]:
# Convert to tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

In [30]:
{
    'LABEL_0': 'mixed',
    'LABEL_1': 'negative',
    'LABEL_2': 'neutral',
    'LABEL_3': 'positive',
}

{'LABEL_0': 'mixed',
 'LABEL_1': 'negative',
 'LABEL_2': 'neutral',
 'LABEL_3': 'positive'}

In [31]:
print("Class weights:", class_weights)

Class weights: [5.02002053 0.67311399 0.65002659 1.2873881 ]


In [32]:
class WeightedLossTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compatible with all Transformers versions"""
        if "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = inputs.get("labels")
            
        outputs = model(**inputs)
        logits = outputs.logits
        
        if self.class_weights is not None:
            # Ensure weights are on the right device
            weights = self.class_weights.to(logits.device)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
            
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [33]:
trainer = WeightedLossTrainer(
    class_weights=class_weights_tensor,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [35]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
# )

In [34]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,1.3847,1.365499,0.356792,0.338279,0.342576,0.356792
100,1.3675,1.339361,0.453355,0.400239,0.432125,0.453355
150,1.3474,1.294607,0.53437,0.526463,0.525757,0.53437
200,1.286,1.181867,0.566285,0.569407,0.57604,0.566285
250,1.181,1.088824,0.591653,0.583431,0.618154,0.591653
300,1.1197,1.039178,0.572013,0.580403,0.620915,0.572013
350,1.047,1.041806,0.594108,0.599653,0.619175,0.594108
400,1.0224,1.065192,0.58838,0.585365,0.611997,0.58838
450,1.0582,0.98529,0.575286,0.596268,0.640784,0.575286
500,0.9926,1.016773,0.579378,0.591477,0.645369,0.579378


TrainOutput(global_step=2050, training_loss=0.8758560348138577, metrics={'train_runtime': 930.3387, 'train_samples_per_second': 105.112, 'train_steps_per_second': 3.289, 'total_flos': 1.7239872143450112e+16, 'train_loss': 0.8758560348138577, 'epoch': 6.699346405228758})

In [35]:
print(123)

123


In [36]:
trainer.save_model()

[1;34mwandb[0m: 
[1;34mwandb[0m: üöÄ View run [33mroberta-ukrainian-sentiment[0m at: [34mhttps://wandb.ai/shynkarov-pn-ukrainian-catholic-university/ukrainian-sentiment/runs/vl9tnzbt[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250420_080024-vl9tnzbt/logs[0m


# Hyper params tuning

In [28]:
def objective(trial):
    # Define the hyperparameter search space
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results_optuna",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=2,
        save_total_limit=10,
        # warmup_ratio=0.1,
        num_train_epochs=5,
        weight_decay=weight_decay,
        logging_dir="./logs_optuna",
        logging_steps=50,
        eval_steps=50,
        save_steps=50,
        eval_strategy="steps",
        save_strategy="steps",
        report_to="wandb",
        run_name="optuna",
        metric_for_best_model="f1",
    )

    # Initialize Trainer and train model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
    )
    trainer.train()

    # Evaluate the model and return validation accuracy
    eval_results = trainer.evaluate()
    return eval_results["eval_accuracy"]

In [None]:
# Create and run the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2025-04-16 23:10:59,094] A new study created in memory with name: no-name-8513712f-27d0-42a3-b86a-4666b4e6c40c
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.9172,0.742903,0.677864,0.674121,0.693244,0.677864
100,0.7247,0.679379,0.707149,0.706546,0.70801,0.707149
150,0.6823,0.67973,0.70801,0.704029,0.725822,0.70801
200,0.6974,0.661064,0.721792,0.718715,0.726476,0.721792
250,0.6662,0.645199,0.732989,0.73071,0.742404,0.732989
300,0.6391,0.663719,0.717485,0.716716,0.72371,0.717485
350,0.5692,0.678832,0.722653,0.723358,0.726377,0.722653
400,0.5389,0.670532,0.721792,0.719441,0.729942,0.721792
450,0.5824,0.656048,0.72093,0.720588,0.722225,0.72093
500,0.5375,0.663014,0.736434,0.734845,0.742807,0.736434


[I 2025-04-16 23:18:29,295] Trial 0 finished with value: 0.7450473729543498 and parameters: {'learning_rate': 1.1735182865186952e-05, 'batch_size': 16, 'weight_decay': 0.012916490115700903}. Best is trial 0 with value: 0.7450473729543498.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.4546,0.820859,0.712317,0.710457,0.728403,0.712317
100,0.3908,0.819037,0.727821,0.727423,0.732244,0.727821
150,0.3396,0.795487,0.716624,0.714898,0.728954,0.716624
200,0.3899,0.819856,0.712317,0.713834,0.720759,0.712317
250,0.3748,0.763311,0.729543,0.729156,0.729498,0.729543


In [30]:
print("Best hyperparameters:", study.best_params)

Best hyperparameters: {'learning_rate': 1.1735182865186952e-05, 'batch_size': 16, 'weight_decay': 0.012916490115700903}


Best hyperparameters: {'learning_rate': 1.1735182865186952e-05, 'batch_size': 16, 'weight_decay': 0.012916490115700903}

In [31]:
study.best_params

{'learning_rate': 1.1735182865186952e-05,
 'batch_size': 16,
 'weight_decay': 0.012916490115700903}

# Classification report

In [36]:
# Get predictions
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [37]:
test_labels = label_encoder.transform(test_df['annotator_sentiment'])

In [38]:
label_encoder.classes_

array(['mixed', 'negative', 'neutral', 'positive'], dtype=object)

In [30]:
# Print classification report
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       0.60      0.05      0.09        60
           1       0.71      0.75      0.73       455
           2       0.65      0.67      0.66       471
           3       0.65      0.68      0.66       237

    accuracy                           0.67      1223
   macro avg       0.65      0.54      0.53      1223
weighted avg       0.67      0.67      0.66      1223



In [45]:
# Print classification report -- second try
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       0.32      0.32      0.32        60
           1       0.76      0.70      0.73       455
           2       0.64      0.75      0.69       471
           3       0.72      0.59      0.65       237

    accuracy                           0.68      1223
   macro avg       0.61      0.59      0.59      1223
weighted avg       0.68      0.68      0.68      1223



In [23]:
# Print classification report -- third try
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       0.33      0.03      0.06        60
           1       0.74      0.76      0.75       455
           2       0.66      0.79      0.72       471
           3       0.76      0.61      0.68       237

    accuracy                           0.71      1223
   macro avg       0.62      0.55      0.55      1223
weighted avg       0.69      0.71      0.69      1223



In [30]:
# Print classification report -- fourth try
print(classification_report(test_labels, preds))

              precision    recall  f1-score   support

           0       0.14      0.02      0.03        60
           1       0.72      0.75      0.74       455
           2       0.68      0.77      0.72       471
           3       0.71      0.64      0.67       237

    accuracy                           0.70      1223
   macro avg       0.56      0.54      0.54      1223
weighted avg       0.68      0.70      0.68      1223



In [30]:
# Print classification report -- fifth try
print(classification_report(test_labels, preds))
# trainer.save_model('best')

              precision    recall  f1-score   support

           0       0.72      0.84      0.77       455
           1       0.72      0.69      0.70       471
           2       0.79      0.63      0.70       237

    accuracy                           0.73      1163
   macro avg       0.74      0.72      0.73      1163
weighted avg       0.74      0.73      0.73      1163



In [None]:
# Print classification report -- sixth try, mixed class with cross entropy
print(classification_report(test_labels, preds))
# trainer.save_model('best')

In [39]:
# Print classification report -- seventh try, mixed class with cross entropy
print(classification_report(test_labels, preds))
# trainer.save_model('best')

              precision    recall  f1-score   support

           0       0.48      0.20      0.28        60
           1       0.71      0.77      0.74       455
           2       0.69      0.73      0.71       471
           3       0.77      0.67      0.72       237

    accuracy                           0.71      1223
   macro avg       0.66      0.59      0.61      1223
weighted avg       0.70      0.71      0.70      1223



In [35]:
# Print classification report -- mBERT
print(classification_report(test_labels, preds))
# trainer.save_model('best')

              precision    recall  f1-score   support

           0       0.36      0.20      0.26        60
           1       0.73      0.70      0.71       455
           2       0.64      0.77      0.70       471
           3       0.70      0.55      0.62       237

    accuracy                           0.67      1223
   macro avg       0.61      0.56      0.57      1223
weighted avg       0.67      0.67      0.67      1223



In [39]:
# Print classification report -- mBERT w.o augmentations
print(classification_report(test_labels, preds))
# trainer.save_model('best')

              precision    recall  f1-score   support

           0       0.20      0.25      0.22        60
           1       0.77      0.61      0.68       455
           2       0.64      0.76      0.70       471
           3       0.66      0.63      0.64       237

    accuracy                           0.65      1223
   macro avg       0.57      0.56      0.56      1223
weighted avg       0.67      0.65      0.66      1223

