# Imports

In [2]:
# import wandb
# wandb.login()

# wandb.init(
#     project="ukrainian-sentiment",  # Name your project
#     name="roberta-ukrainian-sentiment",  # Optional run name
#     tags=["roberta", "ukrainian", "sentiment"],  # Optional tags for filtering
# )

In [31]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint

from transformers import pipeline, RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report


from safetensors import safe_open
from safetensors.torch import load_file

# Data

In [7]:
df = pd.read_parquet('./data_provided/final_dataset/final_17042025.parquet')

In [8]:
df

Unnamed: 0,response_id,document_id,user_id,annotator_sentiment,is_ck_annotation,response_timestamp,document_content,annotation_date,username,unique_document_id,language_wc,document_length,gpt_labels_v1,language_gpt,language_manual,language,stratification_label,df_set
0,1,1,277133851,neutral,1,2025-03-09T23:23:07.220881,⚡️Українська делегація відправилася на перемов...,2025-03-09,O,1_1,uk,67,neutral,Ukrainian,ukrainian,ua,neutral_ua,train
1,3,2,1065283664,neutral,1,2025-03-09T23:44:28.262307,"Вибухи на Одещині, попередньо — ППО.",2025-03-09,A,2_1,uk,36,negative,Ukrainian,ukrainian,ua,neutral_ua,validation
2,4,3,1065283664,negative,1,2025-03-09T23:45:00.503098,"А что делать тем ,кто лишился своего жилья ,по...",2025-03-09,A,3_1,ru,177,negative,Code-mixed,russian,ru,negative_ru,test
3,5,4,1065283664,negative,1,2025-03-09T23:46:33.265766,Тогда учись быстро бегать. Для меня вопрос сло...,2025-03-09,A,4_1,ru,103,negative,Code-mixed,russian,ru,negative_ru,train
4,6,5,1065283664,neutral,1,2025-03-09T23:46:38.993496,Добрий день,2025-03-09,A,5_1,uk,11,neutral,Ukrainian,russian,ua,neutral_ua,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12219,13028,8948,467130971,negative,0,2025-04-04T08:02:37.362562,"Краще ""повинна бути зручнішою, ніж Uber чи Boo...",2025-04-04,D,8948_0,uk,51,positive,Code-mixed,ukrainian,ua,negative_ua,train
12220,13029,2094,467130971,mixed,0,2025-04-04T08:03:35.792932,Увага! З деяких інтернет джерел шириться інфор...,2025-04-04,D,2094_0,uk,402,positive,Ukrainian,ukrainian,ua,mixed_ua,train
12221,13030,5013,467130971,neutral,0,2025-04-04T08:03:42.008533,"Питання, цей сертифікат можна вже використовув...",2025-04-04,D,5013_0,uk,113,neutral,Ukrainian,ukrainian,ua,neutral_ua,train
12222,13031,4572,467130971,negative,0,2025-04-04T08:03:48.251166,На Вугледарському напрямку загинув Рома Іванен...,2025-04-04,D,4572_0,uk,114,negative,Ukrainian,ukrainian,ua,negative_ua,train


In [9]:
df.shape

(12224, 18)

In [10]:
splits_df = {}

for sett in df.df_set.unique():
    splits_df[sett] = df.loc[df['df_set'] == sett].copy()

In [11]:
train_df = splits_df['train']
val_df = splits_df['validation']
test_df = splits_df['test']

In [None]:
# train_df = train_df.loc[:, ['document_content', 'annotator_sentiment']]

# Model

In [12]:
num_labels=df.annotator_sentiment.nunique()

In [13]:
num_labels

4

In [14]:
# Load model with increased dropout
config = RobertaConfig.from_pretrained(
    "youscan/ukr-roberta-base",
    num_labels=num_labels,
    hidden_dropout_prob=0.2,    # Increase from default (typically 0.1)
    attention_probs_dropout_prob=0.2
)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [15]:
model = RobertaForSequenceClassification.from_pretrained("youscan/ukr-roberta-base", num_labels=num_labels)
tokenizer = RobertaTokenizer.from_pretrained("youscan/ukr-roberta-base")

pytorch_model.bin:   0%|          | 0.00/507M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at youscan/ukr-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [16]:
tokenizer("Hello world")['input_ids']

[0, 44, 7802, 83, 4605, 14826, 2]

In [18]:
len(tokenizer.tokenize("Hello world"))

5

# Dataloaders

In [23]:
MAX_LENGTH = 512

In [24]:
# Function to create data loaders
def create_data_loaders(train_dataset, val_dataset, test_dataset, batch_size=16):
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    return train_loader, val_loader, test_loader

In [25]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512, strategy="truncate"):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.strategy = strategy

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Different strategies for handling long texts
        if self.strategy == "truncate":
            # Simple truncation from the beginning
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

        elif self.strategy == "head_tail":
            # Take first half tokens from beginning, second half from end
            tokens = self.tokenizer.tokenize(text)
            if len(tokens) > self.max_length - 2:  # Account for special tokens
                half_length = (self.max_length - 2) // 2
                tokens = tokens[:half_length] + tokens[-half_length:]

            encoding = self.tokenizer.encode_plus(
                self.tokenizer.convert_tokens_to_string(tokens),
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [26]:
# Function to process dataset with chosen strategy
def prepare_datasets(train_df, val_df, test_df, tokenizer, max_length=512, strategy="truncate"):
    # Encode the sentiment labels
    label_encoder = LabelEncoder()

    # Fit on the entire dataset to ensure all classes are included
    all_sentiments = pd.concat([
        train_df['annotator_sentiment'],
        val_df['annotator_sentiment'],
        test_df['annotator_sentiment']
    ])
    label_encoder.fit(all_sentiments)

    # Transform the labels
    train_labels = label_encoder.transform(train_df['annotator_sentiment'])
    val_labels = label_encoder.transform(val_df['annotator_sentiment'])
    test_labels = label_encoder.transform(test_df['annotator_sentiment'])

    # Create datasets
    train_dataset = SentimentDataset(
        train_df['document_content'].values,
        train_labels,
        tokenizer,
        max_length,
        strategy
    )

    val_dataset = SentimentDataset(
        val_df['document_content'].values,
        val_labels,
        tokenizer,
        max_length,
        strategy
    )

    test_dataset = SentimentDataset(
        test_df['document_content'].values,
        test_labels,
        tokenizer,
        max_length,
        strategy
    )

    return train_dataset, val_dataset, test_dataset, label_encoder

In [27]:
train_dataset, val_dataset, test_dataset, label_encoder = prepare_datasets(
    train_df, val_df, test_df, tokenizer, MAX_LENGTH, strategy="truncate" #head_tail
)

train_loader, val_loader, test_loader = create_data_loaders(
    train_dataset, val_dataset, test_dataset, batch_size=16
)

In [28]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Read model state 

In [29]:
model_path = "./models/model_ukrroberta.safetensors"
# model = safetensors.load(model_path)

In [32]:
state_dict = load_file(model_path)

In [33]:
model.load_state_dict(state_dict)

<All keys matched successfully>

In [34]:
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
{
    'LABEL_0': 'mixed',
    'LABEL_1': 'negative',
    'LABEL_2': 'neutral',
    'LABEL_3': 'positive',
}

{'LABEL_0': 'mixed',
 'LABEL_1': 'negative',
 'LABEL_2': 'neutral',
 'LABEL_3': 'positive'}

In [35]:
def predict_sentiment(text, model, tokenizer):
    # Prepare the text input
    inputs = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
    
    # Move inputs to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
    
    # Map the prediction to sentiment labels
    # Assuming the labels are: 0=negative, 1=neutral, 2=positive, 3=mixed (adjust as needed)
    sentiment_labels = ["mixed", "negative", "neutral", "positive"]
    predicted_sentiment = sentiment_labels[predictions.item()]
    
    return {
        'prediction': predictions.item(),
        'sentiment': predicted_sentiment,
        'scores': torch.nn.functional.softmax(logits, dim=1).tolist()[0]
    }

In [36]:
sample_text = "Це був чудовий день в Україні!"

In [37]:
result = predict_sentiment(sample_text, model, tokenizer)
print(f"Text: {sample_text}")
print(f"Predicted sentiment: {result['sentiment']}")
print(f"Confidence scores: {result['scores']}")

Text: Це був чудовий день в Україні!
Predicted sentiment: positive
Confidence scores: [0.01219471637159586, 0.0030130271334201097, 0.006361459847539663, 0.9784308075904846]


In [38]:
sample_text = "Страви були чудові! Але курєр був дууууже повільним і спізнився на 10 хвилин"

In [39]:
result = predict_sentiment(sample_text, model, tokenizer)
print(f"Text: {sample_text}")
print(f"Predicted sentiment: {result['sentiment']}")
print(f"Confidence scores: {result['scores']}")

Text: Страви були чудові! Але курєр був дууууже повільним і спізнився на 10 хвилин
Predicted sentiment: positive
Confidence scores: [0.2520085871219635, 0.037474825978279114, 0.014695622026920319, 0.6958209276199341]


# Expected calibration error

In [None]:
def evaluate_model(model, dataloader, batch_size=16):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Generate classification report
    sentiment_labels = ["mixed", "negative", "neutral", "positive"]
    report = classification_report(
        all_labels, 
        all_preds, 
        target_names=sentiment_labels,
        digits=4
    )
    
    return report, all_labels, all_preds

In [None]:
report, y_true_col, y_pred_col = evaluate_model(model, test_loader, batch_size=16)

In [45]:
print(report)

              precision    recall  f1-score   support

       mixed     0.2143    0.0500    0.0811        60
    negative     0.8099    0.6088    0.6951       455
     neutral     0.6414    0.8280    0.7229       471
    positive     0.6332    0.6920    0.6613       237

    accuracy                         0.6819      1223
   macro avg     0.5747    0.5447    0.5401      1223
weighted avg     0.6816    0.6819    0.6691      1223



In [None]:
def evaluate_sentiment(df, y_true_col, y_pred_col, group_col="language"):
    """
    Evaluate sentiment classification with overall and per-language-group metrics.
    
    Params:
    - df: pd.DataFrame containing predictions and true labels
    - y_true_col: column name of true labels (e.g. human annotations)
    - y_pred_col: column name of model predictions (e.g. DeepSeek output)
    - group_col: column to group by (e.g. 'language')

    Returns:
    - dict with overall metrics and per-group metrics
    """
    y_true = df[y_true_col]
    y_pred = df[y_pred_col]

    # Overall metrics
    overall_macro = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    overall_micro = precision_recall_fscore_support(y_true, y_pred, average='micro', zero_division=0)

    result = {
        "overall": {
            "macro": {
                "precision": overall_macro[0],
                "recall": overall_macro[1],
                "f1": overall_macro[2],
            },
            "micro": {
                "precision": overall_micro[0],
                "recall": overall_micro[1],
                "f1": overall_micro[2],
            }
        },
        "by_group": {}
    }

    # Per-language group metrics
    for group_value in df[group_col].unique():
        subset = df[df[group_col] == group_value]
        if subset.empty:
            continue

        group_true = subset[y_true_col]
        group_pred = subset[y_pred_col]

        macro = precision_recall_fscore_support(group_true, group_pred, average='macro', zero_division=0)
        micro = precision_recall_fscore_support(group_true, group_pred, average='micro', zero_division=0)

        result["by_group"][group_value] = {
            "macro": {
                "precision": macro[0],
                "recall": macro[1],
                "f1": macro[2],
            },
            "micro": {
                "precision": micro[0],
                "recall": micro[1],
                "f1": micro[2],
            }
        }

    return result

In [None]:
metrics = evaluate_sentiment(df_filtered, y_true_col="annotator_response", y_pred_col="sentiment_deepseek")
pprint.pprint(metrics)