In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
import re
from sklearn.model_selection import train_test_split

2025-03-26 10:25:50.798726: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def wordopt(text):
    text = re.sub(r'https?://\S+|www\.\S+', '[URL]', text)
    text = text.replace('\n', ' ')
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
df = pd.read_csv("../../datasets/CC_WELF_merged_cleaned.csv")
df["word_count"] = df["text"].apply(lambda x: len(x.split()))
df = df[df["word_count"] >= 30]
df.drop(columns=["word_count"], inplace=True)
df = df[df["language"] == 'en']
df["text"] = df["text"].apply(wordopt)

texts = df["text"].tolist()
labels = df["label"].tolist()

In [7]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [8]:
class RobertaDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.encodings = tokenizer(texts,
                                   truncation=True,
                                   padding="max_length",
                                   max_length=max_length,
                                   return_tensors="pt")
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [9]:
tokenizer = RobertaTokenizer.from_pretrained("saved_models/roberta/roberta_tokenizer")
max_length = 512

test_dataset = RobertaDataset(test_texts, test_labels, tokenizer, max_length)


In [12]:
calibration_size = int(0.5 * len(test_dataset))
evaluation_size = len(test_dataset) - calibration_size
calibration_dataset, evaluation_dataset = random_split(test_dataset, [calibration_size, evaluation_size])


In [13]:
model = RobertaForSequenceClassification.from_pretrained("saved_models/roberta/roberta_torch_model")
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
class TemperatureScaler(nn.Module):
    def __init__(self):
        super(TemperatureScaler, self).__init__()
        # Folosim log_temperature pentru a garanta T > 0 (T = exp(log_temperature))
        self.log_temperature = nn.Parameter(torch.zeros(1))
    def forward(self, logits):
        temperature = torch.exp(self.log_temperature)
        return logits / temperature

In [11]:
def calibrate_model(model, calibration_dataset, batch_size=32):
    dataloader = DataLoader(calibration_dataset, batch_size=batch_size, shuffle=False)
    all_logits = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            all_logits.append(logits)
            all_labels.append(labels)
    all_logits = torch.cat(all_logits)
    all_labels = torch.cat(all_labels)
    
    # Instanțiem modulul de temperature scaling
    temp_scaler = TemperatureScaler()
    optimizer = optim.LBFGS(temp_scaler.parameters(), lr=0.01, max_iter=50)

    def loss_fn():
        optimizer.zero_grad()
        scaled_logits = temp_scaler(all_logits)
        loss = F.cross_entropy(scaled_logits, all_labels)
        loss.backward()
        return loss

    optimizer.step(loss_fn)
    
    optimal_temperature = torch.exp(temp_scaler.log_temperature).item()
    print("Temperatura optimă: {:.4f}".format(optimal_temperature))
    return optimal_temperature, temp_scaler

In [14]:
optimal_temp, temp_scaler_module = calibrate_model(model, calibration_dataset)

Temperatura optimă: 1.1423


In [15]:
def predict_with_calibration(model, temp_scaler, dataloader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            scaled_logits = temp_scaler(logits)
            probs = torch.softmax(scaled_logits, dim=-1)
            preds = torch.argmax(probs, dim=-1)
            predictions.extend(preds.cpu().numpy().tolist())
    return predictions

In [16]:
eval_dataloader = DataLoader(evaluation_dataset, batch_size=32, shuffle=False)
predictions = predict_with_calibration(model, temp_scaler_module, eval_dataloader)

In [18]:
class CalibratedModel(nn.Module):
    def __init__(self, model, temp_scaler):
        super(CalibratedModel, self).__init__()
        self.model = model          # Modelul RoBERTa deja antrenat
        self.temp_scaler = temp_scaler  # Modulul TemperatureScaler calibrat

    def forward(self, input_ids, attention_mask):
        # Obținem logit-urile din model
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Aplicăm temperature scaling pentru a obține logit-uri calibrate
        scaled_logits = self.temp_scaler(logits)
        # Calculăm probabilitățile calibrate
        calibrated_probs = F.softmax(scaled_logits, dim=-1)
        return calibrated_probs


In [19]:
calibrated_model = CalibratedModel(model, temp_scaler_module)

In [20]:
torch.save(calibrated_model.state_dict(), "calibrated_model.pth")