In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [None]:
# Load the training and validation splits from CSV files in the working directory.
# Later cells read a 'text' column and treat every column after the first as a label.
TRAIN_CSV, VALID_CSV = 'train.csv', 'valid.csv'
train_df = pd.read_csv(TRAIN_CSV)
valid_df = pd.read_csv(VALID_CSV)

In [None]:
def preprocess_text(text):
    r"""Lower-case ``text`` and strip every non-word, non-whitespace character.

    The pattern ``[^\w\s]`` removes punctuation and symbols while keeping
    letters (including non-ASCII ones), digits, underscores and whitespace.

    Args:
        text: The raw cell value. Normally a ``str``; missing values
            (``None`` / ``NaN`` / ``pd.NA``, e.g. from an empty CSV field) and
            other scalars (e.g. a numeric-only message parsed as a number)
            are tolerated instead of raising ``AttributeError``.

    Returns:
        The normalised string; ``''`` when ``text`` is missing.
    """
    import re

    if not isinstance(text, str):
        # pandas hands us NaN/None for empty fields; treat them as empty text
        # rather than crashing on ``.lower()``.
        if pd.isna(text):
            return ''
        text = str(text)

    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise the 'text' column of both splits with the same preprocessing,
# overwriting the raw text in each frame.
for split_df in (train_df, valid_df):
    split_df['text'] = split_df['text'].apply(preprocess_text)

def balance_dataset(dataframe):
    """Oversample rows so every label contributes the same number of rows.

    Every column after the first is treated as a label indicator column.
    For each label that has at least one row equal to 1, the rows carrying
    that label are resampled *with replacement* up to the size of the largest
    label; the per-label samples are concatenated and shuffled.

    Notes:
        * A row carrying several labels is present in several per-label
          frames, so per-label counts in the result are only approximately
          equal.
        * Rows whose label columns are all 0 never enter the result.
        * Sampling and shuffling use a fixed seed (42), so the output is
          deterministic for a given input.

    Args:
        dataframe: Frame whose first column is the sample and whose remaining
            columns are label indicators.

    Returns:
        A new shuffled frame with a fresh ``RangeIndex``. When no label has a
        positive row (or there are no label columns) an empty frame with the
        same columns is returned instead of raising from ``pd.concat``.
    """
    # Number of positive rows per label column.
    label_counts = dataframe.iloc[:, 1:].sum()

    # Collect the rows that carry each label; labels with no positives are skipped.
    positive_frames = []
    for label in label_counts.index:
        label_frame = dataframe[dataframe[label] == 1]
        if len(label_frame) > 0:
            positive_frames.append(label_frame)

    if not positive_frames:
        # Nothing to oversample: pd.concat([]) would raise ValueError.
        return dataframe.iloc[0:0].reset_index(drop=True)

    # Size of the largest label. Cast to a plain int because the sum is a
    # float when the label columns are float-typed, and resample needs an
    # integer sample count.
    max_class_size = int(label_counts.max())

    balanced_frames = [
        resample(
            label_frame,
            replace=True,  # sample with replacement
            n_samples=max_class_size,  # match the largest label
            random_state=42,
        )
        for label_frame in positive_frames
    ]

    balanced_df = pd.concat(balanced_frames)
    return balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Oversample the training split only; valid_df is left with its original distribution.
# NOTE(review): this rebinds train_df, so re-running the cell balances already-balanced data.
train_df = balance_dataset(train_df)

In [None]:
# Hyperparameters consumed by the dataset and TrainingArguments cells below.
MAX_LENGTH = 128  # every text is padded/truncated to this many tokens by the tokenizer
BATCH_SIZE = 256  # per-device batch size, used for both training and evaluation
EPOCHS = 15  # upper bound on training epochs; early stopping may end the run sooner
# Learning rate passed to TrainingArguments (cosine schedule with warmup).
# NOTE(review): 3e-4 looks high for fine-tuning a pretrained transformer — confirm it is intentional.
LEARNING_RATE = 3e-4

In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for multi-label training.

    The frame's ``'text'`` column supplies the inputs and every column after
    the first supplies the label vector. Tokenization happens lazily in
    ``__getitem__``; each item is padded/truncated to ``max_length`` tokens.
    """

    # Keys copied from the tokenizer output into each sample.
    _ENCODING_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the texts, the label matrix and the tokenization settings.

        Args:
            dataframe: Frame with a ``'text'`` column; columns ``1:`` are labels.
            tokenizer: Callable accepting the text plus ``max_length``,
                ``padding``, ``truncation`` and ``return_tensors`` keywords.
            max_length: Token length each encoded text is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe.iloc[:, 1:].values

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float ``labels`` for row ``idx``."""
        raw_text = self.texts[idx]
        target = torch.tensor(self.labels[idx], dtype=torch.float)

        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch dimension of 1; drop it.
        sample = {field: encoded[field].squeeze(0) for field in self._ENCODING_FIELDS}
        sample['labels'] = target
        return sample

# Tokenizer and model weights are both restored from the same earlier checkpoint.
CHECKPOINT_DIR = "./results/checkpoint-573"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT_DIR,
    num_labels=7,  # presumably must equal the number of label columns in the CSVs — verify
    problem_type="multi_label_classification",
)

# Wrap both splits so each item is tokenized to MAX_LENGTH tokens on access.
train_dataset, valid_dataset = (
    EmotionDataset(split_df, tokenizer, MAX_LENGTH)
    for split_df in (train_df, valid_df)
)

In [None]:
import os

# Switch off the tokenizers library's own parallelism before training starts.
# NOTE(review): presumably to avoid its fork-related warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

training_args = TrainingArguments(
    # Where checkpoints and trainer state are written.
    output_dir='./results/rosberta/',

    # Optimisation: cosine learning-rate schedule with 300 warmup steps.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_steps=300,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,

    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    save_on_each_node=True,  # NOTE(review): presumably only relevant for multi-node runs — confirm

    # After training, reload the checkpoint selected by eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
)

def compute_metrics(pred):
    """Compute weighted F1 and accuracy for multi-label predictions.

    Args:
        pred: A ``(logits, labels)`` pair as unpacked from the evaluation
            output; ``logits`` are raw scores, ``labels`` the 0/1 targets.

    Returns:
        ``{"f1": ..., "accuracy": ...}`` where F1 is support-weighted across
        labels (``zero_division=0``) and accuracy is whatever
        ``sklearn.metrics.accuracy_score`` returns for the binarized
        predictions (for multi-label indicator input sklearn documents this
        as exact-match / subset accuracy).
    """
    import torch
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = pred

    # Sigmoid turns each logit into an independent per-label probability.
    probs = torch.tensor(logits).sigmoid().numpy()
    # Binarize with a 0.5 threshold.
    binary_preds = (probs > 0.5).astype(int)

    return {
        "f1": f1_score(labels, binary_preds, average="weighted", zero_division=0),
        "accuracy": accuracy_score(labels, binary_preds),
    }

# Early stopping with a patience of 3 evaluations; with the arguments above
# the monitored metric is eval_loss and evaluation runs once per epoch.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Fine-tune the model on the balanced training split.
trainer.train()