In [None]:
# !pip install transformers torch scikit-learn unidecode datasets

In [None]:
# Importar las librerías
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer,  AutoModelForSequenceClassification

In [None]:
# Run configuration shared by the cells below.
lang = "ukr"  # language code; selects the {lang}.csv data files and the output folder
model_name = 'google-bert/bert-base-multilingual-uncased'  # checkpoint used for both tokenizer and model
max_token_len = 71  # max_length used for padding/truncation in the tokenizer call
num_epochs = 10  # passed to TrainingArguments(num_train_epochs=...)
lr = 2e-5  # learning rate handed to the AdamW optimizer
save_model = True  # when True, the final cell writes model and tokenizer to Drive

In [None]:
# Locations of the CSV splits on Google Drive (one file per language).
train_path = f'/content/drive/MyDrive/Proyectos/semeval/data/newest/train/{lang}.csv'
val_path = f'/content/drive/MyDrive/Proyectos/semeval/data/newest/dev/{lang}.csv'
# NOTE(review): test_path points at the same dev file as val_path, so the "test"
# evaluation further down runs on the validation split — confirm this is intended.
test_path = f'/content/drive/MyDrive/Proyectos/semeval/data/newest/dev/{lang}.csv'


In [None]:
# Load every split into its own DataFrame (train, validation, test — in that order).
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

In [None]:
# Fold the dev split into the training data.
# NOTE(review): df_val is also used as the Trainer's eval_dataset (and test_path is the
# same dev file), so the reported metrics are computed on rows the model was trained
# on — confirm this is intended.
# NOTE: this cell is not idempotent; re-running it appends df_val to df_train again.
df_train = pd.concat([df_train, df_val], ignore_index=True)

In [None]:
# Emotion label columns; this order is the order of the entries in each 'labels'
# vector and of the names shown in the classification report.
col_names = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

In [None]:
# Binarize every emotion column: values > 0 become 1, everything else becomes 0.
# Each frame is binarized from its own column, so the test split no longer
# receives values computed from df_val. A vectorized comparison is used instead
# of a row-wise apply.
for x in col_names:
  for frame in (df_train, df_val, df_test):
    frame[x] = (frame[x] > 0).astype(int)

In [None]:
# Cast the emotion columns of every split to float32.
for split_df in (df_train, df_val, df_test):
    split_df[col_names] = split_df[col_names].astype('float32')

In [None]:
# Pack each row's emotion flags into a single list stored in a 'labels' column.
for split_df in (df_train, df_val, df_test):
    split_df['labels'] = split_df[col_names].values.tolist()


In [None]:
def _labels_as_floats(row):
    """Return the row's 'labels' entries converted to built-in Python floats."""
    return [float(value) for value in row["labels"]]


# Normalise the label vectors of every split to lists of Python floats.
df_train["labels"] = df_train.apply(_labels_as_floats, axis=1)
df_val["labels"] = df_val.apply(_labels_as_floats, axis=1)
df_test["labels"] = df_test.apply(_labels_as_floats, axis=1)


In [None]:
# Display the prepared training frame as a sanity check before building the datasets.
df_train

In [None]:
# Convert each split to a Hugging Face Dataset, keeping only the text and its label vector.
model_columns = ['text', 'labels']
train_dataset = Dataset.from_pandas(df_train[model_columns])
val_dataset = Dataset.from_pandas(df_val[model_columns])
test_dataset = Dataset.from_pandas(df_test[model_columns])


In [None]:
# Tokenizer, loaded from the same checkpoint name that is used for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Tokenization helper used by Dataset.map
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is padded or truncated to exactly ``max_token_len`` tokens,
    so all encoded examples have the same length.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=max_token_len,
        truncation=True,
        padding='max_length',
    )


In [None]:
# Run the tokenizer over every split in batched mode.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


In [None]:
# Multi-label classification model: one output per emotion in col_names.
# The 30% dropout is passed to from_pretrained so it is part of the config
# before the network is built; assigning model.config.hidden_dropout_prob
# after construction does not change the dropout layers that already exist.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(col_names),
    problem_type="multi_label_classification",
    hidden_dropout_prob=0.3,
)


In [None]:
# Evaluation metric used by the Trainer
def compute_metrics(pred):
    """Compute multi-label metrics from the Trainer's evaluation output.

    Args:
        pred: object exposing ``predictions`` (raw, un-normalised logits) and
            ``label_ids`` (gold 0/1 label vectors).

    Returns:
        dict with 'accuracy' (as returned by ``accuracy_score`` on the label
        matrices) and macro-averaged 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    logits = np.asarray(pred.predictions)
    # The predictions are logits, not probabilities. sigmoid(z) > 0.5 is
    # equivalent to z > 0, which is the same decision rule the inference cell
    # applies with torch.sigmoid(logits) > 0.5.
    preds = (logits > 0).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}


In [None]:
# Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
# NOTE(review): the Trainer cell passes its own AdamW optimizer, so the
# `weight_decay` set here is presumably not what that optimizer uses — confirm.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # evaluate once per epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"  # no external experiment-tracking integration
)

In [None]:
# Trainer setup. The optimizer is built explicitly so that it uses the
# notebook-level learning rate `lr`; the scheduler slot is left as None.
optimizer = AdamW(model.parameters(), lr=lr)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
)

In [None]:
# Fine-tune the model; evaluation on val_dataset follows the strategy set in training_args.
trainer.train()

In [None]:
# Switch to evaluation mode before manual inference. The trailing semicolon
# keeps the notebook from printing the returned module's full repr.
model.eval();

### Evaluación

In [None]:
# Evaluación
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Turn a list of tokenized examples into a dict of batched tensors.

    Args:
        batch: list of dicts, each with 'input_ids', 'attention_mask' and 'labels'.

    Returns:
        dict with the same three keys, each holding one tensor whose first
        dimension indexes the examples of the batch.
    """
    def _column(key):
        # Collect one field across all examples, preserving batch order.
        return [example[key] for example in batch]

    return {
        'input_ids': torch.tensor(_column('input_ids')),
        'attention_mask': torch.tensor(_column('attention_mask')),
        'labels': torch.stack([torch.tensor(vector) for vector in _column('labels')]),
    }

# Batches of 16 over the test split in original order (no shuffling), so the
# collected predictions line up with the dataset rows.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [None]:
# Run inference over the test loader, collecting one prediction row and one
# gold-label row per example.
all_preds, all_labels = [], []

for batch in test_loader:
    # Only the model inputs need to be on the model's device.
    input_ids = batch['input_ids'].to(model.device)
    attention_mask = batch['attention_mask'].to(model.device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Independent sigmoid per label, thresholded at 0.5 (multi-label decision).
    preds = (torch.sigmoid(logits) > 0.5).int().cpu().numpy()
    # Labels are never fed to the model, so they are converted directly
    # instead of being copied to the device and back.
    labels = batch['labels'].cpu().numpy()

    all_preds.extend(preds)
    all_labels.extend(labels)


In [None]:
# Print the classification report, with one row per emotion named after col_names.
print(classification_report(all_labels, all_preds, target_names=col_names))

In [None]:
# Stack the per-example rows into two-dimensional arrays (examples x emotions).
all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

# Element-wise agreement between predictions and gold labels.
hits = all_preds == all_labels

# Accuracy per emotion (column-wise mean of the agreement matrix).
emotion_accuracy = hits.mean(axis=0)
for position, emotion in enumerate(col_names):
    print(f"Accuracy for {emotion}: {emotion_accuracy[position]:.2f}")

# Exact match ratio: share of examples whose whole label vector is correct.
exact_match_accuracy = hits.all(axis=1).mean()
print(f"Exact Match Accuracy: {exact_match_accuracy:.2f}")

# Average accuracy over every individual (example, emotion) entry.
sample_accuracy = hits.mean()
print(f"Sample Accuracy (Average Accuracy): {sample_accuracy:.2f}")

In [None]:
# Persist the fine-tuned model and its tokenizer to Google Drive (only when save_model is set).
output_dir = f'/content/drive/MyDrive/Proyectos/semeval/models/{lang}/multilabel/'

if save_model:
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)
