In [1]:
import time
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd
import numpy as np
import random
!pip install datasets
!pip install transformers
from datasets import Dataset, DatasetDict
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import os


# ✅ Set Seed for Reproducibility
seed = 64
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.cuda.manual_seed_all(seed)

# ✅ Define model and tokenizer
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Load dataset from single Excel file
file_path = "/content/jemmy_final_dataset.xlsx"
df = pd.read_excel(file_path)

# ✅ Convert labels to numerical values
label_mapping = {"NoAg": 3, "lifeth": 2, "vndlsm": 1, "hate": 0}
df['label'] = df['Rate2'].map(label_mapping)

# ✅ Split dataset into Train (80%), Validation (10%), and Test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=seed, stratify=df['label'])
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed, stratify=temp_df['label'])

# ✅ Convert Pandas DataFrame to Hugging Face Dataset format
hf_train_dataset = Dataset.from_pandas(train_df)
hf_validation_dataset = Dataset.from_pandas(validation_df)
hf_test_dataset = Dataset.from_pandas(test_df)

# ✅ Tokenization function
def tokenize(batch):
    return tokenizer(batch['Text'], padding="max_length", max_length=512, truncation=True)

# ✅ Create DatasetDict for train & validation
data = DatasetDict({'train': hf_train_dataset, 'validation': hf_validation_dataset})
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_encoded = data.map(tokenize, batched=True, batch_size=16)

# ✅ Define XLM-RoBERTa model with attention and dropout
class XLMRobertaWithAttention(nn.Module):
    def __init__(self, model_name: str, num_labels: int, dropout_rate: float = 0.2):
        super(XLMRobertaWithAttention, self).__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        for param in self.base_model.parameters():
            param.data = param.data.contiguous()
        self.base_model.config.hidden_dropout_prob = dropout_rate
        self.base_model.config.attention_probs_dropout_prob = dropout_rate
        self.hidden_size = self.base_model.config.hidden_size
        self.attention = nn.Linear(self.hidden_size, 1).to(torch.float32)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state.contiguous()
        attn_weights = torch.tanh(self.attention(hidden_states))
        attn_weights = torch.softmax(attn_weights, dim=1)
        weighted_output = torch.sum(attn_weights * hidden_states, dim=1)
        weighted_output = self.dropout(weighted_output.contiguous())
        logits = self.classifier(weighted_output)
        loss = self.loss_fn(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# ✅ Define model
num_labels = 4
dropout_rate = 0.4
model = XLMRobertaWithAttention(model_name, num_labels, dropout_rate).to("cuda")

# ✅ Define function to compute accuracy and F1-score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"Accuracy": accuracy_score(labels, preds), "F1 Score": f1_score(labels, preds, average="weighted")}

# ✅ Define training arguments
batch_size = 8
logging_steps = len(data_encoded["train"]) // batch_size
training_args = TrainingArguments(
    output_dir=f"{model_name}-sntmnt-fntnd-2",
    num_train_epochs=10,
    learning_rate=5e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_Accuracy",
    greater_is_better=True,
    logging_steps=logging_steps,
    log_level="error",
    report_to="none"
)

# ✅ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=data_encoded['train'],
    eval_dataset=data_encoded['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ✅ Train the model with tracking time
start_time = time.time()
trainer.train()
end_time = time.time()

# ✅ Print overall training time
total_time = end_time - start_time
print(f"✅ Total training time: {total_time:.2f} seconds")

# ✅ Save the trained model
model_save_path = "xlm_roberta_trained_model.pth"
torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained("xlm_roberta_trained_tokenizer")
print("✅ Model saved successfully!")

# ✅ Evaluate on validation set
validation_results = trainer.evaluate()
print("Validation Set Results:", validation_results)

# ✅ Encode the test set before evaluation
hf_test_dataset_encoded = hf_test_dataset.map(tokenize, batched=True, batch_size=16)

# ✅ Evaluate on test set
test_results = trainer.predict(hf_test_dataset_encoded)
test_metrics = compute_metrics(test_results)
print("Test Set Results:", test_metrics)

# ✅ Save True Labels and Predicted Probabilities for Ensemble
y_true = test_results.label_ids
y_logits = test_results.predictions
y_prob = F.softmax(torch.tensor(y_logits), dim=1).numpy()

results_df = pd.DataFrame({
    "y_true": y_true,
    "model_1_prob_class_0": y_prob[:, 0],
    "model_1_prob_class_1": y_prob[:, 1],
    "model_1_prob_class_2": y_prob[:, 2],
    "model_1_prob_class_3": y_prob[:, 3]
})

results_df.to_csv("xlm_roberta_attention_predictions.csv", index=False)
print("✅ Predictions saved successfully!")


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/3202 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 score
1,0.9609,0.614461,0.7825,0.782125
2,0.5508,0.591494,0.825,0.824621
3,0.4431,0.685978,0.78,0.777998
4,0.3795,0.679676,0.8175,0.818226
5,0.3398,0.852556,0.8325,0.83181
6,0.3299,0.866133,0.835,0.834387
7,0.2953,0.969345,0.8325,0.831349
8,0.2496,0.92826,0.8325,0.832022
9,0.2371,0.923894,0.8325,0.832672
10,0.246,0.950422,0.8375,0.837672


✅ Total training time: 3742.22 seconds
✅ Model saved successfully!


Validation Set Results: {'eval_loss': 0.9504220485687256, 'eval_Accuracy': 0.8375, 'eval_F1 Score': 0.8376721945008581, 'eval_runtime': 9.7016, 'eval_samples_per_second': 41.23, 'eval_steps_per_second': 5.154, 'epoch': 10.0}


Map:   0%|          | 0/401 [00:00<?, ? examples/s]

Test Set Results: {'Accuracy': 0.8503740648379052, 'F1 Score': 0.8497263822025447}
✅ Predictions saved successfully!
