In [None]:
from datasets import load_dataset


In [None]:
# Load the tweet topic dataset; later cells read its "train_all" and
# "test_2021" splits.
dataset = load_dataset(path="cardiffnlp/tweet_topic_single")

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer , AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Load the pre-trained BERTweet encoder and its tokenizer.
# `model_name` is the single source of truth for the checkpoint id; it
# previously named the "base" checkpoint while the "large" one was loaded.
model_name = "vinai/bertweet-large"
model = AutoModel.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Only the first tweet of the "train_all" split is tokenized here, so the
# printed encoding describes a single example.
train_texts = dataset["train_all"]["text"][0]

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
print(train_encodings)

In [None]:
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer , AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Load the pre-trained BERTweet checkpoint with a 6-label classification head,
# plus the tokenizer that belongs to the same checkpoint.
model_name = "vinai/bertweet-large"
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=6)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pull the raw texts and integer labels for the training and test splits.
train_texts = dataset["train_all"]["text"]  # List of training tweet texts
train_labels = dataset["train_all"]["label"]  # List of training labels (topic classes)
test_texts = dataset["test_2021"]["text"]  # List of testing tweet texts
test_labels = dataset["test_2021"]["label"]  # List of testing labels (topic classes)

# Tokenize and encode the text data (each split is padded to its own longest sequence)
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Create PyTorch datasets of (input_ids, attention_mask, label) triples
train_dataset = TensorDataset(torch.tensor(train_encodings["input_ids"]),
                              torch.tensor(train_encodings["attention_mask"]),
                              torch.tensor(train_labels))
test_dataset = TensorDataset(torch.tensor(test_encodings["input_ids"]),
                             torch.tensor(test_encodings["attention_mask"]),
                             torch.tensor(test_labels))

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Set up optimizer and loss function
# NOTE(review): `transformers.AdamW` is deprecated in recent transformers
# releases; consider `torch.optim.AdamW` (its defaults differ) - confirm
# against the installed version before switching.
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 8  # Number of training epochs

model.to(device)

for epoch in range(num_epochs):
    # ---- Training pass ----
    # Switch back to training mode at the start of EVERY epoch: the evaluation
    # pass at the end of the loop body calls model.eval(), and without this
    # call all epochs after the first would train with dropout disabled.
    model.train()

    epoch_loss = 0.0
    train_correct = 0
    train_seen = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Labels are deliberately not passed to the model: the loss is computed
        # explicitly with `loss_fn` below, so a model-side loss would be
        # redundant work.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        predicted_labels = logits.argmax(dim=1)
        train_correct += (predicted_labels == labels).sum().item()
        train_seen += labels.size(0)

    # Loss is the mean of the per-batch losses; accuracy is computed over all
    # samples seen so a smaller final batch does not skew it.
    epoch_loss /= len(train_loader)
    epoch_accuracy = train_correct / train_seen if train_seen else 0.0

    print(f"Epoch {epoch + 1} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")

    # ---- Evaluation on the test set ----
    model.eval()
    all_true_labels = []
    all_predicted_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted_labels = outputs.logits.argmax(dim=1)

            all_predicted_labels.extend(predicted_labels.cpu().tolist())
            all_true_labels.extend(labels.tolist())

    # Metrics are computed once over the whole split. Averaging per-batch
    # scores (as before) is not equivalent for macro F1, because a batch may
    # not contain every class.
    test_accuracy = accuracy_score(all_true_labels, all_predicted_labels)
    test_f1_score_macro = f1_score(all_true_labels, all_predicted_labels, average='macro')
    test_f1_score_micro = f1_score(all_true_labels, all_predicted_labels, average='micro')

    # Each line prints its own metric (the F1 lines used to print the accuracy).
    print(f"test_2021 Accuracy: {test_accuracy:.4f}")
    print(f"test_2021 F1 Score (macro): {test_f1_score_macro:.4f}")
    print(f"test_2021 F1 Score (micro): {test_f1_score_micro:.4f}")


Epoch 1 - Loss: 0.7067 - Accuracy: 0.7553
test_2020 Accuracy: 0.8993
test_2021 Accuracy: 0.8786
Epoch 2 - Loss: 0.2709 - Accuracy: 0.9083
test_2020 Accuracy: 0.8984
test_2021 Accuracy: 0.8875
Epoch 3 - Loss: 0.1194 - Accuracy: 0.9629
test_2020 Accuracy: 0.8837
test_2021 Accuracy: 0.8809
Epoch 4 - Loss: 0.0500 - Accuracy: 0.9859
test_2020 Accuracy: 0.8967
test_2021 Accuracy: 0.8927
Epoch 5 - Loss: 0.0190 - Accuracy: 0.9959
test_2020 Accuracy: 0.8941
test_2021 Accuracy: 0.8927
Epoch 6 - Loss: 0.0065 - Accuracy: 0.9993
test_2020 Accuracy: 0.9019
test_2021 Accuracy: 0.8968
Epoch 7 - Loss: 0.0026 - Accuracy: 0.9995
test_2020 Accuracy: 0.8932
test_2021 Accuracy: 0.8980
Epoch 8 - Loss: 0.0046 - Accuracy: 0.9995
test_2020 Accuracy: 0.8958
test_2021 Accuracy: 0.8998
Epoch 9 - Loss: 0.0144 - Accuracy: 0.9959
test_2020 Accuracy: 0.8845
test_2021 Accuracy: 0.8900
Epoch 10 - Loss: 0.0376 - Accuracy: 0.9906
test_2020 Accuracy: 0.9097
test_2021 Accuracy: 0.8905