### cardiffnlp/twitter-roberta-base-sentiment
https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment?text=I+like+you.+I+love+you



### No. 1

In [14]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import torch
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
from transformers import EarlyStoppingCallback

os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["WANDB_DISABLED"] = "true"

# Load data
train_df = pd.read_csv('Preprocessed_Dataset/train.csv')
val_df = pd.read_csv('Preprocessed_Dataset/dev.csv')
test_df = pd.read_csv('Preprocessed_Dataset/test.csv')

# Convert DataFrame to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Merge datasets
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Tokenizer
model_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# Tokenize
def preprocess_function(examples):
    cleaned_texts = [str(text) if text else "" for text in examples["text"]]
    return tokenizer(cleaned_texts, truncation=True, max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the model
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, ignore_mismatched_sizes=True)  # 2 classes

# Data Collector
data_collator = DataCollatorWithPadding(tokenizer)

# Evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = torch.argmax(torch.tensor(predictions), dim=-1)
    accuracy = (preds == torch.tensor(labels)).float().mean().item()
    return {"accuracy": accuracy}

# parameters
training_args = TrainingArguments(
    output_dir="./results_1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs_1',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# learning rate scheduler
num_training_steps = len(tokenized_datasets["train"]) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_training_steps)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2) 

# Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

# Train
trainer.train()

# Evaluate the model
print("Validation metrics:")
val_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print(val_metrics)

print("Test metrics:")
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(test_metrics)

# Save
model.save_pretrained('./trained_model_1')
tokenizer.save_pretrained('./trained_model_1')

# Get predictions for the test set using the trained model
def get_predictions_and_logits(trainer, dataset):
    predictions, labels, metrics = trainer.predict(dataset)
    
    # Convert logits to predicted labels (0 or 1)
    predicted_labels = np.argmax(predictions, axis=-1)

    return predictions, predicted_labels, labels

# Get predictions, logits, and true labels for the test dataset
predictions, predicted_labels, true_labels = get_predictions_and_logits(trainer, tokenized_datasets["test"])

# Convert logits to pandas DataFrame for easy manipulation
logits_df = pd.DataFrame(predictions, columns=["logit_" + str(i) for i in range(predictions.shape[1])])

# Create a DataFrame with text, true labels, and predicted labels
results_df = pd.DataFrame({
    'text': tokenized_datasets["test"]["text"],
    'label': true_labels,
    'label_pred': predicted_labels
})

# Concatenate logits DataFrame with the results_df
final_df = pd.concat([results_df, logits_df], axis=1)
final_df.to_csv("test_predictions_with_logits_1.csv", index=False)

# Filter out misclassified samples
misclassified_df = results_df[results_df['label'] != results_df['label_pred']]
misclassified_df.to_csv('misclassified_samples_1.csv', index=False)

# Count per label
misclassified_counts = misclassified_df['label'].value_counts()

# Display distribution
print(f"Misclassification distribution:\n{misclassified_counts}")

loading file vocab.json from cache at /Users/qiguo/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/vocab.json
loading file merges.txt from cache at /Users/qiguo/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /Users/qiguo/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/special_tokens_map.json
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /Users/qiguo/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/config.json
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
  "architectu

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /Users/qiguo/.cache/huggingface/hub/models--cardiffnlp--twitter-roberta-base-sentiment/snapshots/daefdd1f6ae931839bce4d0f3db0a1a4265cd50f/config.json
Model config RobertaConfig {
  "_name_or_path": "tweeteval_new/roberta-base-rt-sentiment/",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin from cach

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

### No. 3

In [13]:
# Load data
train_df = pd.read_csv('LT-EDI-ACL2022_Dataset/train.csv')
dev_df = pd.read_csv('LT-EDI-ACL2022_Dataset/dev.tsv', sep='\t')
test_df = pd.read_csv('LT-EDI-ACL2022_Dataset/test.csv')

# Drop unnecessary columns
train_df.drop("pid", axis=1, inplace=True)
dev_df.drop("PID", axis=1, inplace=True)
test_df.drop("pid", axis=1, inplace=True)

# Rename columns for dev_df
dev_df = dev_df.rename(columns={'Text data': 'text', 'Label': 'labels'})

# Map label names to integers
label_mapping = {'severe': 0, 'moderate': 1, 'not depression': 2}
dev_df['labels'] = dev_df['labels'].map(label_mapping)

# Convert DataFrame to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

# Merge datasets into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Tokenizer: Load from last training
model_checkpoint = './trained_model_1'
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# Tokenize the dataset
def preprocess_function(examples):
    cleaned_texts = [str(text) if text else "" for text in examples["text"]]
    return tokenizer(cleaned_texts, truncation=True, max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the base model (Roberta base) for sequence classification with 3 classes
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3, ignore_mismatched_sizes=True)

# Data Collator: Use padding
data_collator = DataCollatorWithPadding(tokenizer)

# Define evaluation metrics (accuracy)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = torch.argmax(torch.tensor(predictions), dim=-1)
    accuracy = (preds == torch.tensor(labels)).float().mean().item()
    return {"accuracy": accuracy}

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results_3",  # Updated output directory
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs_3',  # Updated logging directory
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Scheduler for learning rate
num_training_steps = len(tokenized_datasets["train"]) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

# Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

# Train the model
trainer.train()

# Evaluate the model
print("Validation metrics:")
val_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print(val_metrics)

print("Test metrics:")
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(test_metrics)

# Save the trained model and tokenizer
model.save_pretrained('./trained_model_3')
tokenizer.save_pretrained('./trained_model_3')

# Get predictions for the test set using the trained model
def get_predictions_and_logits(trainer, dataset):
    predictions, labels, metrics = trainer.predict(dataset)
    
    # Convert logits to predicted labels 
    predicted_labels = np.argmax(predictions, axis=-1)

    return predictions, predicted_labels, labels

# Get predictions, logits, and true labels for the test dataset
predictions, predicted_labels, true_labels = get_predictions_and_logits(trainer, tokenized_datasets["test"])

# Convert logits to pandas DataFrame for easy manipulation
logits_df = pd.DataFrame(predictions, columns=["logit_" + str(i) for i in range(predictions.shape[1])])

# Create a DataFrame with text, true labels, and predicted labels
results_df = pd.DataFrame({
    'text': tokenized_datasets["test"]["text"],
    'label': true_labels,
    'label_pred': predicted_labels
})

# Concatenate logits DataFrame with the results_df
final_df = pd.concat([results_df, logits_df], axis=1)
final_df.to_csv("test_predictions_with_logits_3.csv", index=False)

# Filter out misclassified samples
misclassified_df = results_df[results_df['label'] != results_df['label_pred']]
misclassified_df.to_csv('misclassified_samples_3.csv', index=False)

# Count per label
misclassified_counts = misclassified_df['label'].value_counts()

# Display distribution of misclassified labels
print(f"Misclassification distribution:\n{misclassified_counts}")

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

loading configuration file ./trained_model_1/config.json
Model config RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_c

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 