### FacebookAI/roberta-base
https://huggingface.co/FacebookAI/roberta-base

### No. 6

In [8]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import torch
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
from transformers import EarlyStoppingCallback

os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["WANDB_DISABLED"] = "true"

# Load data
train_df = pd.read_csv('LT-EDI-ACL2022_Dataset/train.csv')
dev_df = pd.read_csv('LT-EDI-ACL2022_Dataset/dev.tsv', sep='\t')
test_df = pd.read_csv('LT-EDI-ACL2022_Dataset/test.csv')

# Drop unnecessary columns
train_df.drop("pid", axis=1, inplace=True)
dev_df.drop("PID", axis=1, inplace=True)
test_df.drop("pid", axis=1, inplace=True)

# Rename columns for dev_df
dev_df = dev_df.rename(columns={'Text data': 'text', 'Label': 'labels'})

# Map label names to integers
label_mapping = {'severe': 0, 'moderate': 1, 'not depression': 2}
dev_df['labels'] = dev_df['labels'].map(label_mapping)

# Convert DataFrame to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

# Merge datasets into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Tokenizer: Load from FacebookAI/roberta-base
model_checkpoint = "roberta-base"  # Changed to FacebookAI's base model
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# Tokenize the dataset
def preprocess_function(examples):
    cleaned_texts = [str(text) if text else "" for text in examples["text"]]
    return tokenizer(cleaned_texts, truncation=True, max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the base model (Roberta base) for sequence classification with 3 classes
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3, ignore_mismatched_sizes=True)

# Data Collator: Use padding
data_collator = DataCollatorWithPadding(tokenizer)

# Define evaluation metrics (accuracy)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = torch.argmax(torch.tensor(predictions), dim=-1)
    accuracy = (preds == torch.tensor(labels)).float().mean().item()
    return {"accuracy": accuracy}

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results_6",  # Updated output directory
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir='./logs_6',  # Updated logging directory
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Scheduler for learning rate
num_training_steps = len(tokenized_datasets["train"]) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2) 

# Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)


# Train the model
trainer.train()

# Evaluate the model
print("Validation metrics:")
val_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print(val_metrics)

print("Test metrics:")
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(test_metrics)

# Save the trained model and tokenizer
model.save_pretrained('./trained_model_6')
tokenizer.save_pretrained('./trained_model_6')

# Get predictions for the test set using the trained model
def get_predictions_and_logits(trainer, dataset):
    predictions, labels, metrics = trainer.predict(dataset)
    
    # Convert logits to predicted labels 
    predicted_labels = np.argmax(predictions, axis=-1)

    return predictions, predicted_labels, labels

# Get predictions, logits, and true labels for the test dataset
predictions, predicted_labels, true_labels = get_predictions_and_logits(trainer, tokenized_datasets["test"])

# Convert logits to pandas DataFrame for easy manipulation
logits_df = pd.DataFrame(predictions, columns=["logit_" + str(i) for i in range(predictions.shape[1])])

# Create a DataFrame with text, true labels, and predicted labels
results_df = pd.DataFrame({
    'text': tokenized_datasets["test"]["text"],
    'label': true_labels,
    'label_pred': predicted_labels
})

# Concatenate logits DataFrame with the results_df
final_df = pd.concat([results_df, logits_df], axis=1)
final_df.to_csv("test_predictions_with_logits_6.csv", index=False)

# Filter out misclassified samples
misclassified_df = results_df[results_df['label'] != results_df['label_pred']]
misclassified_df.to_csv('misclassified_samples_6.csv', index=False)

# Count per label
misclassified_counts = misclassified_df['label'].value_counts()

# Display distribution of misclassified labels
print(f"Misclassification distribution:\n{misclassified_counts}")

loading file vocab.json from cache at /Users/qiguo/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/vocab.json
loading file merges.txt from cache at /Users/qiguo/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/qiguo/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/tokenizer_config.json
loading configuration file config.json from cache at /Users/qiguo/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /Users/qiguo/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.b

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7783,0.71431,0.669706
2,0.6783,0.612722,0.731317
3,0.6651,0.564246,0.771575
4,0.4862,0.461222,0.848754
5,0.4331,0.503167,0.836966
6,0.3366,0.596253,0.860543
7,0.0794,0.615508,0.877224
8,0.093,0.749697,0.88968
9,0.1143,0.788997,0.885676
10,0.0706,0.81583,0.890347


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4496
  Batch size = 16
Saving model checkpoint to ./results_6/checkpoint-376
Configuration saved in ./results_6/checkpoint-376/config.json
Model weights saved in ./results_6/checkpoint-376/pytorch_model.bin
tokenizer config file saved in ./results_6/checkpoint-376/tokenizer_config.json
Special tokens file saved in ./results_6/checkpoint-376/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4496
  Ba

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4496
  Batch size = 16
Saving model checkpoint to ./results_6/checkpoint-4888
Configuration saved in ./results_6/checkpoint-4888/config.json
Model weights saved in ./results_6/checkpoint-4888/pytorch_model.bin
tokenizer config file saved in ./results_6/checkpoint-4888/tokenizer_config.json
Special tokens file saved in ./results_6/checkpoint-4888/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4496

Validation metrics:


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3245
  Batch size = 16


{'eval_loss': 0.9281078577041626, 'eval_accuracy': 0.8947953581809998, 'eval_runtime': 153.9566, 'eval_samples_per_second': 29.203, 'eval_steps_per_second': 1.825, 'epoch': 15.0}
Test metrics:


Configuration saved in ./trained_model_6/config.json


{'eval_loss': 3.3944590091705322, 'eval_accuracy': 0.6027734875679016, 'eval_runtime': 110.6782, 'eval_samples_per_second': 29.319, 'eval_steps_per_second': 1.834, 'epoch': 15.0}


Model weights saved in ./trained_model_6/pytorch_model.bin
tokenizer config file saved in ./trained_model_6/tokenizer_config.json
Special tokens file saved in ./trained_model_6/special_tokens_map.json
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3245
  Batch size = 16


AttributeError: 'Index' object has no attribute '_format_native_types'

In [23]:
print(final_df.columns)

Index(['text', 'label', 'label_pred', 'logit_0', 'logit_1', 'logit_2'], dtype='object')


In [24]:
print(final_df.head())

                                                text label label_pred   
0  Im scared : This is it. I lie to myself every ...     1          1  \
1  New to this but just wanted to vent : I just f...     1          1   
2  I’m sad : It’s kinda always been an issue. I w...     1          2   
3  Lonely but not alone. : All of my immediately ...     1          1   
4  This year has been trash. : I dont know why I’...     1          1   

      logit_0      logit_1     logit_2  
0  -4.2713637    6.9045553  -3.2796314  
1  -4.7947664    6.3023434  -2.1778367  
2   -5.588239  -0.31869474    5.529555  
3  -4.2503605     6.968496  -3.3688626  
4  -3.9920104    6.9639244  -3.6148562  


In [22]:
print(final_df.isnull().sum())

text          0
label         0
label_pred    0
logit_0       0
logit_1       0
logit_2       0
dtype: int64


In [18]:
new_df = pd.DataFrame(final_df.copy())
new_df.to_csv("test_predictions_with_logits_6.csv", index=False)

AttributeError: 'Index' object has no attribute '_format_native_types'

In [26]:
# Filter out misclassified samples
misclassified_df = results_df[results_df['label'] != results_df['label_pred']]
# misclassified_df.to_csv('misclassified_samples_6.csv', index=False)

# Count per label
misclassified_counts = misclassified_df['label'].value_counts()

# Display distribution of misclassified labels
print(f"Misclassification distribution:\n{misclassified_counts}")

Misclassification distribution:
label
1    753
2    422
0    114
Name: count, dtype: int64


### No. 8

In [27]:
# Load data
train_df = pd.read_csv('Preprocessed_Dataset/train.csv')
val_df = pd.read_csv('Preprocessed_Dataset/dev.csv')
test_df = pd.read_csv('Preprocessed_Dataset/test.csv')

# Convert DataFrame to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Merge datasets
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Tokenizer: Load from last training
model_checkpoint = './trained_model_6'
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# Tokenize
def preprocess_function(examples):
    cleaned_texts = [str(text) if text else "" for text in examples["text"]]
    return tokenizer(cleaned_texts, truncation=True, max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the model
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, ignore_mismatched_sizes=True)  # 2 classes

# Data Collector
data_collator = DataCollatorWithPadding(tokenizer)

# Evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = torch.argmax(torch.tensor(predictions), dim=-1)
    accuracy = (preds == torch.tensor(labels)).float().mean().item()
    return {"accuracy": accuracy}

# parameters
training_args = TrainingArguments(
    output_dir="./results_8",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir='./logs_8',
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# learning rate scheduler
num_training_steps = len(tokenized_datasets["train"]) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_training_steps)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2) 

# Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

# Train
trainer.train()

# Evaluate the model
print("Validation metrics:")
val_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print(val_metrics)

print("Test metrics:")
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(test_metrics)

# Save
model.save_pretrained('./trained_model_8')
tokenizer.save_pretrained('./trained_model_8')

# Get predictions for the test set using the trained model
def get_predictions_and_logits(trainer, dataset):
    predictions, labels, metrics = trainer.predict(dataset)
    
    # Convert logits to predicted labels (0 or 1)
    predicted_labels = np.argmax(predictions, axis=-1)

    return predictions, predicted_labels, labels

# Get predictions, logits, and true labels for the test dataset
predictions, predicted_labels, true_labels = get_predictions_and_logits(trainer, tokenized_datasets["test"])

# Convert logits to pandas DataFrame for easy manipulation
logits_df_8 = pd.DataFrame(predictions, columns=["logit_" + str(i) for i in range(predictions.shape[1])])

# Create a DataFrame with text, true labels, and predicted labels
results_df_8 = pd.DataFrame({
    'text': tokenized_datasets["test"]["text"],
    'label': true_labels,
    'label_pred': predicted_labels
})

# Concatenate logits DataFrame with the results_df
final_df_8 = pd.concat([results_df_8, logits_df_8], axis=1)
# final_df_8.to_csv("test_predictions_with_logits_8.csv", index=False)

# Filter out misclassified samples
misclassified_df_8 = results_df_8[results_df_8['label'] != results_df_8['label_pred']]
# misclassified_df_8.to_csv('misclassified_samples_8.csv', index=False)

# Count per label
misclassified_counts_8 = misclassified_df_8['label'].value_counts()

# Display distribution
print(f"Misclassification distribution:\n{misclassified_counts_8}")

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file ./trained_model_6/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ./trained_model_6/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForSequenceClassification.

Some weights of RobertaForSeque

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2708,0.136914,0.958603
2,0.1879,0.127784,0.965071
3,0.0941,0.111236,0.972833
4,0.0294,0.135969,0.976714
5,0.0031,0.203774,0.967658
6,0.0375,0.197683,0.972833


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 773
  Batch size = 16
Saving model checkpoint to ./results_8/checkpoint-387
Configuration saved in ./results_8/checkpoint-387/config.json
Model weights saved in ./results_8/checkpoint-387/pytorch_model.bin
tokenizer config file saved in ./results_8/checkpoint-387/tokenizer_config.json
Special tokens file saved in ./results_8/checkpoint-387/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 773
  Batc

Validation metrics:


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 774
  Batch size = 16


{'eval_loss': 0.1359693557024002, 'eval_accuracy': 0.9767140746116638, 'eval_runtime': 24.9891, 'eval_samples_per_second': 30.933, 'eval_steps_per_second': 1.961, 'epoch': 6.0}
Test metrics:


Configuration saved in ./trained_model_8/config.json


{'eval_loss': 0.17540614306926727, 'eval_accuracy': 0.9689922332763672, 'eval_runtime': 24.8513, 'eval_samples_per_second': 31.145, 'eval_steps_per_second': 1.972, 'epoch': 6.0}


Model weights saved in ./trained_model_8/pytorch_model.bin
tokenizer config file saved in ./trained_model_8/tokenizer_config.json
Special tokens file saved in ./trained_model_8/special_tokens_map.json
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 774
  Batch size = 16


Misclassification distribution:
label
1    753
2    422
0    114
Name: count, dtype: int64


In [28]:
# Display distribution
print(f"Misclassification distribution:\n{misclassified_counts_8}")

Misclassification distribution:
label
1    15
0     9
Name: count, dtype: int64
