<a href="https://colab.research.google.com/github/RyuichiSaito1/covid19-twitter-usa-restoring/blob/main/roberta_large_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: attach Google Drive, then authenticate the user.
from google.colab import auth, drive

# Drive is mounted first, under /content/drive.
drive.mount('/content/drive')

# Then run Colab's user authentication.
auth.authenticate_user()

In [None]:
!pip install transformers
!pip install torch
!pip install accelerate
!pip install --upgrade accelerate
# After installing, restart the runtime.

In [None]:
import pandas as pd

# Locations of the tab-separated training / validation files on Drive.
train_file_path = '/content/drive/MyDrive/covid-twitter-usa-normal/data/training_data/gpt-3.5/training_data_2021_shuffle_majority_vote_gpt3.5.tsv'
val_file_path = '/content/drive/MyDrive/covid-twitter-usa-normal/data/training_data/gpt-3.5/validation_data_2021_shuffle_majority_vote_gpt3.5.tsv'

# Both files are read the same way: two columns named text/label,
# every value kept as a Python object (string), parsed by the Python engine.
tsv_options = dict(names=['text', 'label'], dtype='object', engine='python')

# Read the two splits.
train_data = pd.read_table(train_file_path, **tsv_options)
val_data = pd.read_table(val_file_path, **tsv_options)

# Labels arrive as strings; turn them into plain ints for the model.
y_train = list(map(int, train_data['label']))
y_val = list(map(int, val_data['label']))


In [None]:
from transformers import RobertaTokenizer

# Load the pretrained roberta-large tokenizer.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Pull the raw tweet texts out of each split as plain Python lists.
train_texts = train_data['text'].tolist()
val_texts = val_data['text'].tolist()

# Tokenize each split with truncation and padding enabled.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [None]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from field name to an indexable collection with
            one entry per example.
        labels: indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with a 'labels' entry."""
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and integer labels in a TweetDataset.
train_dataset = TweetDataset(encodings=train_encodings, labels=y_train)
val_dataset = TweetDataset(encodings=val_encodings, labels=y_val)

In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from accelerate import Accelerator

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: object supplied by `Trainer`; its `predictions` attribute
            is read as per-class scores (last axis = classes) and its
            `label_ids` attribute as the integer reference labels.

    Returns:
        dict with one key, 'accuracy': the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    predicted_labels = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted_labels == eval_pred.label_ids).mean())}


# Set up the training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/covid-twitter-usa-normal/models/roberta-large/results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/covid-twitter-usa-normal/models/roberta-large/logs',
    logging_steps=10,
    save_strategy='epoch',
)

# Initialize the model with a 3-class classification head
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=3)

# Initialize the trainer.
# Without `compute_metrics`, evaluate() only reports the loss, which says little
# about classification quality; the callback adds accuracy to the results.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the validation set (last expression, so the
# metrics dict is displayed as the cell output)
trainer.evaluate()