My code references the following sources:

Kaggle. (2021). NLP Getting Started - Disaster Tweets. Retrieved April 7, 2023, from https://www.kaggle.com/c/nlp-getting-started

Twitter US Airline Sentiment Dataset, Accessed on Kaggle, https://www.kaggle.com/crowdflower/twitter-airline-sentiment

Bentrevett. (2021). pytorch-sentiment-analysis [Computer software]. GitHub. https://github.com/bentrevett/pytorch-sentiment-analysis
 
Citation for the code I used: 

Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., & Stoyanov, V. (2019). RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692.

Hugging Face. (n.d.). RoBERTa. https://huggingface.co/roberta-base



In [1]:
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 KB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.18.0


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [3]:
import re
import string
from typing import List
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
from accelerate import Accelerator
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch

In [4]:
def clean_tweet(data: List[str]):
    """Normalise raw tweets into lowercase, letters-only text.

    Args:
        data: tweets to clean, one string per element.

    Returns:
        A list with the cleaned tweets in their original order. Tweets that
        are empty after cleaning are dropped, so the result can be shorter
        than ``data``.
    """
    # Built once: deletes every character listed in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Spans that are deleted outright, applied in this order.
    removal_patterns = (
        r'\&\w*;',                    # HTML special entities (e.g. &amp;)
        r'https?://[A-Za-z0-9./]+',   # URLs
        r'RT @[\w_]+:',               # retweet prefixes
        r'@[\w_]+',                   # twitter handles (@xxx)
    )

    cleaned = []
    for raw in data:
        text = raw
        for pattern in removal_patterns:
            text = re.sub(pattern, '', text)

        text = text.translate(strip_punctuation)

        # Collapse runs of spaces, then lowercase.
        text = re.sub(r' +', ' ', text).lower()

        # Everything that is not a letter becomes a space. '#' is whitelisted here,
        # although the punctuation pass above has already removed it.
        text = re.sub(r'[^a-zA-Z#]', ' ', text)

        # Collapse the spaces introduced above and trim both ends.
        text = re.sub(r' +', ' ', text).strip()

        if text:
            cleaned.append(text)
    return cleaned

In [5]:
class LocalDataset(Dataset):
    """Dataset pairing per-example tokenizer encodings with optional labels.

    Args:
        encodings: indexable collection; each element must expose the keys
            'input_ids' and 'attention_mask'.
        labels: optional indexable collection aligned with ``encodings``.
            When omitted, every item carries ``"labels": None``.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is driven by the encodings, not by the labels.
        return len(self.encodings)

    def __getitem__(self, idx):
        encoding = self.encodings[idx]
        label = None if self.labels is None else self.labels[idx]
        return {
            "input_ids": encoding['input_ids'],
            "attention_mask": encoding['attention_mask'],
            "labels": label,
        }


In [6]:
def collate_fn(batch, pad_token_id=1):
    """Pad a list of dataset items to a common length and stack them into tensors.

    Each item's 'input_ids' and 'attention_mask' hold the token row at index 0
    (a one-row nested list); that row is what gets padded. The items in
    ``batch`` are left unmodified.

    Args:
        batch: list of dicts with keys 'input_ids', 'attention_mask' and 'labels'.
        pad_token_id: id used to right-pad 'input_ids'. Defaults to 1, the
            value previously hard-coded here.

    Returns:
        A tuple ``(inputs, labels)``. ``inputs`` is a dict of two long tensors,
        'input_ids' and 'attention_mask', each with one row per item and as
        many columns as the longest row. ``labels`` is the stacked label
        tensor, or None when the first item's label is None.
    """
    # Take the single row out of each nested encoding.
    id_rows = [item['input_ids'][0] for item in batch]
    mask_rows = [item['attention_mask'][0] for item in batch]

    # calculate the max length
    max_len = max(len(row) for row in id_rows)

    # padding: build new lists instead of overwriting keys on the caller's items
    padded_ids = [list(row) + [pad_token_id] * (max_len - len(row)) for row in id_rows]
    padded_masks = [list(row) + [0] * (max_len - len(row)) for row in mask_rows]

    input_ids = torch.tensor(padded_ids, dtype=torch.long)
    attention_mask = torch.tensor(padded_masks, dtype=torch.long)

    if batch[0]['labels'] is not None:
        labels = torch.stack([item['labels'] for item in batch])
    else:
        labels = None

    return (
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        },
        labels
    )


In [7]:
# A single Accelerator instance is shared by the training code further down.
accelerater = Accelerator()
visible_gpus = torch.cuda.device_count()
print(f"device count: {visible_gpus}")

device count: 1


In [8]:
# Run configuration
max_len = 512       # token limit handed to the tokenizer; longer inputs are truncated
batch_size = 32     # examples per DataLoader batch
epochs = 3          # passes over the training split
lr = 2e-5           # AdamW learning rate
skip_train = False  # True: load the saved checkpoint instead of fine-tuning
seed = 42           # seed for PyTorch's random number generators
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch up front so the randomly initialised classifier head and the
# shuffling order of the training loader start from the same RNG state on every run.
torch.manual_seed(seed)

In [9]:
# Load data
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [10]:
# clean data
df['Text'] = df['Text'].apply(lambda x: clean_tweet([x]))
df_test['Text'] = df_test['Text'].apply(lambda x: clean_tweet([x]))

In [11]:
# Split data
train, val = train_test_split(df, test_size=0.2, random_state=42)

Citation for the code I used: 

Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., & Stoyanov, V. (2019). RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv preprint arXiv:1907.11692.

Hugging Face. (n.d.). RoBERTa. https://huggingface.co/roberta-base


In [12]:
# Tokenize data
model_path = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_path)
print(f"pad_token_id: {tokenizer.pad_token_id}")
# print(val['Text'].values.tolist())
train_encodings = train.apply(lambda x: tokenizer(x['Text'], truncation=True, max_length=max_len),
                              axis=1).tolist()
val_encodings = val.apply(lambda x: tokenizer(x['Text'], truncation=True, max_length=max_len),
                          axis=1).tolist()
test_encodings = df_test.apply(lambda x: tokenizer(x['Text'], truncation=True, max_length=max_len),
                               axis=1).tolist()

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pad_token_id: 1


In [13]:
# covert to tensor
train_labels = torch.tensor(train['Sentiment'].tolist())
val_labels = torch.tensor(val['Sentiment'].tolist())

In [14]:
# Create dataset
train_dataset = LocalDataset(train_encodings, train_labels)
val_dataset = LocalDataset(val_encodings, val_labels)
test_dataset = LocalDataset(test_encodings)

In [15]:
# Create dataloader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [16]:
class Classifier(nn.Module):
    """Pretrained encoder followed by one linear classification layer.

    Args:
        model_name: identifier forwarded to ``AutoModel.from_pretrained``.
        num_labels: number of output classes. Defaults to 2, the value that
            was previously hard-coded.
    """

    def __init__(self, model_name, num_labels=2):
        super(Classifier, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and the linear head.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: optional mask passed to the encoder.
            labels: optional target class indices.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, where ``loss`` is the
            cross-entropy between logits and labels. Without labels, the
            softmax of the logits over dim 1 is returned instead; note this is
            probabilities, not raw logits.
        """
        outputs = self.model(input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.pooler_output)
        if labels is None:
            return torch.softmax(logits, dim=1)

        # Take the class count from the head itself so the reshape always matches the
        # width of the logits, whatever num_labels the model was built with.
        num_labels = self.classifier.out_features
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return loss, logits


model = Classifier(model_path)

# Keep only the text before the first "/" of the hub id so it can be embedded in output
# file names without introducing a path separator.
# NOTE(review): this rebinds `model_path`; re-running this cell without first re-running
# the cell that defines `model_path` hands the shortened name to Classifier above.
model_path = model_path.split('/')[0]
checkpoint_path = f'model_{model_path}.pt'

if not skip_train:
    # Train model
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    best_val_acc = 0
    checkpoint_saved = False
    model, optimizer, train_dataloader, val_dataloader = accelerater.prepare(
        model, optimizer, train_dataloader, val_dataloader
    )
    for epoch in range(epochs):
        model.train()
        for batch in tqdm(train_dataloader, total=len(train_dataloader), desc=f'Epoch {epoch}'):
            optimizer.zero_grad()
            # No explicit .to(device): the loaders went through accelerater.prepare().
            inputs, labels = batch
            loss, _ = model(inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
            accelerater.backward(loss)
            optimizer.step()

        model.eval()
        val_preds = []
        # Named val_targets so the `val_labels` tensor defined earlier is not rebound to a list.
        val_targets = []
        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                inputs, labels = batch
                _, logits = model(inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
                val_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
                val_targets.extend(labels.cpu().tolist())
        val_acc = accuracy_score(val_targets, val_preds)
        print(f'Epoch {epoch}, val_acc: {val_acc:.5f}')
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # Save the unwrapped model's weights so the checkpoint keys match a plain
            # Classifier, which is what the skip_train branch loads into.
            torch.save(accelerater.unwrap_model(model).state_dict(), checkpoint_path)
            checkpoint_saved = True

    # Unwrap once, after the last epoch, so every epoch trains the prepared model.
    model = accelerater.unwrap_model(model)
    # Predict with the best-scoring epoch rather than whatever the last epoch produced.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# Predict
model.eval()
model.to(device)
print(f"model device: {next(model.parameters()).device}")
test_preds = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        # The test loader was not passed through prepare(), so move tensors by hand.
        input_ids = batch[0]['input_ids'].to(device)
        attention_mask = batch[0]['attention_mask'].to(device)

        # Without labels the model returns class probabilities; argmax picks the class.
        outputs = model(input_ids, attention_mask=attention_mask)
        test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy().tolist())


Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Epoch 0, val_acc: 0.92999


Epoch 1: 100%|██████████| 625/625 [07:03<00:00,  1.48it/s]
100%|██████████| 157/157 [00:34<00:00,  4.60it/s]


Epoch 1, val_acc: 0.92919


Epoch 2: 100%|██████████| 625/625 [07:02<00:00,  1.48it/s]
100%|██████████| 157/157 [00:34<00:00,  4.59it/s]


Epoch 2, val_acc: 0.93179
model device: cuda:0


100%|██████████| 809/809 [02:56<00:00,  4.59it/s]


In [17]:
# Save result
df_test['Sentiment'] = test_preds
df_test[['Id', 'Sentiment']].to_csv(f'submission_{model_path}.csv', index=False)