In [1]:
import os
import sys
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Import data

sheet_url = "https://docs.google.com/spreadsheets/d/1_LWErSQCdNm2O1r7ZxszgblbXNbrIDu3eBBDvkaJ_C0/edit?gid=1349519076"
sheet_id = sheet_url.split("/d/")[1].split("/")[0]
csv_export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid=1349519076"
df = pd.read_csv(csv_export_url)
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
# remove user name
df = pd.DataFrame(df)

def content_reformat(text):
  tokens = text.split()
  if tokens and tokens[0].startswith('@'):
      tokens = tokens[1:]
  return ' '.join(tokens)

df['content'] = df['content'].apply(content_reformat)
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,i know i was listenin to bad habit earlier and...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,We want to trade with someone who has Houston ...


In [4]:
df_convert = df.copy()
# categorical labels
df_convert['sentiment_label'] = df_convert['sentiment'].astype('category').cat.codes
# map to original content for refernce
map_label = dict(enumerate(df_convert['sentiment'].astype('category').cat.categories))
df_convert['sentiment'] = df_convert['sentiment_label'].map(map_label)
df_convert.head()

Unnamed: 0,tweet_id,sentiment,content,sentiment_label
0,1956967341,empty,i know i was listenin to bad habit earlier and...,2
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on...,10
2,1956967696,sadness,Funeral ceremony...gloomy friday...,10
3,1956967789,enthusiasm,wants to hang out with friends SOON!,3
4,1956968416,neutral,We want to trade with someone who has Houston ...,8


In [5]:
import torch, math, random, itertools
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

In [12]:
seed = 21
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [13]:
#  Input: list, Output:
sentence = df_convert['content'].tolist()
label = df_convert['sentiment_label'].tolist()
len(sentence), len(label)
# sentence[0]
sentence[2], label[2]

('Funeral ceremony...gloomy friday...', 10)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
train_input, test_input, train_labels, test_labels = train_test_split(
    sentence, label, test_size = 0.2, random_state = 21)

In [17]:
# tokenization
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenizer_train = tokenizer(
    train_input,
    padding = True,
    truncation = True,
    return_tensors = 'pt',
    max_length = 128
)
tokenizer_test = tokenizer(
    test_input,
    padding = True,
    truncation = True,
    return_tensors = 'pt',
    max_length = 128
)

In [20]:
from datasets import Dataset

In [21]:
train_labels_tensor = torch.tensor(train_labels)
test_labels_tensor = torch.tensor(test_labels)

In [23]:
# roBERTa dataset
roberta_train_ds = Dataset.from_dict({
    'input_ids': tokenizer_train['input_ids'],
    'attention_mask': tokenizer_train['attention_mask'],
    'labels': train_labels
})

roberta_test_ds = Dataset.from_dict({
    'input_ids': tokenizer_test['input_ids'],
    'attention_mask': tokenizer_test['attention_mask'],
    'labels': test_labels
})
# stick with paper parameters 8, random seed 21, epoch 10, learning rate, 5e-5
train_loader = DataLoader(roberta_train_ds, batch_size = 8, shuffle = True)
test_loader = DataLoader(roberta_test_ds, batch_size = 8, shuffle = True)

In [24]:
batch = next(iter(train_loader))
input_ids_batch = batch['input_ids']
attention_mask_batch = batch['attention_mask']
labels_batch = batch['labels']

# Convert token IDs back to text
first_sentence_tokens = tokenizer.convert_ids_to_tokens(input_ids_batch[0])
first_sentence_text = tokenizer.decode(input_ids_batch[0], skip_special_tokens=True)

print("Token IDs:", input_ids_batch[0])
print("Tokens:", first_sentence_tokens)
print("Decoded text:", first_sentence_text)
print("Label:", labels_batch[0].item(), "->", map_label[labels_batch[0].item()])


Token IDs: tensor([0, 0, 0, 0, 0, 0, 0, 0])
Tokens: ['<s>', '<s>', '<s>', '<s>', '<s>', '<s>', '<s>', '<s>']
Decoded text: 
Label: 8 -> neutral


In [25]:
num_labels = len(map_label)
roberta_model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels = num_labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir = './results',
    #overwrite_output_dir = True,
    learning_rate = 5e-5,
    eval_strategy="epoch",
    seed = seed,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    logging_dir = './logs',
    report_to = []
)

trainer = Trainer(
    model = roberta_model,
    args = training_args,
    train_dataset = roberta_train_ds,
    eval_dataset = roberta_test_ds,
    tokenizer = tokenizer
)

train_output = trainer.train()
# Print training loss
print(f"Training loss: {train_output.training_loss}")
# Also evaluate to see eval loss after training
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
