In [4]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [5]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


In [6]:

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
from datasets import load_dataset

dataset = load_dataset("csv", split='train', data_files="../dataset/data.csv")

In [16]:
def tokenize(examples):
    token = tokenizer(examples['clean_text'], truncation=True, padding="max_length")
    return token

tokenized_dataset = dataset.map(tokenize)
print(tokenized_dataset[0])

{'clean_text': 'aaplthe 10 best steve jobs emails ever', 'sentiment': 0.0, 'input_ids': [0, 6621, 2911, 627, 158, 275, 11235, 548, 1315, 5575, 655, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:

no = 10
text = dataset['clean_text'][no]
text = preprocess(text)
sentiment = dataset['sentiment'][no]

test_input = tokenizer(text, return_tensors='pt')
print(test_input)

all_encodings = []
for tweet in dataset['clean_text']:
    encoded_tweet = tokenizer(tweet, return_tensors='pt')
    all_encodings.append(encoded_tweet)


{'input_ids': tensor([[    0,   605, 41407,   127,  3822,    21,  1105,    65,   200,   536,
             8,   122,    16,  1132,   885, 41407,    16,    42,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [10]:
for i, encoded_tweet in enumerate(all_encodings[:10]):
    output = model(**encoded_tweet)
    # print(output.losses, end = '\t')
    score = output[0][0].detach().numpy()
    score = softmax(score)
    print(np.argmax(score) - 1, end='\t')
    print(dataset['clean_text'][i])

1	aaplthe 10 best steve jobs emails ever
-1	rt why aapl stock had a miniflash crash today aapl aapl
-1	my cat only chews cords such an applesnob
1	i agree with that the individualinvestor should own not trade apple aapl its extended so todays pullback is good to see
-1	nobody expects the spanish inquisition aapl
0	aapl5 rocket stocks to buy for december gains apple and more
1	top 3 all tablets damn right
1	cnbctv apples margins better than expected aapl
0	apple inc flash crash what you need to know aapl
1	aaplthis presentation shows what makes the worlds biggest tech companies


In [11]:

scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]

for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")



1) positive 0.5212
2) neutral 0.4527
3) negative 0.0261


In [12]:
from transformers import TFTrainingArguments
training_args = TFTrainingArguments("my_model")
from transformers import Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, tokenizer=tokenizer)

2024-02-04 13:06:45.555221: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-04 13:06:45.957282: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-04 13:06:45.957389: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-04 13:06:45.999133: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-04 13:06:46.094365: I tensorflow/core/platform/cpu_feature_guar

In [15]:

trainer.train()

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

In [13]:
from transformers import AdamW
import torch
train_loader = torch.utils.data.DataLoader(
    tokenized_dataset,
    shuffle=True,
)


In [14]:

optimizer = AdamW(model.parameters(), lr=5e-5)
from transformers import get_scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
import torch
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        # batch = {k: v.to(device) for k, v in batch.items()}
        print(batch)
        outputs = model(batch['input_ids'][0])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/33027 [00:00<?, ?it/s]

IndexError: too many indices for tensor of dimension 1