In [None]:
import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

In [None]:
def preprocess(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character is replaced by ``@user``; a token that starts
    with ``http`` is replaced by ``http``. All other tokens are kept as they
    are, and the tokens are joined back together with single spaces.
    """
    def _mask(token):
        # A bare "@" is left alone; only real mentions are masked.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))


In [None]:

# Checkpoint name handed to every ``from_pretrained`` call below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Config, tokenizer and sequence-classification model all come from the same checkpoint.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)


In [None]:
from datasets import load_dataset

# Read the CSV file (path is relative to the notebook's working directory)
# and keep its 'train' split.
dataset = load_dataset(
    "csv",
    data_files="../dataset/data.csv",
    split='train',
)

In [None]:
def tokenize(examples):
    """Tokenize the ``clean_text`` field of ``examples``.

    Uses the notebook-level ``tokenizer`` with ``truncation=True`` and
    ``padding="max_length"`` and returns the tokenizer's output unchanged.
    """
    return tokenizer(
        examples['clean_text'],
        padding="max_length",
        truncation=True,
    )


# Run ``tokenize`` over the dataset and keep the result under a new name.
tokenized_dataset = dataset.map(tokenize)

In [None]:

# Row index of the single example inspected in this cell.
no = 10
# Mask mentions/links in that row's text and fetch the same row's sentiment value.
text = preprocess(dataset['clean_text'][no])
sentiment = dataset['sentiment'][no]

# Encode the single example as PyTorch tensors and show the encoding.
test_input = tokenizer(text, return_tensors='pt')
print(test_input)

# Encode every tweet on its own; the raw ``clean_text`` value is used here,
# without going through ``preprocess``.
all_encodings = [
    tokenizer(tweet, return_tensors='pt')
    for tweet in dataset['clean_text']
]


In [None]:
# Fetch the text column once: indexing ``dataset['clean_text']`` inside the
# loop repeats the column lookup on every iteration for no benefit.
tweets = dataset['clean_text']

# Inference only -- nothing is back-propagated here, so autograd tracking is
# switched off for the forward passes.
with torch.no_grad():
    for i, encoded_tweet in enumerate(all_encodings[:10]):
        output = model(**encoded_tweet)
        # ``output[0][0]`` is the first output field for the single encoded tweet;
        # softmax turns it into scores that sum to 1.
        score = softmax(output[0][0].detach().numpy())
        # Highest-scoring class index, shifted down by one, then the tweet text.
        print(np.argmax(score) - 1, end='\t')
        print(tweets[i])

In [None]:

# NOTE(review): ``output`` is not assigned in this cell; it is whatever an
# earlier cell left behind, not the result of running ``test_input`` through
# the model -- confirm this is the example you mean to rank.
scores = softmax(output[0][0].detach().numpy())

# Class indices ordered from highest to lowest score.
ranking = np.argsort(scores)[::-1]

# Print one line per class: rank, label name from the config, rounded score.
for i, class_id in enumerate(ranking):
    l = config.id2label[class_id]
    s = scores[class_id]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")



In [None]:
from transformers import Trainer, TrainingArguments

# ``Trainer`` is the PyTorch training loop and ``model`` was loaded with the
# PyTorch ``AutoModelForSequenceClassification``, so the arguments object must
# be ``TrainingArguments``; ``TFTrainingArguments`` targets TensorFlow.
# "my_model" is the directory the trainer writes its outputs to.
training_args = TrainingArguments("my_model")

# NOTE(review): only the ``clean_text`` and ``sentiment`` columns are visible
# in this notebook -- confirm the dataset exposes a label column the Trainer
# can use before calling ``trainer.train()``.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

In [None]:
from transformers import AdamW
import torch
# Shuffled loader over the tokenized dataset. No ``batch_size`` is given, so
# the DataLoader's default batch size applies.
train_loader = torch.utils.data.DataLoader(tokenized_dataset, shuffle=True)


In [None]:

import torch
from tqdm.auto import tqdm
from transformers import get_scheduler

# Number of full passes over ``train_loader``.
num_epochs = 3
# One optimizer/scheduler step is taken per batch.
num_training_steps = num_epochs * len(train_loader)

optimizer = AdamW(model.parameters(), lr=5e-5)
# Linear learning-rate schedule over all training steps, with no warm-up.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# NOTE(review): assumes the ``sentiment`` column is numeric with values in
# {-1, 0, 1} (the inference cell prints ``argmax - 1``), so adding 1 yields
# the model's class ids 0..2 -- confirm against the CSV before training.
LABEL_SHIFT = 1


def _as_batch_tensor(column):
    """Return a collated token column as a ``(batch, seq_len)`` tensor.

    When dataset rows hold plain Python lists, the DataLoader's default
    collation yields a list with one ``(batch,)`` tensor per token position;
    stacking along dim 1 restores the ``(batch, seq_len)`` layout. A column
    that is already a tensor is returned unchanged.
    """
    if isinstance(column, torch.Tensor):
        return column
    return torch.stack(list(column), dim=1)


progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        # Pass the full token sequence plus its attention mask; the inputs
        # were padded to max length, so the mask is needed to ignore padding.
        input_ids = _as_batch_tensor(batch['input_ids'])
        attention_mask = _as_batch_tensor(batch['attention_mask'])
        labels = torch.as_tensor(batch['sentiment']).long() + LABEL_SHIFT

        # The model only computes ``outputs.loss`` when labels are supplied.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)