Import Libraries

In [14]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

Load Dataset

In [3]:
# Raw string literal: the Windows path contains backslashes, and `\D` / `\s`
# are invalid escape sequences in an ordinary string literal.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(r'E:\Datasets\sentiment_tweets3.csv')

# Give the two verbose CSV columns short names in a single, non-in-place
# rename so re-running the cell always starts from the freshly loaded frame.
df = df.rename(columns={
    'message to examine': 'messages',
    'label (depression result)': 'sentiment',
})
df.head()

Unnamed: 0,Index,messages,sentiment
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


Pre-processing the Dataset

In [8]:
def preprocess_tweet(tweet):
    """Mask user mentions and links in a tweet.

    The tweet is split on single space characters. A token that starts with
    '@' and is longer than one character is replaced by '@user'; otherwise a
    token that starts with 'http' is replaced by 'http'. Every other token is
    kept unchanged. The tokens are joined back together with single spaces,
    so runs of spaces in the input are preserved.

    Args:
        tweet: The tweet text (a string).

    Returns:
        The tweet with mentions and links replaced by placeholders.
    """
    def _mask(token):
        # Mentions take precedence over links, mirroring an if/elif chain.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in tweet.split(' '))

# Element-wise masking of mentions and links; the result is stored in a new
# column so the original 'messages' text stays available.
df['preprocessed_tweet'] = df['messages'].map(preprocess_tweet)




In [15]:
# Checkpoint identifier used as the starting point for fine-tuning.
roberta_model = "cardiffnlp/twitter-roberta-base-sentiment"

# Tokenizer and sequence-classification model are both loaded from the same
# checkpoint identifier.
tokenizer = AutoTokenizer.from_pretrained(roberta_model)
model = AutoModelForSequenceClassification.from_pretrained(roberta_model)
# NOTE(review): this checkpoint presumably ships a 3-way sentiment head, while
# the 'sentiment' column here looks like a binary depression label — confirm
# whether the head should be re-created with a matching number of labels.

In [13]:
class SentimentDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenized tweets with their labels.

    All tweets are tokenized once, up front, in ``__init__``; ``__getitem__``
    only converts the pre-computed values for one index into tensors.

    Args:
        tweets: Sequence of tweet texts, passed as one batch to ``tokenizer``.
        labels: Sequence of labels, one per tweet, indexable by position.
        tokenizer: Callable invoked as
            ``tokenizer(tweets, padding=True, truncation=True, max_length=...)``
            and returning a mapping from field name to per-tweet sequences.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``tweets`` and ``labels`` do not have the same length.
    """

    def __init__(self, tweets, labels, tokenizer, max_length=128):
        # Fail fast: a length mismatch would otherwise surface much later as
        # an IndexError in __getitem__ or as silently ignored tweets, because
        # __len__ is derived from the labels only.
        if len(tweets) != len(labels):
            raise ValueError(
                f"tweets and labels must have the same length, "
                f"got {len(tweets)} and {len(labels)}"
            )
        self.encodings = tokenizer(tweets, padding=True, truncation=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the number of labels."""
        return len(self.labels)

# Convert the pandas columns to plain Python lists before handing them to the
# dataset wrapper, which tokenizes every tweet at construction time.
tweets = df['preprocessed_tweet'].tolist()
labels = df['sentiment'].tolist()
dataset = SentimentDataset(tweets=tweets, labels=labels, tokenizer=tokenizer)



Fine-tuning Model

In [6]:
# Training configuration. Output goes to ./sentiment_finetuned, and existing
# contents there may be overwritten.
# NOTE(review): the recorded progress bar shows 969 total steps, so with
# save_steps=1000 presumably no intermediate checkpoint is written — confirm
# that relying on the final save below is intended.
training_args = TrainingArguments(
    output_dir="./sentiment_finetuned",
    overwrite_output_dir=True,
    do_train=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    save_steps=1000,
    save_total_limit=2,
)

# The whole dataset is used for training; no evaluation set is passed.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
trainer.train()

# Persist the fine-tuned weights and the tokenizer (in separate directories).
model.save_pretrained('./fine_tuned_model(NewRoBERTa)')
tokenizer.save_pretrained('./fine_tuned_tokenizer')



  0%|          | 0/969 [00:00<?, ?it/s]

KeyboardInterrupt: 