#Imports

In [1]:
import re
from functools import lru_cache

import nltk
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


#Text cleaning Function

In [3]:
@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loaded once.

    Loading is deferred to the first call so that this cell can be defined
    before the ``nltk.download('stopwords')`` cell has been executed.
    """
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance, created on first use."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Clean a raw tweet and return it as a space-separated string of lemmas.

    Steps, in order: lowercase; strip links (``http...``/``www...``);
    strip ``@mentions`` and ``#hashtags``; strip emoji in the
    U+1F600-U+1F650 range; strip every character that is neither a word
    character nor whitespace; tokenize with NLTK; drop English stop words;
    lemmatize each remaining token.

    Args:
        text: The raw text. Must be a ``str`` (``.lower()`` is called on it),
            so missing values have to be dropped before applying this.

    Returns:
        The cleaned text. May be an empty string if nothing survives cleaning.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove links
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # Remove mentions and hashtags
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)

    # Remove emojis
    text = re.sub(r'[\U0001f600-\U0001f650]', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)

    # The stop-word set and the lemmatizer are identical for every call, so
    # they are fetched from the cached helpers instead of being rebuilt per text.
    stop_words = _english_stopwords()
    lemmatize = _shared_lemmatizer().lemmatize

    # Drop stop words first, then lemmatize what is left
    words = [lemmatize(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    return ' '.join(words)

#Preparing the Dataset

NLTK downloads check

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Using Twitter Tweet Sentiment Dataset, Link: https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset

The CSV file is placed in my drive; to run this yourself, change the path below to the location of your copy of the CSV.

In [7]:
df = pd.read_csv('Tweets.csv', encoding='latin-1')

In [8]:
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


Check for any NaN values

In [9]:
df.isna().any()

textID           False
text              True
selected_text     True
sentiment        False
dtype: bool

There are some, Let's drop them

In [10]:
df.dropna(inplace=True)

Applying the Preprocessing function to all texts

In [11]:
df['text'] = df['text'].apply(preprocess_text)

Storing the preprocessed data 

In [12]:
df.to_csv('tweets_preprocessed_data.csv', index=False)

Reloading it back

In [13]:
# The file was written by DataFrame.to_csv with its default UTF-8 encoding, so
# it is read back with the default (UTF-8) too; decoding it as latin-1 would
# garble any non-ASCII characters that survived preprocessing.
df = pd.read_csv('tweets_preprocessed_data.csv')

Checking for NaN again: preprocessing can reduce some texts to an empty string, and empty strings are read back from the CSV as NaN

In [14]:
df.isna().any()

textID           False
text              True
selected_text    False
sentiment        False
dtype: bool

Drop the NaN entries

In [15]:
df.dropna(inplace=True)

In [16]:
print(df)

           textID                                               text  \
0      cb774db0d1                                 id responded going   
1      549e992a42                            sooo sad miss san diego   
2      088c60f138                                       bos bullying   
3      9642c003ef                              interview leave alone   
4      358bd9e861             son couldnt put release already bought   
...           ...                                                ...   
27475  4eac33d1c0  wish could come see u denver husband lost job ...   
27476  4f4c4fc327  ive wondered rake client made clear net dont f...   
27477  f67aae2310  yay good enjoy break probably need hectic week...   
27478  ed167662a5                                              worth   
27479  6f7127d9d7                   flirting going atg smile yay hug   

                                           selected_text sentiment  
0                    I`d have responded, if I were going   neutral

"selected_text" column is unnecessary, so let's drop it

In [17]:
df.drop('selected_text', axis=1, inplace=True)


Final DataFrame

In [18]:
print(df)

           textID                                               text sentiment
0      cb774db0d1                                 id responded going   neutral
1      549e992a42                            sooo sad miss san diego  negative
2      088c60f138                                       bos bullying  negative
3      9642c003ef                              interview leave alone  negative
4      358bd9e861             son couldnt put release already bought  negative
...           ...                                                ...       ...
27475  4eac33d1c0  wish could come see u denver husband lost job ...  negative
27476  4f4c4fc327  ive wondered rake client made clear net dont f...  negative
27477  f67aae2310  yay good enjoy break probably need hectic week...  positive
27478  ed167662a5                                              worth  positive
27479  6f7127d9d7                   flirting going atg smile yay hug   neutral

[27414 rows x 3 columns]


Randomize to get proper split in Train and Test 

In [19]:
# Randomize the DataFrame
randomized_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

Encode Sentiment column with **0 for Positive, 1 for Negative and 2 for Neutral**

In [20]:
# Replace values in the 'sentiment' column
randomized_df['sentiment'] = randomized_df['sentiment'].replace({'positive': 0, 'negative': 1, 'neutral': 2})

Split the df in train-test ratio of 80-20

In [21]:
train_split=int(len(randomized_df)*0.8)

In [22]:
df_train = randomized_df[:train_split]
df_test = randomized_df[train_split:]

Train split

In [23]:
df_train

Unnamed: 0,textID,text,sentiment
0,17bd5233e8,oh oh going shopping best friend today yaay go...,0
1,a01b1e199e,hate,1
2,d107a76c20,ill waiting hope doesnt rain,0
3,a15a5d5645,lowerleft molar sting whenever come contact su...,2
4,378bf58fa3,lovely lunch curried rice mussel baby octopus yum,0
...,...,...,...
21926,1a54b8916c,happy mother day,0
21927,6d6ce612a1,yay cant wait read,0
21928,4a627b101f,get fightstar mercury summer,2
21929,4634855b53,home sweet home sleeping monday hope,2


#Training/Finetuning

Installing Transformers to use RoBERTa Architecture from hugging face

In [24]:
#May differ for different environment of run
#for vscode it will be '%' instead of '!'
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [25]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

Load the RoBERTa model that was pretrained on ~58M tweets and fine-tuned for sentiment analysis, then fine-tune it further for our task

In [None]:
# Name of the pretrained checkpoint on the Hugging Face Hub
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# `is_split_into_words` is an argument of the tokenizer call, not of
# `from_pretrained`, and the texts used here are plain strings rather than
# pre-split word lists, so it is not passed.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [27]:
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Each item is built from the row's 'text' column (tokenized, truncated and
    padded to ``max_length``) and its 'sentiment' column (the class label).
    """

    def __init__(self, data, tokenizer, max_length=512):
        """
        Args:
        - data: The input DataFrame containing 'text' and 'sentiment' columns.
        - tokenizer: The tokenizer object; its ``encode_plus`` method is used.
        - max_length: Length every sequence is truncated/padded to
          (defaults to 512, the value that was previously hard-coded).
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """
        Returns the number of rows in the underlying DataFrame.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieves an item from the dataset at the given positional index.

        Args:
        - idx: The positional index of the row to retrieve.

        Returns:
        A dictionary with 'input_ids', 'attention_mask' and 'label' tensors.
        """
        row = self.data.iloc[idx]
        label = row['sentiment']
        text = row['text']

        # Tokenize the text
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors="pt"; a bare squeeze() would also collapse the
            # sequence axis whenever it has length 1.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            # CrossEntropyLoss expects int64 class indices; make the dtype
            # explicit instead of relying on what is inferred from the label.
            'label': torch.tensor(label, dtype=torch.long)
        }

# Training split: wrap the frame in a dataset, then batch it in groups of 16
# with the order reshuffled on every pass
tr_dataset = SentimentDataset(data=df_train, tokenizer=tokenizer)
tr_dataloader = DataLoader(dataset=tr_dataset, batch_size=16, shuffle=True)

# Test split: same wrapping and batch size, but kept in its original order
te_dataset = SentimentDataset(data=df_test, tokenizer=tokenizer)
te_dataloader = DataLoader(dataset=te_dataset, batch_size=16, shuffle=False)


In [28]:
# # Test
# for i in tr_dataloader:
#   print(i['label'])

Dataset Class to tokenize the words and make them tensors 

In [29]:
# Set the device to CUDA if a GPU is available, otherwise set it to CPU
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

# Move the model to the selected device for computation
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [30]:
# Fine-tuning settings
num_epochs = 2
learning_rate = 2e-5

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

In [31]:
from tqdm import tqdm

# Fine-tuning loop
# Fine-tune the model for `num_epochs` full passes over the training dataloader
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # tqdm wraps the dataloader to show per-epoch progress; leave=False removes
    # the bar once the epoch finishes
    progress_bar = tqdm(
        tr_dataloader,
        desc=f"Epoch {epoch+1}/{num_epochs}",
        leave=False,
    )

    for batch in progress_bar:
        # Put the three batch tensors on the same device as the model
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and loss against the ground-truth labels
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        # Backward pass, then one optimizer update
        loss.backward()
        optimizer.step()

        # Track the running total and show the current batch loss on the bar
        total_loss += loss.item()
        progress_bar.set_postfix({'Loss': loss.item()})

    # Mean batch loss over the epoch
    avg_loss = total_loss / len(tr_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")




Epoch 1/2 - Loss: 0.6490


                                                                          

Epoch 2/2 - Loss: 0.5098




Here's a more detailed explanation of Fine Tuning Step:

- The loop iterates over the specified number of epochs using the num_epochs variable.
- total_loss is initialized to 0 to keep track of the cumulative loss during the epoch.
- The model is set to training mode using model.train(). This switches layers such as dropout to their training-time behaviour; it does not by itself turn gradient computation on or off.
- The tqdm function is used to create a progress bar that wraps around the training data loader (tr_dataloader). This provides a visual representation of the training progress with a description indicating the current epoch and total number of epochs.
- Within each epoch, the training data is iterated over in batches. Each batch consists of input_ids, attention_mask, and labels.
- The input tensors (input_ids, attention_mask, and labels) are moved to the appropriate device (GPU if available) using .to(device).
- The forward pass is performed by calling the model with the input tensors. This generates the predicted logits for each input.
- The loss is computed by comparing the logits with the ground truth labels using the specified loss function (loss_fn).
- The loss value is accumulated in total_loss for the current epoch.
- The gradients are reset to zero using optimizer.zero_grad() to prevent gradient accumulation.
- The backward pass is performed by calling loss.backward(), which computes the gradients of the loss with respect to the model's parameters.
- The optimizer takes a step based on the computed gradients to update the model's parameters using optimizer.step().
- The progress bar is updated with the current loss value using progress_bar.set_postfix().
- After each epoch, the average loss is computed by dividing total_loss by the number of batches in the training data loader (len(tr_dataloader)).
- The average loss for the epoch is printed indicating the epoch number and the calculated loss value.

#Saving the Finetuned model

In [32]:
# Save the fine-tuned model
model.save_pretrained("SA_roberta_model/model")
tokenizer.save_pretrained("SA_roberta_model/tokenizer")

('SA_roberta_model/tokenizer/tokenizer_config.json',
 'SA_roberta_model/tokenizer/special_tokens_map.json',
 'SA_roberta_model/tokenizer/vocab.json',
 'SA_roberta_model/tokenizer/merges.txt',
 'SA_roberta_model/tokenizer/added_tokens.json',
 'SA_roberta_model/tokenizer/tokenizer.json')

In [33]:
# #Saving it in my drive for ease
# !zip -r drive/MyDrive/SA_roberta_model.zip /content/SA_roberta_model

  adding: content/SA_roberta_model/ (stored 0%)
  adding: content/SA_roberta_model/tokenizer/ (stored 0%)
  adding: content/SA_roberta_model/tokenizer/special_tokens_map.json (deflated 52%)
  adding: content/SA_roberta_model/tokenizer/tokenizer_config.json (deflated 80%)
  adding: content/SA_roberta_model/tokenizer/tokenizer.json (deflated 72%)
  adding: content/SA_roberta_model/tokenizer/vocab.json (deflated 59%)
  adding: content/SA_roberta_model/tokenizer/merges.txt (deflated 53%)
  adding: content/SA_roberta_model/model/ (stored 0%)
  adding: content/SA_roberta_model/model/config.json (deflated 50%)
  adding: content/SA_roberta_model/model/pytorch_model.bin (deflated 7%)


#Testing



In [34]:
# !unzip drive/MyDrive/SA_roberta_model.zip

**Important**: for the code blocks below to work, first run the imports at the start, the text cleaning function, and the "Preparing the Dataset" section

In [35]:
#Reload the Fine Tuned Model
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Redefine the device so this section does not depend on the training cells
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

# Please change these paths to match the location of your model folder
model = RobertaForSequenceClassification.from_pretrained('SA_roberta_model/model').to(device)
# `is_split_into_words` is an argument of the tokenizer call, not of
# `from_pretrained`, and the inputs are plain strings, so it is not passed.
tokenizer = RobertaTokenizer.from_pretrained('SA_roberta_model/tokenizer')

In [36]:
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_accuracy(model, dataset):
    """Evaluate ``model`` on batched data and return four classification metrics.

    Args:
        model: The classifier; it is switched to eval mode and called as
            ``model(input_ids, attention_mask=...)``, with ``.logits`` read
            from the result.
        dataset: An iterable of batches, each a dict holding 'input_ids',
            'attention_mask' and 'label' tensors.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. Precision, recall and
        F1 use weighted averaging across classes.
    """
    model.eval()

    # Running tallies for accuracy, plus flat label lists for the sklearn metrics
    correct_predictions, total_samples = 0, 0
    true_labels, predicted_labels = [], []

    for batch in dataset:
        # Move the batch to the notebook-level `device`
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Inference only, so no gradient tracking is needed
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits

        # The predicted class is the index of the largest logit in each row
        batch_predicted_labels = torch.argmax(logits, dim=1)

        matches = batch_predicted_labels == labels
        correct_predictions += torch.sum(matches).item()
        total_samples += len(labels)

        true_labels.extend(labels.tolist())
        predicted_labels.extend(batch_predicted_labels.tolist())

    accuracy = correct_predictions / total_samples

    # Same call signature for all three metrics, kept in the order they are returned
    precision, recall, f1 = (
        metric(true_labels, predicted_labels, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    return accuracy, precision, recall, f1


In [37]:
accuracy, precision, recall, f1 = calculate_accuracy(model, te_dataloader)

Result Display

In [38]:
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)

Accuracy:  0.7711107058179829
Precision:  0.77311769702953
Recall:  0.7711107058179829
F1 Score:  0.7709146124347152


#Function to get sentiment of given text as input

In [39]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

@lru_cache(maxsize=None)
def _load_sentiment_model(model_path, tokenizer_path):
    """Load the model/tokenizer pair for the given paths once and cache it.

    The model is moved to the notebook-level `device` and put in eval mode.
    If the files at these paths are re-saved during the session, call
    ``_load_sentiment_model.cache_clear()`` to force a reload.
    """
    loaded_model = RobertaForSequenceClassification.from_pretrained(model_path).to(device)
    loaded_model.eval()
    # `is_split_into_words` is a tokenizer-call argument, not a
    # `from_pretrained` option, so it is not passed here.
    loaded_tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
    return loaded_model, loaded_tokenizer


def get_sentiment_score(text, model_path, tokenizer_path):
    """Predict the sentiment class of a single piece of text.

    Args:
        text: The raw input text; it is cleaned with ``preprocess_text`` first.
        model_path: Folder holding the fine-tuned model.
        tokenizer_path: Folder holding the matching tokenizer.

    Returns:
        The index of the highest-scoring class as an int. With the label
        encoding used earlier in this notebook: 0 = positive, 1 = negative,
        2 = neutral.

    Raises:
        ValueError: If loading, preprocessing or inference fails; the
            original exception is attached as ``__cause__``.
    """
    try:
        # Reuse the already-loaded model/tokenizer instead of reading them
        # from disk on every call
        model, tokenizer = _load_sentiment_model(model_path, tokenizer_path)

        # Preprocess the text
        text = preprocess_text(text)

        # Tokenize the text and place the tensors on the model's device
        encoded_input = tokenizer.encode_plus(
            text,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(model.device)

        # Inference only: skip gradient tracking
        with torch.no_grad():
            outputs = model(**encoded_input)
        logits = outputs.logits

        # Get the predicted sentiment label
        _, predicted_label = torch.max(logits, dim=1)

        # Convert the predicted label to sentiment score
        sentiment_score = predicted_label.item()

        return sentiment_score

    except Exception as e:
        # Keep the ValueError contract but chain the original exception so
        # the real traceback is not lost
        error_msg = f"Error occurred during sentiment scoring: {str(e)}"
        raise ValueError(error_msg) from e


##Example Usage

In [40]:
get_sentiment_score( "you are, you don't know but you indeed are, an magnificient and genius person", 'SA_roberta_model/model', 'SA_roberta_model/tokenizer')

0

In [46]:
get_sentiment_score( "you are a genius Idiot",'SA_roberta_model/model', 'SA_roberta_model/tokenizer')

1

In [42]:
get_sentiment_score( "you are, you don't know but you indeed are",'SA_roberta_model/model', 'SA_roberta_model/tokenizer')

2

In [44]:
get_sentiment_score( "Hello ",'SA_roberta_model/model', 'SA_roberta_model/tokenizer')

2

However, it doesn't handle pure sarcasm very well

The example below should be negative because it is sarcastic, unless the person in question is a hobbyist who loves doing chores, but that won't apply to most people

In [None]:
get_sentiment_score( "Wow, I'm so thrilled to spend my entire weekend cleaning the house and doing laundry. What an amazing way to relax!",'SA_roberta_model/model', 'SA_roberta_model/tokenizer')

0