#### Implementation

In [1]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from torch.optim.lr_scheduler import ExponentialLR
from sklearn.model_selection import train_test_split

import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

----------------------------------------------------------
To fix the error `Torch compile: libcuda.so cannot found` raised by
```python
torch.compile(robertaModel, backend="inductor")
```
----------------------------------------------------------

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link



In [3]:
# Check for GPU availability
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


#### Dataset

In [None]:
file_path = '/content/drive/Shareddrives/test/Sentiment/Tweets.csv'

df = pd.read_csv(file_path, usecols=['airline_sentiment', 'text'])
df.dropna(subset=['text'], inplace=True)
df.dropna(subset=['airline_sentiment'], inplace=True)

df['airline_sentiment'] = df['airline_sentiment'].map({'neutral': 2, 'positive': 1, 'negative': 0})

valid_sentiments = [0, 1]
df_valid = df[df['airline_sentiment'].isin(valid_sentiments)]

In [None]:
tok = WordPunctTokenizer()

pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner_updated(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

In [None]:
x = df_valid.text
# x = df_valid.text.map(lambda text: tweet_cleaner_updated(text))
y = df_valid.airline_sentiment
y = y.astype(int)

In [None]:
# Print Class Distribution Summary
print('Original class distribution summary: {}'.format(Counter(df_valid['airline_sentiment'])))

# Downsample Majority Class
rus = RandomUnderSampler(sampling_strategy='auto', random_state=2023)
df_valid['cleaned_text'] = df_valid['text'].apply(tweet_cleaner_updated)
# X_resampled, y_resampled = rus.fit_resample(df_valid[['text']], df_valid['airline_sentiment'])
X_resampled, y_resampled = rus.fit_resample(df_valid[['cleaned_text']], df_valid['airline_sentiment'])

# Create a new DataFrame with resampled data
df_resampled = pd.DataFrame({'text': X_resampled.squeeze(), 'airline_sentiment': y_resampled})

# Print Downsampled Class Distribution Summary
print('Downsampled class distribution summary: {}'.format(Counter(df_resampled['airline_sentiment'])))


# Now, df_resampled contains the downsampled dataset
x = df_resampled['text'].tolist()
y = df_resampled['airline_sentiment'].tolist()

Original class distribution summary: Counter({0: 9178, 1: 2363})


  soup = BeautifulSoup(text, 'lxml')


Downsampled class distribution summary: Counter({0: 2363, 1: 2363})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['cleaned_text'] = df_valid['text'].apply(tweet_cleaner_updated)


In [None]:
df_resampled.head()

Unnamed: 0,text,airline_sentiment
0,she could even see that had tried to make the ...,0
1,the pilot told us they would release bags as w...,0
2,jp dm message who can not get dm through to,0
3,hr min cost of flight change was hrs ago drop ...,0
4,we saw one he was as useless as the tsa agents...,0


#### Implementation

In [4]:
class SentimentAnalysisModel(nn.Module):
    def __init__(
        self,
        device,
        roberta_model_path='roberta-base',
        input_dim=512,
        units=256,
        channels=1,
        conv_size=32,
        kernel_size1=4,
        kernel_size2=6,
        dense_units=256,
        num_classes=1,
        inductor=True
        ):
        super(SentimentAnalysisModel, self).__init__()

        # Load pre-trained RoBERTa model
        self.roberta = RobertaModel.from_pretrained(roberta_model_path).to(device=device)
        if (inductor):
          self.roberta = torch.compile(self.roberta, backend="inductor")
        self.tokenizer = RobertaTokenizer.from_pretrained(roberta_model_path)

        # GRU branch
        self.gru = nn.GRU(input_dim, units, bidirectional=True, batch_first=True).to(device=device)
        self.attention_gru = AttentionWithContext(device, units * 2)

        # LSTM branch
        self.lstm = nn.LSTM(input_dim, units, bidirectional=True, batch_first=True).to(device=device)
        self.attention_lstm = AttentionWithContext(device, units * 2)

        # CNN branches
        self.channels = channels
        self.cnn1 = nn.Conv1d(channels, conv_size, kernel_size=kernel_size1, padding=0).to(device=device)
        self.cnn2 = nn.Conv1d(channels, conv_size, kernel_size=kernel_size2, padding=0).to(device=device)

        # Pooling layers
        self.avg_pool = nn.AdaptiveAvgPool1d(1).to(device=device)
        self.max_pool = nn.AdaptiveMaxPool1d(1).to(device=device)

        # Normalization
        self.norm = nn.BatchNorm1d(2 * 4 * conv_size).to(device=device)

        # Fully connected layers
        self.fc1 = nn.Linear(2 * 4 * conv_size, dense_units).to(device=device)
        self.relu = nn.ReLU().to(device=device)
        self.fc2 = nn.Linear(256, num_classes).to(device=device)
        self.sigmoid = nn.Sigmoid().to(device=device)

    def forward(self, x):
        # Tokenize and encode the sentences
        # tokenized_sentences = self.tokenizer(x, padding='max_length', return_tensors='pt').to(device=device)
        tokenized_sentences = self.tokenizer(x, truncation=True, padding='max_length', return_tensors='pt').to(device=device)
        # print('tokenized_sentences', tokenized_sentences.shape)

        # Forward pass to get embeddings
        with torch.no_grad():
            # Get RoBERTa embeddings
            model_output = self.roberta(**tokenized_sentences)

        # Extract embeddings from the output
        embeddings = model_output.last_hidden_state
        # print('embeddings', embeddings.shape)

        # GRU branch
        gru_out, _ = self.gru(embeddings.permute(0, 2, 1))
        # print('gru_out', gru_out.shape)
        gru_attention = self.attention_gru(gru_out)
        # print('gru_attention', gru_attention.shape)

        # LSTM branch
        lstm_out, _ = self.lstm(embeddings.permute(0, 2, 1))
        # print('lstm_out', lstm_out.shape)
        lstm_attention = self.attention_lstm(lstm_out)
        # print('lstm_attention', lstm_attention.shape)

        # Expand before CNN
        gru_attention_expand = gru_attention.unsqueeze(1).expand(-1, self.channels, -1)
        # print('gru_attention_expand', gru_attention_expand.shape)
        lstm_attention_expand = lstm_attention.unsqueeze(1).expand(-1, self.channels, -1)
        # print('lstm_attention_expand', lstm_attention_expand.shape)

        # CNN branches
        gru_cnn1_out = self.cnn1(gru_attention_expand)
        # print('gru_cnn1_out', gru_cnn1_out.shape)
        gru_cnn2_out = self.cnn2(gru_attention_expand)
        # print('gru_cnn2_out', gru_cnn2_out.shape)
        lstm_cnn1_out = self.cnn1(lstm_attention_expand)
        # print('lstm_cnn1_out', lstm_cnn1_out.shape)
        lstm_cnn2_out = self.cnn2(lstm_attention_expand)
        # print('lstm_cnn2_out', lstm_cnn2_out.shape)

        # Pooling
        gru_cnn1_avg_pool = self.avg_pool(gru_cnn1_out)
        # print('gru_cnn1_avg_pool', gru_cnn1_avg_pool.shape)
        gru_cnn1_max_pool = self.max_pool(gru_cnn1_out)
        # print('gru_cnn1_max_pool', gru_cnn1_max_pool.shape)
        gru_cnn2_avg_pool = self.avg_pool(gru_cnn2_out)
        # print('gru_cnn2_avg_pool', gru_cnn2_avg_pool.shape)
        gru_cnn2_max_pool = self.max_pool(gru_cnn2_out)
        # print('gru_cnn2_max_pool', gru_cnn2_max_pool.shape)

        lstm_cnn1_avg_pool = self.avg_pool(lstm_cnn1_out)
        # print('lstm_cnn1_avg_pool', lstm_cnn1_avg_pool.shape)
        lstm_cnn1_max_pool = self.max_pool(lstm_cnn1_out)
        # print('lstm_cnn1_max_pool', lstm_cnn1_max_pool.shape)
        lstm_cnn2_avg_pool = self.avg_pool(lstm_cnn2_out)
        # print('lstm_cnn2_avg_pool', lstm_cnn2_avg_pool.shape)
        lstm_cnn2_max_pool = self.max_pool(lstm_cnn2_out)
        # print('lstm_cnn2_max_pool', lstm_cnn2_max_pool.shape)

        # Concatenate and normalize
        concatenated = torch.cat(
            [
                gru_cnn1_avg_pool,
                gru_cnn1_max_pool,
                gru_cnn2_avg_pool,
                gru_cnn2_max_pool,
                lstm_cnn1_avg_pool,
                lstm_cnn1_max_pool,
                lstm_cnn2_avg_pool,
                lstm_cnn2_max_pool
            ],
             dim=1
            )
        # print('concatenated', concatenated.shape)
        normalized = self.norm(concatenated.view(concatenated.size(0), -1))
        # print('normalized', normalized.shape)

        # Fully connected layers
        fc1_out = self.fc1(normalized)
        # print('fc1_out', fc1_out.shape)
        fc1_relu = self.relu(fc1_out)
        # print('fc1_relu', fc1_relu.shape)
        fc2_out = self.fc2(fc1_relu)
        # print('fc2_out', fc2_out.shape)
        output = self.sigmoid(fc2_out)
        # print('output', output.shape)

        # Delete the embeddings tensor to release memory
        del embeddings

        return output

class AttentionWithContext(nn.Module):
    def __init__(self, device, hidden_size):
        super(AttentionWithContext, self).__init__()

        # Linear layers for attention
        self.W_query = nn.Linear(hidden_size, hidden_size).to(device=device)
        self.W_context = nn.Linear(hidden_size, hidden_size).to(device=device)
        self.V = nn.Linear(hidden_size, 1).to(device=device)

    def forward(self, lstm_output):
        # Compute query and context representations
        query = torch.tanh(self.W_query(lstm_output))
        context = torch.tanh(self.W_context(lstm_output))

        # Compute attention scores using dot-product attention
        attention_scores = self.V(query).squeeze(-1)
        attention_weights = F.softmax(attention_scores, dim=-1)

        # Apply attention weights to the LSTM output to get the context vector
        context_vector = torch.sum(attention_weights.unsqueeze(-1) * context, dim=1)

        return context_vector

In [None]:
sentences = ["I love this product!", "It's terrible.", "Awesome experience.", "Worst ever.", "Great job!", "Awful.", "Excellent service.", "Hate it.", "Fantastic!", "Disappointing."]
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

# Split the data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=2023)

# Create DataLoader for training and validation sets
batch_size = 128

train_dataset = list(zip(x_train, y_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = list(zip(x_val, y_val))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model
model = SentimentAnalysisModel(device)

# Loss function and optimizer
criterion = nn.BCELoss().to(device=device)  # Binary Cross-Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

d_r = 10**-10
scheduler = ExponentialLR(optimizer, gamma=(1.0 - d_r))

# Define early stopping parameters
patience = 5
best_validation_accuracy = 0
no_improvement_counter = 0

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_correct_train = 0
    total_samples_train = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)

        # Compute training accuracy
        predicted_labels_train = (outputs > 0.5).int()
        total_correct_train += (predicted_labels_train.view(-1) == labels.to(device=device)).sum().item()

        # Accumulate the sum of batch sizes
        total_samples_train += len(labels)

        loss = criterion(outputs, torch.as_tensor(labels, dtype=torch.float32).unsqueeze(1).to(device=device))
        loss.backward()
        optimizer.step()

    # Compute training accuracy
    train_accuracy = total_correct_train / total_samples_train

    # Validation
    val_loss = 0.0
    total_correct_val = 0
    total_samples_val = 0
    num_iterations = (len(y_val) + batch_size - 1) // batch_size

    model.eval()
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_outputs = model(val_inputs)

            # Compute validation accuracy
            predicted_labels_val = (val_outputs > 0.5).int()
            total_correct_val += (predicted_labels_val.view(-1) == val_labels.to(device=device)).sum().item()

            # Accumulate the sum of batch sizes
            total_samples_val += len(val_labels)

            val_batch_targets = torch.as_tensor(val_labels, dtype=torch.float32).unsqueeze(1).to(device=device)
            ce = criterion(val_outputs, val_batch_targets).item()
            val_loss += ce

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = total_correct_val / total_samples_val

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_val_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}')

        # Check for early stopping
        if val_accuracy > best_validation_accuracy:
            best_validation_accuracy = val_accuracy
            no_improvement_counter = 0
            # Save the trained best model if needed
            torch.save(model.state_dict(), 'sentiment-analysis-twitterus-model.pth')
        else:
            no_improvement_counter += 1

        # If no improvement for 'patience' consecutive epochs, stop training
        if no_improvement_counter >= patience:
            print("Early stopping triggered. Training stopped.")
            break

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 0.6845, Training Accuracy: 0.6392, Validation Accuracy: 0.4937
Epoch 2/10, Loss: 0.9682, Training Accuracy: 0.7111, Validation Accuracy: 0.4958
Epoch 3/10, Loss: 1.4524, Training Accuracy: 0.7545, Validation Accuracy: 0.5169
Epoch 4/10, Loss: 0.4328, Training Accuracy: 0.7966, Validation Accuracy: 0.7992
Epoch 5/10, Loss: 0.5670, Training Accuracy: 0.8198, Validation Accuracy: 0.6892
Epoch 6/10, Loss: 0.3999, Training Accuracy: 0.8384, Validation Accuracy: 0.8203
Epoch 7/10, Loss: 0.4350, Training Accuracy: 0.8447, Validation Accuracy: 0.7833
Epoch 8/10, Loss: 0.4940, Training Accuracy: 0.8521, Validation Accuracy: 0.7643
Epoch 9/10, Loss: 0.4110, Training Accuracy: 0.8630, Validation Accuracy: 0.8182
Epoch 10/10, Loss: 0.5051, Training Accuracy: 0.8646, Validation Accuracy: 0.7801


In [None]:
# Create an instance of the model
twitterusModel = SentimentAnalysisModel(device)

# Load the saved model state dictionary
twitterusModel.load_state_dict(torch.load('/content/drive/Shareddrives/test/FYP/sentiment/sentiment-analysis-twitterus-model.pth'))

# Validation
val_loss = 0.0
total_correct_val = 0
total_samples_val = 0
num_iterations = (len(y_val) + batch_size - 1) // batch_size

twitterusModel.eval()
with torch.no_grad():
    for val_inputs, val_labels in val_loader:
        val_outputs = twitterusModel(val_inputs)

        # Compute validation accuracy
        predicted_labels_val = (val_outputs > 0.5).int()
        total_correct_val += (predicted_labels_val.view(-1) == val_labels.to(device=device)).sum().item()

        # Accumulate the sum of batch sizes
        total_samples_val += len(val_labels)

        val_batch_targets = torch.as_tensor(val_labels, dtype=torch.float32).unsqueeze(1).to(device=device)
        ce = criterion(val_outputs, val_batch_targets).item()
        val_loss += ce

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = total_correct_val / total_samples_val

    print(f'Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loss: 0.3999, Validation Accuracy: 0.8203


In [None]:
sentences = ["I love this product!", "It's terrible.", "Awesome experience.", "Worst ever.", "Great job!", "Awful.", "Excellent service.", "Hate it.", "Fantastic!", "Disappointing."]
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

# Split the data into training and validation sets
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=2023)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=2023)

# Create DataLoader for training and validation sets
batch_size = 128

train_dataset = list(zip(x_train, y_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = list(zip(x_val, y_val))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model
model = SentimentAnalysisModel(device)

# Loss function and optimizer
criterion = nn.BCELoss().to(device=device)  # Binary Cross-Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

d_r = 10**-10
scheduler = ExponentialLR(optimizer, gamma=(1.0 - d_r))

# Define early stopping parameters
patience = 5
best_validation_accuracy = 0
no_improvement_counter = 0

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_correct_train = 0
    total_samples_train = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)

        # Compute training accuracy
        predicted_labels_train = (outputs > 0.5).int()
        total_correct_train += (predicted_labels_train.view(-1) == labels.to(device=device)).sum().item()

        # Accumulate the sum of batch sizes
        total_samples_train += len(labels)

        loss = criterion(outputs, torch.as_tensor(labels, dtype=torch.float32).unsqueeze(1).to(device=device))
        loss.backward()
        optimizer.step()

    # Compute training accuracy
    train_accuracy = total_correct_train / total_samples_train

    # Validation
    val_loss = 0.0
    total_correct_val = 0
    total_samples_val = 0
    num_iterations = (len(y_val) + batch_size - 1) // batch_size

    model.eval()
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_outputs = model(val_inputs)

            # Compute validation accuracy
            predicted_labels_val = (val_outputs > 0.5).int()
            total_correct_val += (predicted_labels_val.view(-1) == val_labels.to(device=device)).sum().item()

            # Accumulate the sum of batch sizes
            total_samples_val += len(val_labels)

            val_batch_targets = torch.as_tensor(val_labels, dtype=torch.float32).unsqueeze(1).to(device=device)
            ce = criterion(val_outputs, val_batch_targets).item()
            val_loss += ce

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = total_correct_val / total_samples_val

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_val_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}')

        # Check for early stopping
        if val_accuracy > best_validation_accuracy:
            best_validation_accuracy = val_accuracy
            no_improvement_counter = 0
            # Save the trained best model if needed
            torch.save(model.state_dict(), '/content/drive/Shareddrives/test/FYP/sentiment/sentiment-analysis-twitterus-all-model.pth')
        else:
            no_improvement_counter += 1

        # If no improvement for 'patience' consecutive epochs, stop training
        if no_improvement_counter >= patience:
            print("Early stopping triggered. Training stopped.")
            break

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20, Loss: 0.4935, Training Accuracy: 0.7968, Validation Accuracy: 0.8043
Epoch 2/20, Loss: 0.4321, Training Accuracy: 0.8575, Validation Accuracy: 0.8398
Epoch 3/20, Loss: 0.7309, Training Accuracy: 0.8720, Validation Accuracy: 0.7446
Epoch 4/20, Loss: 0.4609, Training Accuracy: 0.8805, Validation Accuracy: 0.8268
Epoch 5/20, Loss: 0.3473, Training Accuracy: 0.8956, Validation Accuracy: 0.8563
Epoch 6/20, Loss: 0.3180, Training Accuracy: 0.8990, Validation Accuracy: 0.8814
Epoch 7/20, Loss: 0.2836, Training Accuracy: 0.9032, Validation Accuracy: 0.8701
Epoch 8/20, Loss: 0.3025, Training Accuracy: 0.9079, Validation Accuracy: 0.8727
Epoch 9/20, Loss: 0.5753, Training Accuracy: 0.9068, Validation Accuracy: 0.7758
Epoch 10/20, Loss: 0.5369, Training Accuracy: 0.9159, Validation Accuracy: 0.8182
Epoch 11/20, Loss: 0.4524, Training Accuracy: 0.9165, Validation Accuracy: 0.8320
Early stopping triggered. Training stopped.


#### Trying with fake news dataset

In [None]:
train_csv_url = "/content/drive/Shareddrives/test/FYP/fake-news/train.csv"
train_data = pd.read_csv(train_csv_url)
train_data.head()

filtered_train_data = train_data.copy()

# Remove missing values in "text" column
print(f"missing value count {filtered_train_data['text'].isna().sum()}")
filtered_train_data.dropna(subset=['text'], inplace=True)

# Check for empty strings and drop rows with empty "text" values
filtered_train_data['text'] = filtered_train_data['text'].str.strip() # Strip whitespace from the "text" column
print(f"empty string count {filtered_train_data[filtered_train_data['text'] == ''].shape[0]}")
filtered_train_data = filtered_train_data[filtered_train_data['text'] != '']
print(f"filtered_train_data dataset shape {filtered_train_data.shape}")

x_train, x_temp, y_train, y_temp = train_test_split(filtered_train_data['text'], filtered_train_data['label'], test_size=0.2, stratify=filtered_train_data['label'], random_state=2023)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=2023)

print(f"y_train\n {y_train.value_counts()}")
print(f"y_test\n {y_test.value_counts()}")
print(f"y_val\n {y_val.value_counts()}")

# Create DataLoader for training and validation sets
batch_size = 128

train_dataset = list(zip(x_train, y_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = list(zip(x_val, y_val))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model
model = SentimentAnalysisModel(device)

# Loss function and optimizer
criterion = nn.BCELoss().to(device=device)  # Binary Cross-Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

d_r = 10**-10
scheduler = ExponentialLR(optimizer, gamma=(1.0 - d_r))

# Define early stopping parameters
patience = 5
best_validation_accuracy = 0
no_improvement_counter = 0

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_correct_train = 0
    total_samples_train = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)

        # Compute training accuracy
        predicted_labels_train = (outputs > 0.5).int()
        total_correct_train += (predicted_labels_train.view(-1) == labels.to(device=device)).sum().item()

        # Accumulate the sum of batch sizes
        total_samples_train += len(labels)

        loss = criterion(outputs, torch.as_tensor(labels, dtype=torch.float32).unsqueeze(1).to(device=device))
        loss.backward()
        optimizer.step()

    # Compute training accuracy
    train_accuracy = total_correct_train / total_samples_train

    # Validation
    val_loss = 0.0
    total_correct_val = 0
    total_samples_val = 0
    num_iterations = (len(y_val) + batch_size - 1) // batch_size

    model.eval()
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_outputs = model(val_inputs)

            # Compute validation accuracy
            predicted_labels_val = (val_outputs > 0.5).int()
            total_correct_val += (predicted_labels_val.view(-1) == val_labels.to(device=device)).sum().item()

            # Accumulate the sum of batch sizes
            total_samples_val += len(val_labels)

            val_batch_targets = torch.as_tensor(val_labels, dtype=torch.float32).unsqueeze(1).to(device=device)
            ce = criterion(val_outputs, val_batch_targets).item()
            val_loss += ce

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = total_correct_val / total_samples_val

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_val_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}')

        # Check for early stopping
        if val_accuracy > best_validation_accuracy:
            best_validation_accuracy = val_accuracy
            no_improvement_counter = 0
            # Save the trained best model if needed
            torch.save(model.state_dict(), '/content/drive/Shareddrives/test/FYP/sentiment/sentiment-analysis-fakenews-model.pth')
        else:
            no_improvement_counter += 1

        # If no improvement for 'patience' consecutive epochs, stop training
        if no_improvement_counter >= patience:
            print("Early stopping triggered. Training stopped.")
            break

missing value count 39
empty string count 77
filtered_train_data dataset shape (20684, 5)
y_train
 0    8309
1    8238
Name: label, dtype: int64
y_test
 0    1039
1    1029
Name: label, dtype: int64
y_val
 0    1039
1    1030
Name: label, dtype: int64


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20, Loss: 0.2202, Training Accuracy: 0.8494, Validation Accuracy: 0.9101
Epoch 2/20, Loss: 0.1494, Training Accuracy: 0.9506, Validation Accuracy: 0.9386
Epoch 3/20, Loss: 0.2098, Training Accuracy: 0.9706, Validation Accuracy: 0.9343
Epoch 4/20, Loss: 0.1469, Training Accuracy: 0.9783, Validation Accuracy: 0.9435
Epoch 5/20, Loss: 0.1233, Training Accuracy: 0.9854, Validation Accuracy: 0.9546
Epoch 6/20, Loss: 0.1233, Training Accuracy: 0.9896, Validation Accuracy: 0.9560
Epoch 7/20, Loss: 0.1397, Training Accuracy: 0.9912, Validation Accuracy: 0.9575
Epoch 8/20, Loss: 0.5020, Training Accuracy: 0.9934, Validation Accuracy: 0.8951
Epoch 9/20, Loss: 0.0958, Training Accuracy: 0.9905, Validation Accuracy: 0.9700
Epoch 10/20, Loss: 0.2058, Training Accuracy: 0.9952, Validation Accuracy: 0.9507
Epoch 11/20, Loss: 0.1579, Training Accuracy: 0.9924, Validation Accuracy: 0.9468
Epoch 12/20, Loss: 0.1223, Training Accuracy: 0.9937, Validation Accuracy: 0.9594
Epoch 13/20, Loss: 0.1918

#### IMDB Dataset

In [5]:
file_path = '/content/drive/Shareddrives/test/Sentiment/IMDB Dataset.csv'

df = pd.read_csv(file_path, usecols=['sentiment', 'review'])
df.dropna(subset=['review'], inplace=True)
df.dropna(subset=['sentiment'], inplace=True)

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

valid_sentiments = [0, 1]
df_valid = df[df['sentiment'].isin(valid_sentiments)]
df_valid['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [6]:
x_train, x_temp, y_train, y_temp = train_test_split(df_valid['review'], df_valid['sentiment'], test_size=0.2, stratify=df_valid['sentiment'], random_state=2023)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=2023)

print(f"y_train\n {y_train.value_counts()}")
print(f"y_test\n {y_test.value_counts()}")
print(f"y_val\n {y_val.value_counts()}")

y_train
 1    20000
0    20000
Name: sentiment, dtype: int64
y_test
 1    2500
0    2500
Name: sentiment, dtype: int64
y_val
 1    2500
0    2500
Name: sentiment, dtype: int64


In [None]:
# Create DataLoader for training and validation sets
batch_size = 128

train_dataset = list(zip(x_train, y_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = list(zip(x_val, y_val))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model
model = SentimentAnalysisModel(device)

# Loss function and optimizer
criterion = nn.BCELoss().to(device=device)  # Binary Cross-Entropy Loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

d_r = 10**-10
scheduler = ExponentialLR(optimizer, gamma=(1.0 - d_r))

# Define early stopping parameters
patience = 5
best_validation_accuracy = 0
no_improvement_counter = 0

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_correct_train = 0
    total_samples_train = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)

        # Compute training accuracy
        predicted_labels_train = (outputs > 0.5).int()
        total_correct_train += (predicted_labels_train.view(-1) == labels.to(device=device)).sum().item()

        # Accumulate the sum of batch sizes
        total_samples_train += len(labels)

        loss = criterion(outputs, torch.as_tensor(labels, dtype=torch.float32).unsqueeze(1).to(device=device))
        loss.backward()
        optimizer.step()

    # Compute training accuracy
    train_accuracy = total_correct_train / total_samples_train

    # Validation
    val_loss = 0.0
    total_correct_val = 0
    total_samples_val = 0
    num_iterations = (len(y_val) + batch_size - 1) // batch_size

    model.eval()
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_outputs = model(val_inputs)

            # Compute validation accuracy
            predicted_labels_val = (val_outputs > 0.5).int()
            total_correct_val += (predicted_labels_val.view(-1) == val_labels.to(device=device)).sum().item()

            # Accumulate the sum of batch sizes
            total_samples_val += len(val_labels)

            val_batch_targets = torch.as_tensor(val_labels, dtype=torch.float32).unsqueeze(1).to(device=device)
            ce = criterion(val_outputs, val_batch_targets).item()
            val_loss += ce

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = total_correct_val / total_samples_val

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_val_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}')

        # Check for early stopping
        if val_accuracy > best_validation_accuracy:
            best_validation_accuracy = val_accuracy
            no_improvement_counter = 0
            # Save the trained best model if needed
            torch.save(model.state_dict(), '/content/drive/Shareddrives/test/FYP/sentiment/sentiment-analysis-imdb-all-model.pth')
        else:
            no_improvement_counter += 1

        # If no improvement for 'patience' consecutive epochs, stop training
        if no_improvement_counter >= patience:
            print("Early stopping triggered. Training stopped.")
            break

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20, Loss: 0.3891, Training Accuracy: 0.7524, Validation Accuracy: 0.8302
Epoch 2/20, Loss: 0.3451, Training Accuracy: 0.8509, Validation Accuracy: 0.8552
Epoch 3/20, Loss: 0.3520, Training Accuracy: 0.8794, Validation Accuracy: 0.8512
