<a href="https://colab.research.google.com/github/Parkar0707/Sentiment-Analysis-using-RNN/blob/main/Sentiment_Analysis_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Model Using RNN

<h4>This notebook uses the IMDB movie-review dataset prepared by Sebastian Raschka.</h4>

<a href="https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz">https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz</a>

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install -q datasets



In [None]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer
from datasets import load_dataset, DatasetDict, ClassLabel
import time
import random
import pandas as pd

In [None]:
# -O pins the output file name, so re-running this cell overwrites the archive
# instead of saving a numbered duplicate (e.g. movie_data.csv.gz.1) that the
# gunzip step would never see.
!wget -O movie_data.csv.gz https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz

--2024-06-07 16:03:01--  https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2024-06-07 16:03:02--  https://raw.githubusercontent.com/rasbt/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘movie_data.csv.gz.1’


2024-06-07 16:03:02 (141 MB/s) - ‘movie_data.csv.gz.1’ saved [26521894/26521894

In [None]:
# Decompress the archive to movie_data.csv; -f overwrites an existing
# movie_data.csv left over from a previous run.
!gunzip -f movie_data.csv.gz

In [None]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

# --- Optimisation settings ---------------------------------------------------
# NOTE(review): VOCABULARY_SIZE is not referenced by any cell visible in this
# notebook; the embedding size is taken from the tokenizer vocabulary instead.
VOCABULARY_SIZE = 20000
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15

# --- Model dimensions --------------------------------------------------------
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2

# --- Data --------------------------------------------------------------------
# Rewrite the CSV in place with generic column names so that the cells below
# can refer to the text/label columns by fixed identifiers.
csv_path = 'movie_data.csv'
df = pd.read_csv(csv_path)
df.columns = ['TEXT_COLUMN_NAME', 'LABEL_COLUMN_NAME']
df.to_csv(csv_path, index=False)

# Load the rewritten CSV as a Hugging Face dataset (a single 'train' split).
raw_datasets = load_dataset('csv', data_files=csv_path)

# Pre-trained WordPiece tokenizer; only its vocabulary/tokenisation is used.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of reviews with the notebook-level ``tokenizer``.

    Args:
        examples: a batch from the dataset; the review texts are read from
            its ``'TEXT_COLUMN_NAME'`` entry.

    Returns:
        The tokenizer output for the batch, with every text truncated or
        padded to exactly 128 token ids.
    """
    review_texts = examples['TEXT_COLUMN_NAME']
    return tokenizer(
        review_texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

# Tokenize the whole dataset in batches; the tokenizer fields are added as columns.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Expose the label column as a categorical "labels" feature; the names are
# listed in class-id order.
sentiment_feature = ClassLabel(num_classes=NUM_CLASSES, names=["negative", "positive"])
tokenized_datasets = (
    tokenized_datasets
    .rename_column("LABEL_COLUMN_NAME", "labels")
    .cast_column("labels", sentiment_feature)
)

# Split train, validation, test:
# first hold out 20% for testing, then 15% of the remainder for validation.
train_test_split = tokenized_datasets['train'].train_test_split(
    test_size=0.2, seed=RANDOM_SEED
)
train_valid_split = train_test_split['train'].train_test_split(
    test_size=0.15, seed=RANDOM_SEED
)

dataset_dict = DatasetDict({
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    'test': train_test_split['test'],
})

Casting the dataset:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
def collate_fn(batch):
    """Collate dataset rows into right-padded tensors for one mini-batch.

    Args:
        batch: sequence of rows, each providing ``'input_ids'`` (a list of
            token ids) and ``'labels'`` (an integer class id).

    Returns:
        dict with three ``torch.long`` tensors:
        ``'input_ids'`` (rows padded with 0 up to the longest row in the batch),
        ``'lengths'`` (``len`` of each row's ``input_ids`` as stored in the
        dataset, i.e. including any padding the row already contained) and
        ``'labels'``.
    """
    token_rows, targets = [], []
    for row in batch:
        token_rows.append(row['input_ids'])
        targets.append(row['labels'])

    row_sizes = list(map(len, token_rows))
    widest = max(row_sizes)

    # Right-pad every row with id 0 so the batch forms a rectangular matrix.
    padded_rows = [
        ids + [0] * (widest - size)
        for ids, size in zip(token_rows, row_sizes)
    ]

    return {
        'input_ids': torch.tensor(padded_rows, dtype=torch.long),
        'lengths': torch.tensor(row_sizes, dtype=torch.long),
        'labels': torch.tensor(targets, dtype=torch.long),
    }


def _build_loader(split_name, shuffle):
    """Create a DataLoader over one split of ``dataset_dict``."""
    return torch.utils.data.DataLoader(
        dataset_dict[split_name],
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )


# Only the training data is reshuffled every epoch.
train_loader = _build_loader('train', shuffle=True)
valid_loader = _build_loader('validation', shuffle=False)
test_loader = _build_loader('test', shuffle=False)

# Sanity check: print the tensor shapes of the first batch of each loader.
for heading, loader in (
    ('Train', train_loader),
    ('\nValid:', valid_loader),
    ('\nTest:', test_loader),
):
    print(heading)
    for batch in loader:
        print(f'Text matrix size: {batch["input_ids"].size()}')
        print(f'Target vector size: {batch["labels"].size()}')
        break

Train
Text matrix size: torch.Size([128, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([128, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([128, 128])
Target vector size: torch.Size([128])


In [None]:
class RNN(torch.nn.Module):
    """Single-layer LSTM classifier over token-id sequences.

    Pipeline: embedding -> LSTM (batch-first) -> linear layer applied to the
    final hidden state of each sequence.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output logits per sequence.
        padding_idx: token id used for right-padding (default 0). Trailing
            positions holding this id are excluded from the recurrence.
            Pass ``None`` to trust ``text_lengths`` as given.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim,
                                            padding_idx=padding_idx)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths):
        """Compute class logits for a batch of right-padded sequences.

        Args:
            text: integer tensor of token ids, indexed as (batch, time).
            text_lengths: per-sequence lengths (tensor or list of ints).

        Returns:
            Tensor of logits with one row per sequence.
        """
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths as a CPU int64 tensor.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()

        if self.padding_idx is not None:
            # Callers may report the padded width as the length (e.g. when the
            # tokenizer pads every example to a fixed max_length). Cap each
            # length by the number of non-pad tokens so the final hidden state
            # is taken at the last real token instead of after the padding.
            # Assumes right-padding, i.e. pad ids only appear at the end.
            non_pad = (text != self.padding_idx).sum(dim=1).cpu()
            # Packing needs lengths >= 1, so an all-pad row keeps one step.
            lengths = torch.minimum(lengths, non_pad).clamp(min=1)

        packed_embedded = torch.nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False)

        _, (hidden, _) = self.rnn(packed_embedded)
        # hidden is (num_layers, batch, hidden_dim); take the last layer.
        return self.fc(hidden[-1])

In [None]:
# Re-seed so that weight initialisation does not depend on earlier cells.
torch.manual_seed(RANDOM_SEED)

# The embedding table needs one row per token id the tokenizer can emit.
input_dim = len(tokenizer.vocab)

model_kwargs = {
    'input_dim': input_dim,
    'embedding_dim': EMBEDDING_DIM,
    'hidden_dim': HIDDEN_DIM,
    'output_dim': NUM_CLASSES,
}
model = RNN(**model_kwargs)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


In [None]:
def compute_accuracy(model, data_loader):
    """Return the classification accuracy of ``model`` on ``data_loader`` in percent.

    The model is switched to evaluation mode (and left in it) and gradients
    are disabled while iterating.

    Args:
        model: callable as ``model(input_ids, lengths)`` returning logits with
            one row per example.
        data_loader: iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'lengths'`` and ``'labels'``.

    Returns:
        0-dim float tensor holding the accuracy as a percentage (0-100).

    Raises:
        ValueError: if ``data_loader`` yields no examples, since the accuracy
            is undefined in that case.
    """
    model.eval()
    correct_pred, num_examples = 0, 0

    with torch.no_grad():
        for batch_data in data_loader:
            text = batch_data['input_ids']
            lengths = batch_data['lengths']
            labels = batch_data['labels']

            logits = model(text, lengths)
            predicted_labels = logits.argmax(dim=1)

            num_examples += labels.size(0)
            # Accumulate as a plain int so the counter has a single type no
            # matter how many batches (including zero) were seen.
            correct_pred += int((predicted_labels == labels).sum())

    if num_examples == 0:
        raise ValueError('compute_accuracy received a data_loader with no examples')

    return torch.tensor(correct_pred, dtype=torch.float32) / num_examples * 100

In [None]:

# Wall-clock reference for all timing messages below.
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        text, lengths, labels = (
            batch_data[key] for key in ('input_ids', 'lengths', 'labels')
        )

        # Forward pass and loss.
        logits = model(text, lengths)
        loss = F.cross_entropy(logits, labels)

        # Backward pass: clear stale gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the loss of every 50th mini-batch.
        if batch_idx % 50 == 0:
            print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                  f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                  f'Loss: {loss:.4f}')

    # End-of-epoch evaluation on the full training and validation sets.
    with torch.no_grad():
        train_accuracy = compute_accuracy(model, train_loader)
        valid_accuracy = compute_accuracy(model, valid_loader)
    print(f'training accuracy: {train_accuracy:.2f}%'
          f'\nvalid accuracy: {valid_accuracy:.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader):.2f}%')

Epoch: 001/010 | Batch 000/266 | Loss: 0.6934
Epoch: 001/010 | Batch 050/266 | Loss: 0.6920


In [None]:
# NOTE(review): start_time was set when the training cell started, so the
# figure printed here also includes any time that passed between the end of
# training and the execution of this cell.
elapsed_minutes = (time.time() - start_time) / 60
print(f'Total Training Time: {elapsed_minutes:.2f} min')

test_accuracy = compute_accuracy(model, test_loader)
print(f'Test accuracy: {test_accuracy:.2f}%')

Total Training Time: 36.77 min
Test accuracy: 78.49%


In [None]:
def predict_sentiment(model, sentence):
    """Return the model's probability that ``sentence`` is positive.

    Args:
        model: callable as ``model(input_ids, lengths)`` returning one row of
            class logits per input sequence.
        sentence: raw text to classify.

    Returns:
        float: softmax probability of class index 1.
    """
    model.eval()

    # Tokenize exactly as during dataset preparation: fixed width of 128 ids.
    encoded = tokenizer(
        sentence,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    token_ids = encoded['input_ids']

    # The reported length is the (padded) width of the encoded sequence.
    lengths = torch.tensor([token_ids.size(1)], dtype=torch.long)

    with torch.no_grad():
        class_probabilities = torch.softmax(model(token_ids, lengths), dim=1)

    return class_probabilities[0][1].item()

# Smoke test: score one short review with the trained model.
print('Probability for positive:')
positive_probability = predict_sentiment(model, "Its good!")
print(positive_probability)


Probability positive:
0.7435232996940613
