In [3]:
import torch
import random
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

### Getting the dataset from HF


In [4]:
dataset = load_dataset('amazon_polarity')

README.md:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/260M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [5]:
dataset['train'][0]

{'label': 1,
 'title': 'Stuning even for the non-gamer',
 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}

### Taking a small part of the dataset as it contains around 4M rows

In [6]:
sample_size = 40000
train_indices = random.sample(range(len(dataset['train'])), sample_size)
small_train_dataset = dataset['train'].select(train_indices)

In [7]:
small_train_dataset[0]

{'label': 0,
 'title': 'Why?',
 'content': "Why did anyone feel the need to make this movie? It's really bad. It's the only Bruce Willis movie I don't like."}

In [8]:
sample_size = 10000
test_indices = random.sample(range(len(dataset['test'])), sample_size)
small_test_dataset = dataset['test'].select(test_indices)

In [9]:
small_test_dataset[0]

{'label': 0,
 'title': 'Beginners only',
 'content': 'This is for beginners only. There are a lot of better books than this on gin. The only redeeming value to me of this book is the description of how to play persian rummy 4 person partnership game. Also, it has a lot of dated material on the game. It scores the game 10 for knock, 20 for gin, 20 for boxes.'}

### Getting BERT tokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [11]:
def tokenize_function(batch):
    return tokenizer(batch['content'], truncation=True)

### Tokenizing the dataset

In [12]:
small_train_dataset = small_train_dataset.map(tokenize_function, batched=True, remove_columns=["content", "title"])

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [13]:
small_test_dataset = small_test_dataset.map(tokenize_function, batched=True, remove_columns=["content", "title"])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

### Padding & Data Loader

In [14]:
def collate_batch(batch):
    texts = [torch.tensor(example['input_ids']) for example in batch]
    labels = torch.tensor([example['label'] for example in batch], dtype=torch.float)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    return texts_padded, labels

In [15]:
train_loader = DataLoader(small_train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(small_test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

### LSTM Classifier

In [16]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        return torch.sigmoid(self.fc(hidden[-1]))

### Setting up of hyperparams

In [17]:
vocab_size = tokenizer.vocab_size
embedding_dim = 100
hidden_dim = 128
output_dim = 1
pad_idx = tokenizer.pad_token_id

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [19]:
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx).to(device)

In [20]:
model

LSTMClassifier(
  (embedding): Embedding(30522, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [21]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

### Training Loop

In [22]:
def train(model, iterator, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0

    for texts, labels in iterator:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(texts).squeeze()
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += ((predictions > 0.5).float() == labels).sum().item()

    return total_loss / len(iterator), correct / len(iterator.dataset)

### Evaluation Loop

In [23]:
def evaluate(model, iterator, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for texts, labels in iterator:
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts).squeeze()
            loss = criterion(predictions, labels)

            total_loss += loss.item()
            correct += ((predictions > 0.5).float() == labels).sum().item()

    return total_loss / len(iterator), correct / len(iterator.dataset)

### Training the Model

In [24]:
n_epochs = 10
for epoch in range(n_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 01, Train Loss: 0.6932, Train Acc: 0.5031, Test Loss: 0.6925, Test Acc: 0.5112
Epoch: 02, Train Loss: 0.6935, Train Acc: 0.5057, Test Loss: 0.6927, Test Acc: 0.5106
Epoch: 03, Train Loss: 0.6931, Train Acc: 0.5074, Test Loss: 0.6921, Test Acc: 0.5117
Epoch: 04, Train Loss: 0.6925, Train Acc: 0.5064, Test Loss: 0.6878, Test Acc: 0.5432
Epoch: 05, Train Loss: 0.6862, Train Acc: 0.5364, Test Loss: 0.6844, Test Acc: 0.5235
Epoch: 06, Train Loss: 0.6812, Train Acc: 0.5475, Test Loss: 0.6685, Test Acc: 0.6042
Epoch: 07, Train Loss: 0.6380, Train Acc: 0.6433, Test Loss: 0.5507, Test Acc: 0.7427
Epoch: 08, Train Loss: 0.3953, Train Acc: 0.8264, Test Loss: 0.3535, Test Acc: 0.8497
Epoch: 09, Train Loss: 0.2740, Train Acc: 0.8913, Test Loss: 0.3225, Test Acc: 0.8649
Epoch: 10, Train Loss: 0.2044, Train Acc: 0.9252, Test Loss: 0.3492, Test Acc: 0.8631


### Accuracy on Test Set

In [25]:
print(f"Accuracy = {test_acc*100}%")

Accuracy = 86.31%


### Inference for testing on New Data

In [26]:
def preprocess(texts, tokenize, device):
  encoded_inputs = [tokenizer(text, truncation=True, padding='max_length', max_length=128) for text in texts]

  input_ids = [torch.tensor(inputs['input_ids']) for inputs in encoded_inputs]
  input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)

  input_ids_padded = input_ids_padded.to(device)

  return input_ids_padded

In [27]:
model.eval()

LSTMClassifier(
  (embedding): Embedding(30522, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

### Knowingly testing on 2 opposite polarity Reviews

In [28]:
sample = ["This product is amazing! Totally recommend it.",
             "It broke after one use. Not worth the money."]

In [29]:
input_ids_padded = preprocess(sample, tokenizer, device)

In [30]:
with torch.no_grad():
    predictions = model(input_ids_padded).squeeze()
    predicted_labels = (predictions > 0.5).float()

### Testing on New Product Reviews

In [31]:
for i, text in enumerate(sample):
    sentiment = "Positive" if predicted_labels[i] == 1 else "Negative"
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

Text: This product is amazing! Totally recommend it.
Predicted Sentiment: Positive

Text: It broke after one use. Not worth the money.
Predicted Sentiment: Negative



### Saving the Model Weights

In [32]:
model_save_path = "lstm.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model weights saved to {model_save_path}")

Model weights saved to lstm.pth
