<a href="https://colab.research.google.com/github/OverCat2000/text_classification_cnn_rnn_lstm/blob/main/rnn_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gensim
import pathlib
import glob
import os
from random import shuffle
import pickle
from tqdm.auto import tqdm

from nltk.tokenize import TreebankWordTokenizer

import numpy as np
import pandas as pd

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

from google.colab import drive
# Mount Google Drive at /content/drive; `model_path` in the config cell points
# at a file under this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2024-06-19 16:03:36--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-06-19 16:03:42 (14.0 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [17]:
# --- Paths -------------------------------------------------------------------
# Saved word2vec KeyedVectors on the mounted Google Drive.
model_path = '/content/drive/MyDrive/word2vec-google-news-300/word2vec-google-news-300.model'

# Training split of the extracted archive, one sub-directory per class.
train_path = pathlib.Path('aclImdb') / 'train'
pos_path, neg_path = train_path / 'pos', train_path / 'neg'

# --- Hyperparameters -----------------------------------------------------------
fixed_len = 600        # every review is padded / truncated to this many tokens
batch_size = 32        # DataLoader batch size
embedding_dims = 300
filters = 250          # passed to RNNnet as `hidden_units` further down
kernel_size = 3        # passed to RNNnet as `kernel` further down
hidden_dims = 250
epochs = 2

# --- Device --------------------------------------------------------------------
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: Tesla T4


In [18]:
# model = gensim.models.KeyedVectors.load(model_path)

In [22]:
def data(file_path):
  """Load every labelled review under ``file_path`` and return them shuffled.

  Args:
    file_path: Directory (``pathlib.Path`` or ``str``) that contains a ``pos``
      and a ``neg`` sub-directory holding ``*.txt`` files.

  Returns:
    A list of ``(label, text)`` tuples, where ``label`` is 1 for files from
    ``pos`` and 0 for files from ``neg``. The list is shuffled in place with
    ``random.shuffle`` before being returned, so the order is not reproducible
    unless the ``random`` module has been seeded by the caller.
  """
  root = pathlib.Path(file_path)  # also accept plain string paths
  dataset = []

  for label, folder in ((1, 'pos'), (0, 'neg')):
    for filename in glob.glob(os.path.join(root / folder, '*.txt')):
      # Explicit encoding so the result does not depend on the platform's
      # default locale (assumes the review files are UTF-8 -- TODO confirm).
      with open(filename, 'r', encoding='utf-8') as f:
        dataset.append((label, f.read()))

  shuffle(dataset)

  return dataset

In [23]:
# Read the training reviews as a shuffled list of (label, text) tuples.
dataset = data(train_path)

In [24]:
# Sanity check: how many reviews were loaded.
len(dataset)

25000

In [25]:
def vocabulary(dataset, tokenizer=None):
  """Tokenize every sample and build a token -> index mapping.

  Args:
    dataset: Iterable of samples whose element ``[1]`` is the raw text.
    tokenizer: Optional object exposing ``tokenize(text) -> list of tokens``.
      Defaults to a fresh ``TreebankWordTokenizer`` when omitted.

  Returns:
    A tuple ``(max_len, word2idx, tokenized_texts)``:
      * ``max_len`` - length of the longest tokenized text (0 if there is none),
      * ``word2idx`` - dict mapping each token to an integer id; ``'<PAD>'`` is
        0, ``'<UNK>'`` is 1 and the remaining tokens are numbered from 2 in
        order of first appearance,
      * ``tokenized_texts`` - the token list of every sample, in input order.
  """
  if tokenizer is None:
    tokenizer = TreebankWordTokenizer()

  word2idx = {'<PAD>': 0, '<UNK>': 1}
  tokenized_texts = []
  max_len = 0

  for sample in dataset:
    tokens = tokenizer.tokenize(sample[1])
    tokenized_texts.append(tokens)

    for token in tokens:
      if token not in word2idx:
        # Ids are dense, so the next free id is always the current dict size.
        word2idx[token] = len(word2idx)

    max_len = max(max_len, len(tokens))

  return max_len, word2idx, tokenized_texts

In [26]:
def encoder(fixed_len, tokenized_texts, word2idx):
  """Convert token lists into fixed-length rows of token ids.

  Each text is truncated to ``fixed_len`` tokens or right-padded with the
  ``'<PAD>'`` id up to ``fixed_len``. Tokens missing from ``word2idx`` are
  mapped to the ``'<UNK>'`` id instead of ``None``. The input lists are not
  modified.

  Args:
    fixed_len: Number of ids per output row.
    tokenized_texts: Iterable of token lists.
    word2idx: Mapping from token to integer id, containing ``'<PAD>'`` and
      ``'<UNK>'`` entries.

  Returns:
    ``np.ndarray`` with one row of ``fixed_len`` ids per input text.
  """
  pad_id = word2idx.get('<PAD>')
  unk_id = word2idx.get('<UNK>')
  input_ids = []

  for tokenized_text in tokenized_texts:
    # Slicing builds a new list, so the caller's token list stays untouched.
    ids = [word2idx.get(token, unk_id) for token in tokenized_text[:fixed_len]]
    # A non-positive multiplier yields an empty list, so full rows get no padding.
    ids.extend([pad_id] * (fixed_len - len(ids)))
    input_ids.append(ids)

  return np.array(input_ids)

In [30]:
def load_pretrained_vectors(model_path, word2idx):
  """Build an embedding matrix for ``word2idx`` from saved gensim KeyedVectors.

  Rows start as uniform random values in [-0.25, 0.25). Every word that the
  loaded vectors contain gets its pretrained vector; the ``'<PAD>'`` row is
  always all zeros and is never overwritten by a pretrained vector. The share
  of vocabulary entries found in the pretrained vectors is printed as a
  percentage.

  Args:
    model_path: Path passed to ``gensim.models.KeyedVectors.load``.
    word2idx: Mapping from token to row index of the returned matrix.

  Returns:
    ``np.ndarray`` of shape ``(len(word2idx), vector_size)``.
  """
  model = gensim.models.KeyedVectors.load(model_path)
  print("model loaded")
  d = model.vector_size
  embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
  available_count = 0

  for word, idx in word2idx.items():
    if word == '<PAD>':
      # Padding must stay a zero vector even if the model has such a key.
      embeddings[idx] = np.zeros(d)
      continue

    try:
      vector = model[word]
    except KeyError:
      # Word unknown to the pretrained vectors: keep its random row.
      continue

    embeddings[idx] = vector
    available_count += 1

  # Guard against an empty vocabulary (would otherwise divide by zero).
  coverage = available_count / len(word2idx) * 100 if word2idx else 0.0
  print(f"availabel words: {coverage}")

  return embeddings

In [31]:
def Labels(dataset):
    """Return the first element (the label) of every sample as a NumPy array."""
    return np.array([sample[0] for sample in dataset])

In [32]:
def main(model_path, dataset):
  """Turn the raw dataset into train/validation loaders plus an embedding matrix.

  Tokenizes and encodes the texts, loads the pretrained vectors, and splits
  the samples 80/20 (in their current order) into a training and a validation
  set. Reads the module-level ``fixed_len`` and ``batch_size`` settings.

  Args:
    model_path: Path of the saved vectors, forwarded to
      ``load_pretrained_vectors``.
    dataset: List of ``(label, text)`` samples.

  Returns:
    ``(train_dataloader, val_dataloader, embeddings)`` where the training
    loader shuffles its batches, the validation loader does not, and
    ``embeddings`` is a float32 tensor with one row per vocabulary entry.
  """
  _, word2idx, tokenized_texts = vocabulary(dataset)  # max_len is not needed here
  input_ids = encoder(fixed_len, tokenized_texts, word2idx)

  # float32 is what the network computes in; keeping float64 only doubles memory.
  embeddings = torch.tensor(load_pretrained_vectors(model_path, word2idx), dtype=torch.float32)

  labels = Labels(dataset)
  train_size = int(len(input_ids) * 0.8)

  # Token ids are indices, so keep them as integers rather than float32
  # (`torch.Tensor(...)` would silently convert them to floats).
  X_train = torch.as_tensor(input_ids[:train_size], dtype=torch.long)
  X_val = torch.as_tensor(input_ids[train_size:], dtype=torch.long)

  # Targets stay float: the binary cross-entropy loss expects float targets.
  y_train = torch.as_tensor(labels[:train_size], dtype=torch.float32)
  y_val = torch.as_tensor(labels[train_size:], dtype=torch.float32)

  train_dataloader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
  val_dataloader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

  return train_dataloader, val_dataloader, embeddings

In [33]:
# Build the train/validation loaders and the embedding matrix
# (this loads the pretrained vectors from `model_path`).
train_dataloader, val_dataloader, embeddings = main(model_path, dataset)

model loaded
availabel words: 51.49792462228125


In [63]:
class RNNnet(nn.Module):
  """Bidirectional RNN text classifier on top of frozen pretrained embeddings.

  The RNN outputs of all time steps are flattened and fed to a single linear
  layer, so the model only accepts inputs of exactly ``seq_len`` tokens.
  ``forward`` returns raw scores; no sigmoid is applied inside the model.

  Args:
    hidden_units: Hidden size of the RNN (per direction).
    output_shape: Number of output features of the final linear layer.
    kernel: Unused; kept so existing calls keep working.
    pretrained_embeddings: 2-D tensor ``(vocab_size, embed_dim)`` used as the
      frozen embedding table.
    seq_len: Number of tokens per input sequence. Defaults to 600, which
      reproduces the previously hard-coded ``hidden_units * 1200`` input size.
  """

  def __init__(self, hidden_units, output_shape, kernel, pretrained_embeddings, seq_len=600):
    super().__init__()

    self.vocab_size, self.embed_dim = pretrained_embeddings.shape
    self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)

    self.rnn = nn.RNN(input_size=self.embed_dim, hidden_size=hidden_units, batch_first=True, bidirectional=True)

    self.flat = nn.Flatten(start_dim=1)
    # Bidirectional => 2 * hidden_units features per step, flattened over seq_len steps.
    self.fc = nn.Linear(in_features=2 * hidden_units * seq_len, out_features=output_shape)

  def forward(self, x):
    """Map a batch of token ids ``(batch, seq_len)`` to scores ``(batch, output_shape)``."""
    # Ids may arrive as floats; cast to int for the lookup and make sure the
    # embedded values are float32 before they reach the RNN.
    x = self.embedding(x.int()).float()
    x, _ = self.rnn(x)  # keep per-step outputs, drop the final hidden state
    x = self.flat(x)
    x = self.fc(x)
    return x

In [64]:
class BinaryAccuracy:
    """Callable metric: fraction of thresholded sigmoid outputs equal to the targets."""

    def __init__(self, threshold=0.5):
        # Probability at or above which a sample counts as the positive class.
        self.threshold = threshold

    def __call__(self, logits, targets):
        """Return the accuracy as a Python float.

        ``logits`` must have a second dimension of size 1 (it is squeezed
        away); ``targets`` holds the reference 0/1 values.
        """
        # Logits -> probabilities, dropping the trailing singleton dimension.
        probs = torch.sigmoid(logits).squeeze(dim=1)
        # Threshold into hard 0./1. predictions.
        hard_preds = (probs >= self.threshold).float()
        # Count the matches and normalise by the number of targets.
        n_hits = hard_preds.eq(targets).float().sum()
        return (n_hits / targets.numel()).item()

In [68]:
# Build the network, the loss, the metric and the optimiser.
rnn = RNNnet(
    hidden_units=filters,
    output_shape=1,
    kernel=kernel_size,
    pretrained_embeddings=embeddings,
)
loss_fn = nn.BCEWithLogitsLoss()
accuracy_fn = BinaryAccuracy()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.001)

# Move the model to the selected device; being the last expression, the
# returned module is displayed as the cell output.
rnn.to(device)

RNNnet(
  (embedding): Embedding(150575, 300)
  (rnn): RNN(300, 250, batch_first=True, bidirectional=True)
  (flat): Flatten(start_dim=1, end_dim=-1)
  (fc): Linear(in_features=300000, out_features=1, bias=True)
)

In [69]:
def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer):
    """Run one optimisation pass over ``dataloader``.

    Uses the module-level ``device`` (where batches are moved) and
    ``accuracy_fn`` (metric called with the raw model output and the targets).

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of batches.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for features, targets in dataloader:
        features, targets = features.to(device), targets.to(device)

        # Forward pass and loss on the squeezed (batch,) scores.
        logits = model(features)
        batch_loss = loss_fn(logits.squeeze(1), targets)

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Accuracy is measured on the predictions made before this step's update.
        running_acc += accuracy_fn(logits, targets)

    n_batches = len(dataloader)
    return running_loss / n_batches, running_acc / n_batches

In [70]:
def test_step(model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module):
    # Put model in eval mode
    model.eval()

    # Setup test loss and test accuracy values
    test_loss, test_acc = 0, 0

    # Turn on inference context manager
    with torch.inference_mode():
        # Loop through DataLoader batches
        for batch, (X, y) in enumerate(dataloader):
            X = X.to(device)
            y = y.to(device)
            # 1. Forward pass
            test_pred_logits = model(X)

            # 2. Calculate and accumulate loss
            loss = loss_fn(test_pred_logits.squeeze(1), y)
            test_loss += loss.item()

            # Calculate and accumulate accuracy
            test_acc += accuracy_fn(test_pred_logits, y)

    # Adjust metrics to get average loss and accuracy per batch
    test_loss = test_loss / len(dataloader)
    test_acc = test_acc / len(dataloader)
    return test_loss, test_acc

In [71]:
def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int = 5):
    """Alternate ``train_step`` and ``test_step`` for ``epochs`` epochs.

    After every epoch the four metrics are printed and recorded.

    Returns:
        Dict with the keys ``"train_loss"``, ``"train_acc"``, ``"test_loss"``
        and ``"test_acc"``, each holding one value per epoch.
    """
    metric_names = ("train_loss", "train_acc", "test_loss", "test_acc")
    results = {name: [] for name in metric_names}

    for epoch in tqdm(range(epochs)):
        # One pass over the training data, then one over the evaluation data.
        train_loss, train_acc = train_step(model=model,
                                           dataloader=train_dataloader,
                                           loss_fn=loss_fn,
                                           optimizer=optimizer)
        test_loss, test_acc = test_step(model=model,
                                        dataloader=test_dataloader,
                                        loss_fn=loss_fn)

        # Progress line for this epoch.
        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f}"
        )

        # Record this epoch's numbers under their metric names.
        epoch_values = (train_loss, train_acc, test_loss, test_acc)
        for name, value in zip(metric_names, epoch_values):
            results[name].append(value)

    return results

In [73]:
# Set random seeds
# NOTE(review): only torch is seeded, and only from this point on -- `rnn` is
# constructed in an earlier cell, so its initial weights are not governed by
# this seed.
torch.manual_seed(42)

# Set number of epochs
# (the `epochs` value from the config cell is not used for this run)
NUM_EPOCHS = 100

# Start the timer
from timeit import default_timer as timer
start_time = timer()

# Train the RNN; the returned dict holds per-epoch train/test loss and accuracy
model_0_results = train(model=rnn,
                        train_dataloader=train_dataloader,
                        test_dataloader=val_dataloader,
                        optimizer=optimizer,
                        loss_fn=loss_fn,
                        epochs=NUM_EPOCHS)

# End the timer and print out how long it took
end_time = timer()
print(f"Total training time: {end_time-start_time:.3f} seconds")

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 17.7189 | train_acc: 0.6024 | test_loss: 31.8744 | test_acc: 0.4865
Epoch: 2 | train_loss: 15.1633 | train_acc: 0.5024 | test_loss: 7.6943 | test_acc: 0.5026
Epoch: 3 | train_loss: 12.4173 | train_acc: 0.5278 | test_loss: 6.4787 | test_acc: 0.5352
Epoch: 4 | train_loss: 7.5783 | train_acc: 0.5939 | test_loss: 1.8110 | test_acc: 0.7532
Epoch: 5 | train_loss: 5.3940 | train_acc: 0.6508 | test_loss: 2.1120 | test_acc: 0.6943
Epoch: 6 | train_loss: 4.5964 | train_acc: 0.6707 | test_loss: 2.1623 | test_acc: 0.7600
Epoch: 7 | train_loss: 5.9877 | train_acc: 0.6691 | test_loss: 2.3910 | test_acc: 0.7781
Epoch: 8 | train_loss: 3.8154 | train_acc: 0.7155 | test_loss: 1.7865 | test_acc: 0.8067
Epoch: 9 | train_loss: 3.7378 | train_acc: 0.7253 | test_loss: 1.7411 | test_acc: 0.8135
Epoch: 10 | train_loss: 4.6935 | train_acc: 0.7073 | test_loss: 3.2431 | test_acc: 0.6885
Epoch: 11 | train_loss: 4.7368 | train_acc: 0.7161 | test_loss: 5.4152 | test_acc: 0.6447
Epoch: 12 | tra

In [79]:
def make_predictions(model, data):
    """Predict a 0/1 class for each sample in ``data``, one sample at a time.

    Every sample gets a leading batch dimension, is moved to the module-level
    ``device`` and passed through ``model``; the sigmoid of the output is
    compared against 0.5.

    Returns:
        List of ints (1 when the probability is >= 0.5, otherwise 0).
    """
    model.eval()
    pred_class = []

    with torch.inference_mode():
        for sample in data:
            # Add a batch dimension of size 1 before calling the model.
            single_batch = torch.unsqueeze(sample, dim=0).to(device)
            pred_probs = torch.sigmoid(model(single_batch))
            pred_class.append(1 if pred_probs >= 0.5 else 0)

    return pred_class

In [89]:
# Take the first validation batch once, so features and labels are guaranteed
# to come from the same batch (and only one iterator is created).
sample_features, sample_labels = next(iter(val_dataloader))

pred = make_predictions(rnn, sample_features)

# Two separate statements: `print(a), print(b)` as the last expression builds a
# tuple of the prints' return values, which the notebook then shows as
# `(None, None)`.
print(pred)
print(sample_labels.int().tolist())

[0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0]
[0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0]


(None, None)