<a href="https://colab.research.google.com/github/OverCat2000/text_classification_cnn_rnn_lstm/blob/main/lstm_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gensim
import pathlib
import glob
import os
from random import shuffle
import pickle
from tqdm.auto import tqdm

from nltk.tokenize import TreebankWordTokenizer

import numpy as np
import pandas as pd

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

In [16]:
!rm -r word2vec-google-news-300
!git lfs install
!git clone https://huggingface.co/fse/word2vec-google-news-300

Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.
Cloning into 'word2vec-google-news-300'...
remote: Enumerating objects: 11, done.[K
remote: Total 11 (delta 0), reused 0 (delta 0), pack-reused 11 (from 1)[K
Unpacking objects: 100% (11/11), 1.51 KiB | 773.00 KiB/s, done.
Filtering content: 100% (2/2), 3.52 GiB | 17.49 MiB/s, done.


In [18]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2024-06-21 07:41:05--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: 'aclImdb_v1.tar.gz.1'


2024-06-21 07:41:23 (4.53 MB/s) - 'aclImdb_v1.tar.gz.1' saved [84125825/84125825]


gzip: stdin: unexpected end of file
tar: Unexpected EOF in archive
tar: Unexpected EOF in archive
tar: Error is not recoverable: exiting now


In [19]:
# --- Paths ---------------------------------------------------------------
# Relative to the working directory, i.e. where the clone / download cells put
# their output. An absolute `/kaggle/working/...` path only resolves on Kaggle.
model_path = 'word2vec-google-news-300/word2vec-google-news-300.model'

train_path = pathlib.Path('aclImdb/train')
pos_path = train_path / 'pos'
neg_path = train_path / 'neg'

# --- Hyper-parameters ----------------------------------------------------
fixed_len = 600        # every review is padded / truncated to this many tokens
batch_size = 32        # DataLoader batch size
embedding_dims = 300   # not referenced in the visible cells
filters = 250          # passed to LSTMnet as the LSTM hidden size
kernel_size = 3        # passed to LSTMnet as `kernel`
hidden_dims = 250      # not referenced in the visible cells
epochs = 2             # not referenced in the visible cells (training uses NUM_EPOCHS)

# --- Device --------------------------------------------------------------
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [20]:
# model = gensim.models.KeyedVectors.load(model_path)

In [21]:
def data(file_path):
  """Load the labelled reviews found under ``file_path``.

  Reads every ``*.txt`` file in ``file_path/pos`` (label 1) and
  ``file_path/neg`` (label 0).

  Args:
    file_path: directory (``str`` or ``pathlib.Path``) containing the
      ``pos`` and ``neg`` sub-directories.

  Returns:
    A list of ``(label, text)`` tuples, shuffled in place with
    ``random.shuffle``.
  """
  dataset = []

  for label, folder in ((1, 'pos'), (0, 'neg')):
    # sorted(): glob order is filesystem dependent; a fixed pre-shuffle order
    # lets a seeded `random` reproduce the same shuffle.
    for filename in sorted(glob.glob(os.path.join(file_path, folder, '*.txt'))):
      # Explicit encoding so decoding does not depend on the platform default.
      with open(filename, 'r', encoding='utf-8') as f:
        dataset.append((label, f.read()))

  shuffle(dataset)

  return dataset

In [54]:
# Load every training review as a shuffled list of (label, text) tuples.
dataset = data(train_path)

In [55]:
# Sanity check: number of reviews loaded.
len(dataset)

25000

In [70]:
def tokenizer(text):
    """Split ``text`` into a list of tokens using NLTK's ``TreebankWordTokenizer``."""
    return TreebankWordTokenizer().tokenize(text)

In [71]:
def vocabulary(dataset):
  """Tokenize every sample and build the token -> id mapping.

  Args:
    dataset: iterable of ``(label, text)`` samples; only ``sample[1]`` is read.

  Returns:
    A tuple ``(max_len, word2idx, tokenized_texts)`` where ``max_len`` is the
    length of the longest tokenized text (0 for an empty dataset), ``word2idx``
    maps each token to an integer id (``'<PAD>'`` is 0, ``'<UNK>'`` is 1, other
    tokens are numbered from 2 in order of first appearance) and
    ``tokenized_texts`` holds one token list per sample, in input order.
  """
  word2idx = {'<PAD>': 0, '<UNK>': 1}
  tokenized_texts = []
  max_len = 0

  for sample in dataset:
    tokens = tokenizer(sample[1])
    tokenized_texts.append(tokens)
    max_len = max(max_len, len(tokens))

    for token in tokens:
      # setdefault inserts only unseen tokens; len(word2idx) is the next free id.
      word2idx.setdefault(token, len(word2idx))

  return max_len, word2idx, tokenized_texts

In [72]:
def encoder(fixed_len, tokenized_texts, word2idx):
  """Convert token lists into fixed-length rows of token ids.

  Each text is truncated to ``fixed_len`` tokens or right-padded with the
  ``'<PAD>'`` id. Tokens missing from ``word2idx`` map to the ``'<UNK>'`` id
  instead of ``None``, so unseen words at inference time still give an integer
  array. The input token lists are not modified.

  Args:
    fixed_len: number of ids per output row.
    tokenized_texts: iterable of token lists.
    word2idx: token -> id mapping, expected to contain ``'<PAD>'`` and ``'<UNK>'``.

  Returns:
    ``np.ndarray`` with one row of ``fixed_len`` ids per input text.
  """
  unk_id = word2idx.get('<UNK>')
  pad_id = word2idx.get('<PAD>', unk_id)
  input_ids = []

  for tokenized_text in tokenized_texts:
    # Slicing copies, so the caller's list is neither truncated nor padded in place.
    input_id = [word2idx.get(token, unk_id) for token in tokenized_text[:fixed_len]]
    # A non-positive repeat count yields [], so full-length rows are left unchanged.
    input_id.extend([pad_id] * (fixed_len - len(input_id)))
    input_ids.append(input_id)

  return np.array(input_ids)

In [73]:
def load_pretrained_vectors(model_path, word2idx):
  """Build an embedding matrix for ``word2idx`` from pretrained gensim vectors.

  Rows start as uniform random values in [-0.25, 0.25) and are overwritten
  with the pretrained vector when the word exists in the loaded model. The
  ``'<PAD>'`` row, if present, is all zeros. Prints the percentage of
  vocabulary entries that had a pretrained vector.

  Args:
    model_path: path to a saved ``gensim.models.KeyedVectors`` model.
    word2idx: token -> row-index mapping.

  Returns:
    ``np.ndarray`` of shape ``(len(word2idx), model.vector_size)``.
  """
  model = gensim.models.KeyedVectors.load(model_path)
  print("model loaded")
  d = model.vector_size
  embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
  available_count = 0

  for word, idx in word2idx.items():
    try:
      embeddings[idx] = model[word]
    except KeyError:
      # No pretrained vector for this word: keep its random initialisation.
      continue
    available_count += 1

  # Zero the padding row after the copy loop so it can never be overwritten
  # by a pretrained vector that happens to share the '<PAD>' key.
  if '<PAD>' in word2idx:
    embeddings[word2idx['<PAD>']] = np.zeros(d)

  # Guard the percentage against an empty vocabulary (division by zero).
  coverage = available_count / len(word2idx) * 100 if word2idx else 0.0
  print(f"availabel words: {coverage}")

  return embeddings

In [74]:
def Labels(dataset):
    """Collect the label (first element) of every sample into a NumPy array."""
    return np.array([sample[0] for sample in dataset])

In [75]:
def main(model_path, dataset):
  """Run the preprocessing pipeline and build the train / validation loaders.

  Tokenizes ``dataset``, encodes it to ``fixed_len`` ids per review, loads the
  pretrained embeddings and splits the data 80 / 20 in its current order.
  Reads the notebook-level ``fixed_len`` and ``batch_size`` settings.

  Args:
    model_path: path to the saved gensim ``KeyedVectors`` model.
    dataset: list of ``(label, text)`` samples.

  Returns:
    ``(train_dataloader, val_dataloader, embeddings, word2idx)``; the training
    loader shuffles, the validation loader does not.
  """
  _max_len, word2idx, tokenized_texts = vocabulary(dataset)
  input_ids = encoder(fixed_len, tokenized_texts, word2idx)

  embeddings = load_pretrained_vectors(model_path, word2idx)
  embeddings = torch.tensor(embeddings)

  labels = Labels(dataset)
  train_size = int(len(input_ids) * 0.8)

  # Token ids are indices: keep them as int64. `torch.Tensor(...)` would cast
  # them to float32, which cannot represent every integer above 2**24.
  X_train = torch.tensor(input_ids[:train_size], dtype=torch.long)
  X_val = torch.tensor(input_ids[train_size:], dtype=torch.long)

  # Targets stay float32, matching what `torch.Tensor(labels)` produced.
  y_train = torch.tensor(labels[:train_size], dtype=torch.float32)
  y_val = torch.tensor(labels[train_size:], dtype=torch.float32)

  train_dataloader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
  val_dataloader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

  return train_dataloader, val_dataloader, embeddings, word2idx

In [76]:
# Run the full preprocessing pipeline; `vocab` is the token -> id mapping reused at inference time.
train_dataloader, val_dataloader, embeddings, vocab = main(model_path, dataset)

model loaded
availabel words: 51.5337586390218


In [30]:
class LSTMnet(nn.Module):
  """Bidirectional LSTM classifier on top of a trainable pretrained embedding.

  The LSTM output of every timestep is flattened and fed to a single linear
  layer, so inputs must contain exactly ``seq_len`` tokens per sample.

  Args:
    hidden_units: LSTM hidden size per direction.
    output_shape: number of output logits.
    kernel: unused; kept so existing positional calls keep working.
    pretrained_embeddings: 2-D tensor ``(vocab_size, embed_dim)`` used to
      initialise the embedding layer (fine-tuned, ``freeze=False``).
    seq_len: number of tokens per input sequence. Defaults to 600, which
      reproduces the previously hard-coded ``hidden_units * 1200`` input size.
  """

  def __init__(self, hidden_units, output_shape, kernel, pretrained_embeddings, seq_len=600):
    super().__init__()

    self.vocab_size, self.embed_dim = pretrained_embeddings.shape
    self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)

    self.lstm = nn.LSTM(input_size=self.embed_dim, hidden_size=hidden_units, batch_first=True, bidirectional=True)

    self.drop = nn.Dropout(0.5)

    self.flat = nn.Flatten(start_dim=1)
    # Bidirectional => 2 * hidden_units features per timestep, flattened over seq_len steps.
    self.fc = nn.Linear(in_features=hidden_units * 2 * seq_len, out_features=output_shape)

  def forward(self, x):
    """Return raw logits of shape ``(batch, output_shape)`` for token-id input ``x``."""
    # Ids may arrive as float tensors; cast to integer indices for the lookup,
    # then to float32 because the embedding weights may be float64.
    x = self.embedding(x.long()).float()
    x, _ = self.lstm(x)
    x = self.drop(x)
    x = self.flat(x)
    x = self.fc(x)
    return x

In [31]:
class BinaryAccuracy:
    """Callable metric: fraction of thresholded sigmoid predictions equal to the targets."""

    def __init__(self, threshold=0.5):
        self.threshold = threshold

    def __call__(self, logits, targets):
        """Return the accuracy as a Python float.

        ``logits`` is squeezed along dim 1 after the sigmoid; ``targets`` is
        compared element-wise against the resulting 0/1 predictions.
        """
        hard_preds = (torch.sigmoid(logits).squeeze(dim=1) >= self.threshold).float()
        n_correct = hard_preds.eq(targets).float().sum()
        return (n_correct / targets.numel()).item()

In [32]:
# Build the classifier: `filters` is passed as the LSTM hidden size and the model emits one logit.
lstm = LSTMnet(filters, 1, kernel_size, embeddings)
# The model's forward pass returns raw logits (no sigmoid), so the loss works on logits.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=lstm.parameters(), lr=0.001)
# Module-level metric; train_step / test_step read it as a global.
accuracy_fn = BinaryAccuracy()
# Move the model to the selected device; as the last expression this also displays the architecture.
lstm.to(device)

LSTMnet(
  (embedding): Embedding(150480, 300)
  (lstm): LSTM(300, 250, batch_first=True, bidirectional=True)
  (drop): Dropout(p=0.5, inplace=False)
  (flat): Flatten(start_dim=1, end_dim=-1)
  (fc): Linear(in_features=300000, out_features=1, bias=True)
)

In [33]:
def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Uses the notebook-level ``device`` and ``accuracy_fn``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for features, targets in dataloader:
        features, targets = features.to(device), targets.to(device)

        # Forward pass; the model output has a trailing dim of 1, the targets do not.
        logits = model(features)
        batch_loss = loss_fn(logits.squeeze(1), targets)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Accumulate per-batch metrics (accuracy uses the pre-update logits).
        loss_total += batch_loss.item()
        acc_total += accuracy_fn(logits, targets)

    n_batches = len(dataloader)
    return loss_total / n_batches, acc_total / n_batches

In [34]:
def test_step(model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module):
    """Evaluate ``model`` on ``dataloader`` without updating parameters.

    Uses the notebook-level ``device`` and ``accuracy_fn``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    # inference_mode disables gradient tracking for the whole evaluation pass.
    with torch.inference_mode():
        for features, targets in dataloader:
            features, targets = features.to(device), targets.to(device)

            logits = model(features)

            loss_total += loss_fn(logits.squeeze(1), targets).item()
            acc_total += accuracy_fn(logits, targets)

    n_batches = len(dataloader)
    return loss_total / n_batches, acc_total / n_batches

In [35]:
def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int = 5):
    """Alternate ``train_step`` and ``test_step`` for ``epochs`` epochs.

    Prints one summary line per epoch.

    Returns:
        Dict with keys ``train_loss``, ``train_acc``, ``test_loss`` and
        ``test_acc``, each a list holding one value per epoch.
    """
    metric_names = ("train_loss", "train_acc", "test_loss", "test_acc")
    results = {name: [] for name in metric_names}

    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                           dataloader=train_dataloader,
                                           loss_fn=loss_fn,
                                           optimizer=optimizer)
        test_loss, test_acc = test_step(model=model,
                                        dataloader=test_dataloader,
                                        loss_fn=loss_fn)

        # Per-epoch progress line.
        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f}"
        )

        # Record this epoch's metrics in the same order as metric_names.
        epoch_values = (train_loss, train_acc, test_loss, test_acc)
        for name, value in zip(metric_names, epoch_values):
            results[name].append(value)

    return results

In [103]:
# Seed PyTorch's RNG.
# NOTE(review): `lstm` is built in an earlier cell, so this seed presumably does not
# make the weight initialisation reproducible; it only affects randomness from here on.
torch.manual_seed(42)

# Number of passes over the training set for this run
NUM_EPOCHS = 10

# Start the wall-clock timer
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from timeit import default_timer as timer
start_time = timer()

# Train the LSTM classifier, evaluating on the validation loader after every epoch
model_0_results = train(model=lstm,
                        train_dataloader=train_dataloader,
                        test_dataloader=val_dataloader,
                        optimizer=optimizer,
                        loss_fn=loss_fn,
                        epochs=NUM_EPOCHS)

# Stop the timer and report the elapsed training time
end_time = timer()
print(f"Total training time: {end_time-start_time:.3f} seconds")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.4615 | train_acc: 0.7905 | test_loss: 0.3235 | test_acc: 0.8666
Epoch: 2 | train_loss: 0.0600 | train_acc: 0.9806 | test_loss: 0.3752 | test_acc: 0.8692
Epoch: 3 | train_loss: 0.0076 | train_acc: 0.9988 | test_loss: 0.4801 | test_acc: 0.8718
Epoch: 4 | train_loss: 0.0018 | train_acc: 0.9999 | test_loss: 0.4622 | test_acc: 0.8746
Epoch: 5 | train_loss: 0.0008 | train_acc: 0.9999 | test_loss: 0.5364 | test_acc: 0.8762
Epoch: 6 | train_loss: 0.0004 | train_acc: 1.0000 | test_loss: 0.7447 | test_acc: 0.8712
Epoch: 7 | train_loss: 0.0001 | train_acc: 1.0000 | test_loss: 0.6899 | test_acc: 0.8786


KeyboardInterrupt: 

In [104]:
def make_predictions(model, data):
    """Predict a binary class for each sample in ``data``.

    Each sample is run through ``model`` on its own (a batch dimension of 1 is
    added), the logit is passed through a sigmoid and thresholded at 0.5.

    Args:
        model: module returning one logit per sample.
        data: iterable of single-sample tensors, e.g. a 2-D tensor iterated row by row.

    Returns:
        List of ``0`` / ``1`` predictions, one per sample.
    """
    # Use the device the model actually lives on rather than a notebook-level
    # global, so the function works on its own and can never mix devices.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    pred_class = []

    model.eval()

    with torch.inference_mode():
        for sample in data:
            sample = torch.unsqueeze(sample, dim=0)
            if target_device is not None:
                sample = sample.to(target_device)

            pred_probs = torch.sigmoid(model(sample))

            pred_class.append(1 if pred_probs >= 0.5 else 0)

    return pred_class

In [105]:
# Take the first validation batch once and unpack features and labels together
# (the previous version built two separate iterators for the same batch).
sample_features, sample_labels = next(iter(val_dataloader))

pred = make_predictions(lstm, sample_features)

# Separate statements: `print(a), print(b)` built a tuple that the notebook
# echoed as a stray `(None, None)` output.
print(pred)
print(sample_labels.int().tolist())

[1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1]
[1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1]


(None, None)

In [140]:
# Ad-hoc inference on a hand-written review, stored as a (label, text) tuple like the training data.
sample = [(0, "the movies was about a real life amazing story about a person.it was amazing")]

sample_tokens = [tokenizer(text) for _, text in sample]

# Encode with the training vocabulary and the same `fixed_len` the model was built for.
# Named `sample_ids` rather than `data` so the `data()` loader function is not overwritten,
# which would break re-running the dataset-loading cell.
sample_ids = torch.Tensor(np.array(encoder(fixed_len, sample_tokens, vocab), dtype="float32"))

make_predictions(lstm, sample_ids)

[1]