In [2]:
!pip install 'portalocker>=2.0.0'
!pip install 'pytorch-lightning'

Collecting portalocker>=2.0.0
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.1.2-py3-none-any.whl (776 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.10.0 pytorch-lightning-2.1.2 torchmetrics-1.2.0


In [88]:
import torch
from tqdm import tqdm
import torchtext.datasets as datasets
from collections import Counter

In [89]:
train = datasets.IMDB('./', split="train")
test = datasets.IMDB('./', split="test")

In [90]:
train_data = []
for label, text in train:
  train_data.append((label, text))
print("Label Counts for Train Data: ", Counter([_[0] for _ in train_data]))

test_data = []
for label, text in test:
  test_data.append((label, text))
print("Label Counts for Train Data: ", Counter([_[0] for _ in test_data]))

Label Counts for Train Data:  Counter({1: 12500, 2: 12500})
Label Counts for Train Data:  Counter({1: 12500, 2: 12500})


In [91]:
train_data[0]

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

### Bag-of-Words Features Baseline Model

#### Let's preprocess the data first


In [92]:
# We want to remove all html tags
# Remove punctuation
# remove all the stopwords
####
# apply stemming on the words
# OR
# apply lemmatization on the words

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def remove_punctuation(text):
    new_text = []
    for i in text:
      if i not in string.punctuation and i not in string.digits:
        new_text.append(i)
      else:
        new_text.append(" ")
    return "".join(new_text)

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text).lower().strip()

def remove_stopwords(text):
    all_stop_words = set(stopwords.words('english'))
    new_text = []
    for word in re.split("\W+", text):
      if word not in all_stop_words:
        new_text.append(word)
    return new_text

def word_stemming(words):
    return [porter_stemmer.stem(word) for word in words]

def word_lemmatizer(words):
    return [lemmatizer.lemmatize(word) for word in words]

def convert_to_paragraph(words):
    return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [93]:
convert_to_paragraph(
    word_lemmatizer(
            remove_stopwords(
                remove_punctuation(
                    remove_html_tags(
                        train_data[0][1])))))

'rented curious yellow video store controversy surrounded first released also heard first seized u custom ever tried enter country therefore fan film considered controversial really see plot centered around young swedish drama student named lena want learn everything life particular want focus attention making sort documentary average swede thought certain political issue vietnam war race issue united state asking politician ordinary denizen stockholm opinion politics sex drama teacher classmate married men kill curious yellow year ago considered pornographic really sex nudity scene far even shot like cheaply made porno countryman mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scene film commend filmmaker fact sex shown film shown artistic purpose rather shock people make money shown pornographic theater america curious yellow good film anyone wanting study meat potato pun intended swedish cinema really f

In [94]:
train_data[0][1]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [95]:
TRAIN_DATA_SIZE = len(train_data)
for i in tqdm(range(TRAIN_DATA_SIZE)):
  label, text = train_data[i]
  processed_text = convert_to_paragraph(
    word_lemmatizer(
            remove_stopwords(
                remove_punctuation(
                    remove_html_tags(
                        text)))))
  train_data[i] = (label, processed_text)


  0%|          | 0/25000 [00:00<?, ?it/s][A
  0%|          | 93/25000 [00:00<00:27, 903.08it/s][A
  1%|          | 184/25000 [00:00<00:35, 707.73it/s][A
  1%|          | 258/25000 [00:00<00:37, 658.64it/s][A
  1%|▏         | 326/25000 [00:00<00:39, 624.48it/s][A
  2%|▏         | 390/25000 [00:00<00:39, 626.55it/s][A
  2%|▏         | 454/25000 [00:00<00:40, 602.59it/s][A
  2%|▏         | 515/25000 [00:00<00:43, 556.95it/s][A
  2%|▏         | 572/25000 [00:00<00:45, 539.79it/s][A
  3%|▎         | 627/25000 [00:01<00:45, 529.91it/s][A
  3%|▎         | 681/25000 [00:01<00:47, 509.54it/s][A
  3%|▎         | 733/25000 [00:01<00:47, 510.23it/s][A
  3%|▎         | 785/25000 [00:01<00:49, 487.54it/s][A
  3%|▎         | 843/25000 [00:01<00:47, 512.42it/s][A
  4%|▎         | 906/25000 [00:01<00:44, 544.25it/s][A
  4%|▍         | 961/25000 [00:01<00:44, 545.46it/s][A
  4%|▍         | 1021/25000 [00:01<00:42, 558.37it/s][A
  4%|▍         | 1088/25000 [00:01<00:40, 590.03it/s][A
 

In [96]:
TEST_DATA_SIZE = len(test_data)
for i in tqdm(range(TEST_DATA_SIZE)):
  label, text = test_data[i]
  processed_text = convert_to_paragraph(
    word_lemmatizer(
        word_stemming(
            remove_stopwords(
                remove_punctuation(
                    remove_html_tags(
                        text))))))
  test_data[i] = (label, processed_text)


  0%|          | 0/25000 [00:00<?, ?it/s][A
  0%|          | 21/25000 [00:00<02:01, 206.24it/s][A
  0%|          | 42/25000 [00:00<02:14, 185.40it/s][A
  0%|          | 61/25000 [00:00<02:32, 163.07it/s][A
  0%|          | 78/25000 [00:00<02:35, 160.13it/s][A
  0%|          | 95/25000 [00:00<02:51, 145.12it/s][A
  0%|          | 113/25000 [00:00<02:41, 153.90it/s][A
  1%|          | 129/25000 [00:00<02:47, 148.70it/s][A
  1%|          | 145/25000 [00:00<02:50, 145.59it/s][A
  1%|          | 168/25000 [00:01<02:28, 167.03it/s][A
  1%|          | 190/25000 [00:01<02:17, 180.23it/s][A
  1%|          | 211/25000 [00:01<02:12, 187.48it/s][A
  1%|          | 230/25000 [00:01<02:15, 182.38it/s][A
  1%|          | 251/25000 [00:01<02:11, 188.05it/s][A
  1%|          | 281/25000 [00:01<01:52, 219.51it/s][A
  1%|▏         | 321/25000 [00:01<01:30, 271.47it/s][A
  1%|▏         | 354/25000 [00:01<01:26, 285.09it/s][A
  2%|▏         | 385/25000 [00:01<01:25, 288.51it/s][A
  2%|▏ 

### Now that we have our corpus ready we can prepare the training features and the Label
#### As we discussed Earlier in the chapter we will use a TF-IDF Vectorizer

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

In [98]:
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=1000)

In [99]:
train_features_text = [sample[1] for sample in train_data]
test_features_text = [sample[1] for sample in test_data]

tfidf.fit(train_features_text)

In [100]:
train_features = tfidf.transform(train_features_text)
test_features = tfidf.transform(test_features_text)

In [101]:
ohe_enc = OneHotEncoder()

In [102]:
train_label_raw = [[sample[0]] for sample in train_data]
test_label_raw = [[sample[0]] for sample in test_data]

ohe_enc.fit(train_label_raw + test_label_raw)

In [103]:
train_label = ohe_enc.transform(train_label_raw)
test_label = ohe_enc.transform(test_label_raw)

In [104]:
train_features.shape, train_label.shape

((25000, 1000), (25000, 2))

### Words Associated to each Feature Index

In [105]:
tfidf.get_feature_names_out()

array(['ability', 'able', 'absolutely', 'accent', 'across', 'act',
       'acted', 'acting', 'action', 'actor', 'actress', 'actual',
       'actually', 'adaptation', 'add', 'admit', 'adult', 'adventure',
       'age', 'ago', 'agree', 'air', 'alien', 'almost', 'alone', 'along',
       'already', 'also', 'although', 'always', 'amazing', 'america',
       'american', 'among', 'amount', 'amusing', 'animal', 'animated',
       'animation', 'annoying', 'another', 'answer', 'anyone', 'anything',
       'anyway', 'apart', 'apparently', 'appeal', 'appear', 'appearance',
       'appears', 'appreciate', 'around', 'art', 'artist', 'ask',
       'aspect', 'atmosphere', 'attack', 'attempt', 'attention',
       'audience', 'average', 'avoid', 'award', 'away', 'awful', 'baby',
       'back', 'background', 'bad', 'badly', 'band', 'based', 'basic',
       'basically', 'battle', 'beautiful', 'beauty', 'became', 'become',
       'becomes', 'begin', 'beginning', 'behind', 'belief', 'believable',
       'be

# Simple Feed Forward Neural Network Model

In [113]:
import numpy as np
import torch
import torch.nn as nn
import torchmetrics
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import pytorch_lightning as pl

class Net(pl.LightningModule):

    def __init__(self):
        super(Net, self).__init__()
        self.criterion = torch.nn.BCELoss()
        self.train_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=2)
        self.valid_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=2)
        self.fc1 = nn.Linear(1000, 120)  # input dimension with 2000 words/tokens
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.sigmoid(self.fc3(x))
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        accuracy = self.train_acc(logits.round(), y)
        self.log('train_loss', loss)
        self.log('train_acc', accuracy, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        accuracy = self.valid_acc(logits.round(), y)
        self.log('val_loss', loss)
        self.log('val_acc', accuracy, on_epoch=True)

    def configure_optimizers(self):
        opt = torch.optim.Adam(self.parameters(), lr=1e-3)
        return opt

net = Net()
print(net)

Net(
  (criterion): BCELoss()
  (train_acc): MulticlassAccuracy()
  (valid_acc): MulticlassAccuracy()
  (fc1): Linear(in_features=1000, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=2, bias=True)
)


In [107]:
train_features_tensor = torch.from_numpy(train_features.todense().astype('float32'))
train_labels_tensor = torch.from_numpy(train_label.todense().astype('float32'))

In [108]:
test_features_tensor = torch.from_numpy(test_features.todense().astype('float32'))
test_labels_tensor = torch.from_numpy(test_label.todense().astype('float32'))

In [109]:
class CustomDataset(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

  def __len__(self):
    return len(self.X)

train_dataset = CustomDataset(train_features_tensor, train_labels_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

test_dataset = CustomDataset(test_features_tensor, test_labels_tensor)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [114]:
trainer = pl.Trainer(max_epochs=5)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [115]:
trainer.validate(net, test_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 0.6937711834907532, 'val_acc': 0.5}]

In [116]:
trainer.fit(net, train_dataloader)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type               | Params
-------------------------------------------------
0 | criterion | BCELoss            | 0     
1 | train_acc | MulticlassAccuracy | 0     
2 | valid_acc | MulticlassAccuracy | 0     
3 | fc1       | Linear             | 120 K 
4 | fc2       | Linear             | 10.2 K
5 | fc3       | Linear             | 170   
-------------------------------------------------
130 K     Trainable params
0         Non-trainable params
130 K     Total params
0.522     Total estimated model params size (MB)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers whi

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [117]:
trainer.validate(net, test_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 0.4267631471157074, 'val_acc': 0.8017399907112122}]

## Results:
- Accuracy of 85.60%
- Bag of Words Features

# LSTM Model

In [57]:
# Preprocessing
train = datasets.IMDB('./', split="train")
test = datasets.IMDB('./', split="test")

from collections import Counter

train_data = []
for label, text in train:
  train_data.append((label, text))
print("Label Counts for Train Data: ", Counter([_[0] for _ in train_data]))

test_data = []
for label, text in test:
  test_data.append((label, text))
print("Label Counts for Train Data: ", Counter([_[0] for _ in test_data]))

Label Counts for Train Data:  Counter({1: 12500, 2: 12500})
Label Counts for Train Data:  Counter({1: 12500, 2: 12500})


In [59]:
# We want to remove all html tags
# Remove punctuation
# remove all the stopwords
# apply stemming on the words
# apply lemmatization on the words

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

MAX_WORDS_PER_REVIEW = 500

def remove_punctuation(text):
    return "".join([i for i in text if i not in string.punctuation and i not in string.digits])

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text).lower().strip()

def word_stemming(words):
    if type(words) == str:
      words = words.split()
    return [porter_stemmer.stem(word) for word in words]

def word_lemmatizer(words):
    if type(words) == str:
      words = words.split()
    return [lemmatizer.lemmatize(word) for word in words]

def padding(words):
    if type(words) == str:
      words = words.split()

    if len(words) < MAX_WORDS_PER_REVIEW:
      words.extend(["<PAD>"]*(MAX_WORDS_PER_REVIEW - len(words)))
    return " ".join(words[:MAX_WORDS_PER_REVIEW])

def convert_to_paragraph(words):
    return padding(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
TRAIN_DATA_SIZE = len(train_data)
for i in tqdm(range(TRAIN_DATA_SIZE)):
  label, text = train_data[i]
  #processed_text = convert_to_paragraph(remove_html_tags(text))
  processed_text = convert_to_paragraph(word_lemmatizer(remove_punctuation(remove_html_tags(text))))
  train_data[i] = (label, processed_text)


  0%|          | 0/25000 [00:00<?, ?it/s][A
  0%|          | 98/25000 [00:00<00:25, 975.07it/s][A
  1%|          | 212/25000 [00:00<00:23, 1071.32it/s][A
  1%|▏         | 320/25000 [00:00<00:23, 1071.60it/s][A
  2%|▏         | 428/25000 [00:00<00:23, 1052.55it/s][A
  2%|▏         | 534/25000 [00:00<00:26, 940.17it/s] [A
  3%|▎         | 630/25000 [00:00<00:26, 930.46it/s][A
  3%|▎         | 725/25000 [00:00<00:26, 916.63it/s][A
  3%|▎         | 818/25000 [00:00<00:26, 904.59it/s][A
  4%|▎         | 927/25000 [00:00<00:25, 954.61it/s][A
  4%|▍         | 1032/25000 [00:01<00:24, 977.70it/s][A
  5%|▍         | 1152/25000 [00:01<00:22, 1040.88it/s][A
  5%|▌         | 1260/25000 [00:01<00:22, 1049.88it/s][A
  5%|▌         | 1370/25000 [00:01<00:22, 1064.12it/s][A
  6%|▌         | 1477/25000 [00:01<00:23, 988.11it/s] [A
  6%|▋         | 1578/25000 [00:01<00:28, 822.95it/s][A
  7%|▋         | 1666/25000 [00:01<00:30, 752.83it/s][A
  7%|▋         | 1746/25000 [00:01<00:33, 69

In [61]:
TEST_DATA_SIZE = len(test_data)
for i in tqdm(range(TEST_DATA_SIZE)):
  label, text = test_data[i]
  #processed_text = convert_to_paragraph(remove_html_tags(text))
  processed_text = convert_to_paragraph(word_lemmatizer(remove_punctuation(remove_html_tags(text))))
  test_data[i] = (label, processed_text)


  0%|          | 0/25000 [00:00<?, ?it/s][A
  0%|          | 57/25000 [00:00<00:45, 554.17it/s][A
  0%|          | 113/25000 [00:00<00:49, 502.28it/s][A
  1%|          | 164/25000 [00:00<00:51, 479.79it/s][A
  1%|          | 220/25000 [00:00<00:49, 502.85it/s][A
  1%|          | 271/25000 [00:00<00:49, 503.43it/s][A
  1%|▏         | 334/25000 [00:00<00:45, 539.98it/s][A
  2%|▏         | 389/25000 [00:00<00:48, 504.29it/s][A
  2%|▏         | 440/25000 [00:00<00:50, 482.25it/s][A
  2%|▏         | 490/25000 [00:00<00:50, 486.72it/s][A
  2%|▏         | 541/25000 [00:01<00:49, 492.90it/s][A
  2%|▏         | 595/25000 [00:01<00:48, 506.21it/s][A
  3%|▎         | 660/25000 [00:01<00:44, 545.79it/s][A
  3%|▎         | 715/25000 [00:01<00:45, 528.04it/s][A
  3%|▎         | 776/25000 [00:01<00:44, 549.55it/s][A
  3%|▎         | 832/25000 [00:01<00:46, 520.76it/s][A
  4%|▎         | 888/25000 [00:01<00:45, 531.13it/s][A
  4%|▍         | 943/25000 [00:01<00:45, 532.22it/s][A
  4

In [62]:
train_data[-100]

(2,
 'i cant remember many film where a bumbling idiot of a hero wa so funny throughout leslie cheung is such the antithesis of a hero that he too dense to be seduced by a gorgeous vampire i had the good luck to see it on a big screen and to find a video to watch again and again <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [63]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [64]:
tokenizer = get_tokenizer("basic_english")

def build_vocab(ds):
  for _, text in ds:
    yield tokenizer(text)

vocab = build_vocab_from_iterator(build_vocab(train_data + test_data), min_freq=1, specials=['<UNK>'])
vocab.set_default_index(vocab["<UNK>"])

In [65]:
tokens = tokenizer(train_data[-100][1])
indexes = vocab(tokens)

print(tokens)
print(indexes)
print(len(tokens), len(indexes))

['i', 'cant', 'remember', 'many', 'film', 'where', 'a', 'bumbling', 'idiot', 'of', 'a', 'hero', 'wa', 'so', 'funny', 'throughout', 'leslie', 'cheung', 'is', 'such', 'the', 'antithesis', 'of', 'a', 'hero', 'that', 'he', 'too', 'dense', 'to', 'be', 'seduced', 'by', 'a', 'gorgeous', 'vampire', 'i', 'had', 'the', 'good', 'luck', 'to', 'see', 'it', 'on', 'a', 'big', 'screen', 'and', 'to', 'find', 'a', 'video', 'to', 'watch', 'again', 'and', 'again', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<

In [66]:
import numpy as np

def make_label(label):
  if label == 1:
    return [1.0, 0.0]
  return [0.0, 1.0]

train_data_features = torch.LongTensor([vocab(tokenizer(i[1]))[::-1] for i in train_data])
train_data_labels = torch.Tensor([make_label(i[0]) for i in train_data])

test_data_features = torch.LongTensor([vocab(tokenizer(i[1]))[::-1] for i in test_data])
test_data_labels = torch.Tensor([make_label(i[0]) for i in test_data])

In [67]:
train_data_features.shape, train_data_labels.shape

(torch.Size([25000, 500]), torch.Size([25000, 2]))

In [68]:
test_data_features.shape, test_data_labels.shape

(torch.Size([25000, 500]), torch.Size([25000, 2]))

In [75]:
import numpy as np
import torch
import torch.nn as nn
import torchmetrics
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import pytorch_lightning as pl

EMBED_LEN = 32
HIDDEN_DIM = 64
NUM_LAYERS = 1

class LSTMNet(pl.LightningModule):

    def __init__(self, num_target_classes):
        super(LSTMNet, self).__init__()
        self.criterion = torch.nn.CrossEntropyLoss()
        self.train_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=2)
        self.valid_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=2)
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=EMBED_LEN)
        self.lstm = nn.LSTM(input_size=EMBED_LEN, hidden_size=HIDDEN_DIM, num_layers=NUM_LAYERS, batch_first=True)
        self.linear1 = nn.Linear(HIDDEN_DIM, HIDDEN_DIM//2)
        self.linear2 = nn.Linear(HIDDEN_DIM//2, num_target_classes)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        # hidden, carry = torch.randn(NUM_LAYERS, len(X_batch), HIDDEN_DIM), torch.randn(NUM_LAYERS, len(X_batch), HIDDEN_DIM)
        output, (hidden, carry) = self.lstm(embeddings) #, (hidden, carry))
        x = F.relu(self.linear1(hidden[-1,:,:])) # output[:,-1]
        return F.sigmoid(self.linear2(x))

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        accuracy = self.train_acc(logits.round(), y)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('train_acc', accuracy, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        accuracy = self.valid_acc(logits.round(), y)
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log('val_acc', accuracy, on_epoch=True)

    def configure_optimizers(self):
        opt = torch.optim.Adam(self.parameters(), lr=1e-3)
        return opt

lstm = LSTMNet(2)
print(lstm)

LSTMNet(
  (criterion): CrossEntropyLoss()
  (train_acc): MulticlassAccuracy()
  (valid_acc): MulticlassAccuracy()
  (embedding_layer): Embedding(196750, 32)
  (lstm): LSTM(32, 64, batch_first=True)
  (linear1): Linear(in_features=64, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=2, bias=True)
)


In [76]:
class CustomDataset(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

  def __len__(self):
    return len(self.X)

train_dataset = CustomDataset(train_data_features, train_data_labels)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

test_dataset = CustomDataset(test_data_features, test_data_labels)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [85]:
trainer = pl.Trainer(max_epochs=25, accelerator="gpu", devices=[0])

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [78]:
trainer.validate(lstm, test_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 0.6932412385940552, 'val_acc': 0.5}]

In [86]:
trainer.fit(lstm, train_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name            | Type               | Params
-------------------------------------------------------
0 | criterion       | CrossEntropyLoss   | 0     
1 | train_acc       | MulticlassAccuracy | 0     
2 | valid_acc       | MulticlassAccuracy | 0     
3 | embedding_layer | Embedding          | 6.3 M 
4 | lstm            | LSTM               | 25.1 K
5 | linear1         | Linear             | 2.1 K 
6 | linear2         | Linear             | 66    
-------------------------------------------------------
6.3 M     Trainable params
0         Non-trainable params
6.3 M     Total params
25.293    Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=25` reached.


In [87]:
trainer.validate(lstm, test_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 0.5273974537849426, 'val_acc': 0.7818599939346313}]