<a href="https://colab.research.google.com/github/VictoorV/movie_classif_lstm/blob/main/Film_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torchtext==0.15.2

Collecting torchtext==0.15.2
  Downloading torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting torch==2.0.1 (from torchtext==0.15.2)
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchdata==0.6.1 (from torchtext==0.15.2)
  Downloading torchdata-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1->torchtex

In [2]:
pip install portalocker

Collecting portalocker
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker
Successfully installed portalocker-3.1.1


In [39]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = get_tokenizer('basic_english')

In [31]:
train_iter = IMDB(split='train')
test_iter = IMDB(split='test')

for label, text in train_iter:
    print(f"Label: {label}, Texte: {text[:200]}...\n")
    break

Label: 1, Texte: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev...



In [32]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocabulary = build_vocab_from_iterator(
    yield_tokens(IMDB(split='train')),
    specials=["<pad>", "<unk>"])
vocabulary.set_default_index(vocabulary["<unk>"])

In [63]:
print(len(vocabulary))
print(vocabulary['<pad>'])
print(vocabulary["."])
print(vocabulary.get_itos()[0])

100684
0
3
<pad>


In [17]:
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

In [53]:
sequences = [
    (2, "This movie was really great !"),
    (1, "I am not sure about the scenario."),
    (1, "It was not as good as I expected"),
    (2, "The actors were good.")
]

In [60]:
def collate_batch(batch):
  labels, samples = zip(*batch)
  labels = torch.tensor(labels, dtype=torch.int64) - 1
  processed_text = [torch.tensor(vocabulary(tokenizer(sample)), dtype=torch.int64) for sample in samples]
  processed_text = pad_sequence(processed_text, batch_first=True, padding_value=0)
  return labels, processed_text

In [61]:
collate_batch(sequences)

(tensor([1, 0, 0, 1]),
 tensor([[  14,   21,   17,   72,   93,   36,    0,    0],
         [  13,  246,   29,  254,   50,    2, 2652,    3],
         [  11,   17,   29,   18,   57,   18,   13,  853],
         [   2,  162,   77,   57,    3,    0,    0,    0]]))

In [64]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True, collate_fn=collate_batch)

In [None]:
class LSTMModel(torch.nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 num_classes):
        super().__init__()

        # Embedding field
        self.embedding = torch.nn.EmbeddingBag(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size)

        # LSTM cell
        self.rnn = torch.nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size)

        # Fully connected output
        self.fc = torch.nn.Linear(
            hidden_size, num_classes)

    def forward(self, text_sequence, offsets):
        # Extract embedding vectors
        embeddings = self.embedding(
            text_sequence, offsets)

        h_t, c_t = self.rnn(embeddings)

        return self.fc(h_t)

In [None]:
model = LSTMModel(
    vocab_size=len(vocabulary),
    embedding_size=64,
    hidden_size=64,
    num_classes=2)
model.to(device)

LSTMModel(
  (embedding): EmbeddingBag(68811, 64, mode='mean')
  (rnn): LSTM(64, 64)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)

In [None]:
model(samples, offsets)

tensor([[-0.0258, -0.1070],
        [-0.0338, -0.1114],
        [-0.0308, -0.1197]], grad_fn=<AddmmBackward0>)

In [None]:
embedding = torch.nn.EmbeddingBag(
            num_embeddings=len(vocabulary),
            embedding_dim=64)

In [None]:
sentence1 = ["i"]
sentence2 = ["o"]
emb1 = (embedding(torch.tensor([vocabulary[word] for word in sentence1]), torch.tensor([0])) + embedding(torch.tensor([vocabulary[word] for word in sentence2]), torch.tensor([0]))) / 2
emb1

tensor([[-1.8664, -0.4567,  0.3141, -0.1556,  0.0163,  1.3712, -0.1701,  0.2778,
         -0.4237,  0.2656, -0.5882, -1.1098, -0.0335, -0.2935, -0.3800, -0.3045,
          0.0258,  0.1059,  0.6513, -0.5410, -0.0226,  0.2351, -0.2023,  0.2163,
          0.4305, -0.2674, -0.0916, -0.6420,  1.0534, -0.3949,  0.2352,  0.0269,
         -0.5869, -0.3753,  0.2741,  0.2824,  0.3127, -0.3130,  0.4537,  0.2205,
         -1.0100, -0.3071, -0.6559, -0.3089, -0.1188, -0.1077,  0.8178, -0.5964,
         -0.9895, -0.9986,  0.6122,  1.3127, -0.3148, -0.5196,  0.0114,  0.1211,
          0.3754,  0.2024, -0.2701, -0.0778,  0.4079,  0.2358, -0.5978,  0.4146]],
       grad_fn=<DivBackward0>)

In [None]:
sentence = ["i", "o"]
emb2 = embedding(torch.tensor([vocabulary[word] for word in sentence]), torch.tensor([0]))
emb2

tensor([[-1.8664, -0.4567,  0.3141, -0.1556,  0.0163,  1.3712, -0.1701,  0.2778,
         -0.4237,  0.2656, -0.5882, -1.1098, -0.0335, -0.2935, -0.3800, -0.3045,
          0.0258,  0.1059,  0.6513, -0.5410, -0.0226,  0.2351, -0.2023,  0.2163,
          0.4305, -0.2674, -0.0916, -0.6420,  1.0534, -0.3949,  0.2352,  0.0269,
         -0.5869, -0.3753,  0.2741,  0.2824,  0.3127, -0.3130,  0.4537,  0.2205,
         -1.0100, -0.3071, -0.6559, -0.3089, -0.1188, -0.1077,  0.8178, -0.5964,
         -0.9895, -0.9986,  0.6122,  1.3127, -0.3148, -0.5196,  0.0114,  0.1211,
          0.3754,  0.2024, -0.2701, -0.0778,  0.4079,  0.2358, -0.5978,  0.4146]],
       grad_fn=<EmbeddingBagBackward0>)

In [None]:
emb3 = embedding(samples, offsets)
emb3.shape

torch.Size([3, 64])

In [None]:
model = torch.nn.LSTM(
            input_size=64,
            hidden_size=64)

In [None]:
model(emb3)

(tensor([[ 0.0115,  0.0661, -0.0380,  0.0048,  0.0971, -0.0240,  0.0258, -0.0097,
           0.0539,  0.1063,  0.0462,  0.0199,  0.0019,  0.0048, -0.0120,  0.0817,
           0.0036, -0.0164, -0.0205, -0.1330,  0.0435,  0.0163, -0.0500, -0.0859,
          -0.0403, -0.0006,  0.0363,  0.0721,  0.0613,  0.0279,  0.0826,  0.0775,
           0.0979,  0.0611, -0.0626,  0.0768,  0.0114,  0.0166,  0.0557, -0.0022,
          -0.0427,  0.0508, -0.0753, -0.0155, -0.0181,  0.0291,  0.0247,  0.0293,
          -0.0934,  0.0317,  0.1167,  0.0143,  0.0903,  0.0906, -0.1311,  0.0290,
           0.0263, -0.0328, -0.0298, -0.0304, -0.0500, -0.0012, -0.0649,  0.0244],
         [ 0.0168,  0.1037, -0.0860,  0.0758,  0.1072, -0.0474, -0.0260, -0.0338,
          -0.0400,  0.0824,  0.0333,  0.0111,  0.0433, -0.0592, -0.0192,  0.0420,
           0.0507, -0.0307,  0.0935, -0.0480,  0.0145,  0.0184, -0.0645, -0.0929,
           0.0341, -0.0146,  0.0212,  0.1205,  0.1311,  0.0501,  0.0434,  0.1144,
           0.14