<a href="https://colab.research.google.com/github/VictoorV/movie_classif_lstm/blob/main/Film_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torchtext==0.15.2

Collecting torchtext==0.15.2
  Downloading torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting torch==2.0.1 (from torchtext==0.15.2)
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchdata==0.6.1 (from torchtext==0.15.2)
  Downloading torchdata-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1->torchtext==0.15.2)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1->torchtex

In [2]:
pip install portalocker

Collecting portalocker
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker
Successfully installed portalocker-3.1.1


In [3]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [4]:
# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable on "Restart & Run All". GPU kernels may still introduce small
# run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torchtext's 'basic_english' tokenizer; used both to build the vocabulary and
# to encode batches.
tokenizer = get_tokenizer('basic_english')

In [5]:
# Load both IMDB splits; the split tuple is spelled out so the unpacking order
# (train first, then test) is visible at the call site.
train_iter, test_iter = IMDB(split=("train", "test"))

def yield_tokens(data_iter):
    """Lazily tokenize the text of every ``(label, text)`` pair in ``data_iter``.

    The label is ignored; one tokenizer result is yielded per pair, in order.
    Relies on the module-level ``tokenizer``.
    """
    yield from (tokenizer(review) for _label, review in data_iter)

# Build the token -> index mapping from the training split only. The special
# tokens are inserted first, so "<pad>" ends up at index 0 (printed below).
vocabulary = build_vocab_from_iterator(
    iterator=yield_tokens(train_iter),
    specials=["<pad>", "<unk>"],
    special_first=True,
)
# Tokens that were never seen during training resolve to the "<unk>" index.
vocabulary.set_default_index(vocabulary["<unk>"])

# Sanity output: vocabulary size, the pad index, and the token stored at index 0.
print(len(vocabulary), vocabulary['<pad>'], vocabulary.get_itos()[0], sep="\n")

100684
0
<pad>


In [6]:
# Convert both iterable splits into map-style datasets (len() is used right
# below, and the DataLoaders index into them).
train_dataset, test_dataset = map(to_map_style_dataset, (train_iter, test_iter))
print(
    f'Taille du train_dataset : {len(train_dataset)}',
    f'Taille du test_dataset : {len(test_dataset)}',
    sep="\n",
)

Taille du train_dataset : 25000
Taille du test_dataset : 25000


In [7]:
# Whether batches are ordered by decreasing text length. The value is captured
# as the default of ``collate_batch`` when the function is defined.
sort = True


def collate_batch(batch, sort=sort):
    """Turn a list of ``(label, text)`` pairs into padded tensors.

    Relies on the module-level ``tokenizer`` and ``vocabulary``.

    Args:
        batch: iterable of ``(label, text)`` pairs; labels are 1/2 and are
            shifted to 0/1.
        sort: when true, reorder the batch by decreasing token count.

    Returns:
        ``(labels, inputs, text_lengths)``:
        ``labels`` is an int64 tensor of shape ``(batch,)`` with values
        ``label - 1``; ``inputs`` is an int64 tensor of shape
        ``(batch, max_len)`` padded with the ``"<pad>"`` index;
        ``text_lengths`` is an int64 CPU tensor with the unpadded length of
        each row.
    """
    labels, samples = zip(*batch)
    labels = torch.tensor(labels, dtype=torch.int64) - 1

    pad_index = vocabulary["<pad>"]
    unk_index = vocabulary["<unk>"]

    # Tokenize each sample once and derive both the ids and the length from it
    # (the previous version ran the tokenizer twice per sample).
    encoded = []
    for sample in samples:
        ids = vocabulary(tokenizer(sample))
        if not ids:
            # A zero-length row would make pack_padded_sequence raise later;
            # represent an empty text as a single unknown token instead.
            ids = [unk_index]
        encoded.append(torch.tensor(ids, dtype=torch.int64))

    text_lengths = torch.tensor([row.size(0) for row in encoded], dtype=torch.int64)
    inputs = pad_sequence(encoded, batch_first=True, padding_value=pad_index)

    if sort:
        # Keep labels and inputs aligned with the re-ordered lengths.
        text_lengths, indices = torch.sort(text_lengths, descending=True)
        labels = labels[indices]
        inputs = inputs[indices]
    return labels, inputs, text_lengths

In [8]:
# Single source of truth for the mini-batch size of both loaders.
BATCH_SIZE = 64

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
# Evaluation metrics are sums over the whole split, so shuffling adds nothing;
# a fixed order keeps per-batch evaluation repeatable between runs.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [9]:
class LSTMModel(torch.nn.Module):
    """Bidirectional LSTM text classifier: embedding -> LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimension of each token embedding.
        hidden_size: hidden units per LSTM direction.
        num_classes: number of output logits.
        dropout: dropout probability between stacked LSTM layers.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 num_classes,
                 dropout,
                 num_layers):
        super().__init__()

        # Token id -> dense vector lookup.
        self.embedding = torch.nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size)

        # Stacked bidirectional LSTM over (batch, seq, feature) inputs.
        self.rnn = torch.nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            bidirectional=True,
            num_layers=num_layers,
            # Inter-layer dropout has no effect with a single layer and only
            # triggers a PyTorch warning, so disable it in that case.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True)

        # Classifier over the concatenated forward/backward final states.
        self.fc = torch.nn.Linear(
            2 * hidden_size, num_classes)

    def forward(self, text_tokens, text_lengths):
        """Compute class logits for a padded batch.

        Args:
            text_tokens: integer tensor of token ids, shape ``(batch, seq)``.
            text_lengths: unpadded length of each row (tensor or sequence of
                ints); rows may be in any order.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        embeddings = self.embedding(text_tokens)

        # pack_padded_sequence requires the lengths on the CPU.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()

        # enforce_sorted=False lets PyTorch sort/unsort internally, so the
        # model no longer depends on a notebook-level ``sort`` flag and accepts
        # batches in any order.
        packed = pack_padded_sequence(
            embeddings, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.rnn(packed)

        # h_n[-2] / h_n[-1] are the last layer's forward and backward final
        # hidden states; concatenate them along the feature dimension.
        return self.fc(torch.cat((h_n[-2], h_n[-1]), dim=1))

In [10]:
def train_model(model, cost_function, optimizer, data_loader, scheduler=None):
    model.to(device)

    model.train()

    current_loss = 0.0
    current_acc = 0.0

    for i, (labels, inputs, text_lengths) in enumerate(data_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs, text_lengths)
        _, predictions = torch.max(outputs, 1)
        loss = cost_function(outputs, labels)

        loss.backward()
        optimizer.step()

        current_loss += loss.item() * labels.size(0)
        current_acc += torch.sum(predictions == labels.data)

    total_loss = current_loss / len(data_loader.dataset)
    total_acc = current_acc.double() / len(data_loader.dataset)

    print('Train Loss: {:.4f}; Accuracy: {:.4f}'.format(total_loss, total_acc))

    if scheduler:
        scheduler.step()

def test_model(model, cost_function, data_loader):
    model.to(device)

    model.eval()

    current_loss = 0.0
    current_acc = 0.0

    for i, (labels, inputs, text_lengths) in enumerate(data_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs, text_lengths)
            _, predictions = torch.max(outputs, 1)
            loss = cost_function(outputs, labels)

        current_loss += loss.item() * labels.size(0)
        current_acc += torch.sum(predictions == labels.data)

    total_loss = current_loss / len(data_loader.dataset)
    total_acc = current_acc.double() / len(data_loader.dataset)

    print('Test Loss: {:.4f}; Accuracy: {:.4f}'.format(total_loss, total_acc))

In [29]:
# Number of training epochs; also used as the cosine annealing period below.
num_epoch = 8

# Two-layer bidirectional LSTM classifier over the full vocabulary, created
# and moved to the selected device in one expression.
model = LSTMModel(
    vocab_size=len(vocabulary),
    embedding_size=100,
    hidden_size=64,
    num_classes=2,
    dropout=0.3,
    num_layers=2,
).to(device)

# Cross-entropy over the two class logits, optimised with AdamW.
cost_fn = torch.nn.CrossEntropyLoss()
optim = torch.optim.AdamW(params=model.parameters(), lr=1e-3, weight_decay=1e-4)

# Anneal the learning rate from 1e-3 towards 1e-5 over the whole run.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optim,
    T_max=num_epoch,
    eta_min=1e-5,
)

In [30]:
# One training pass followed by one evaluation pass per epoch; each call
# prints its own loss / accuracy line.
for epoch in range(num_epoch):
    print(f'Epoch: {epoch + 1}')
    train_model(
        model=model,
        cost_function=cost_fn,
        optimizer=optim,
        data_loader=train_dataloader,
        scheduler=scheduler,
    )
    test_model(model=model, cost_function=cost_fn, data_loader=test_dataloader)

Epoch: 1
Train Loss: 0.5868; Accuracy: 0.6786
Test Loss: 0.5074; Accuracy: 0.7530
Epoch: 2
Train Loss: 0.5460; Accuracy: 0.7210
Test Loss: 0.4765; Accuracy: 0.7856
Epoch: 3
Train Loss: 0.3996; Accuracy: 0.8198
Test Loss: 0.3985; Accuracy: 0.8257
Epoch: 4
Train Loss: 0.2956; Accuracy: 0.8784
Test Loss: 0.3596; Accuracy: 0.8489
Epoch: 5
Train Loss: 0.2183; Accuracy: 0.9144
Test Loss: 0.3241; Accuracy: 0.8687
Epoch: 6
Train Loss: 0.1766; Accuracy: 0.9342
Test Loss: 0.3299; Accuracy: 0.8720
Epoch: 7
Train Loss: 0.1527; Accuracy: 0.9450
Test Loss: 0.3516; Accuracy: 0.8625
Epoch: 8
Train Loss: 0.1385; Accuracy: 0.9513
Test Loss: 0.3395; Accuracy: 0.8733


In [68]:
# Hand-written (label, text) pairs using the same 1/2 label convention that
# collate_batch shifts to 0/1 (here 2 marks the positive reviews).
sequences = [
    (2, "This movie was really great!"),
    (1, "I am not sure about the scenario, the movie was empty and boring."),
    (1, "The movie was shit and I do not recommend it."),
    (2, "The actors were good, it was satisfying.")
]

# With sort enabled the batch comes back ordered by decreasing length, so the
# predictions printed below line up with `labels`, not with `sequences`.
labels, inputs, text_lengths = collate_batch(sequences, sort)
print(text_lengths)

# Do not rely on whichever mode the model was left in by an earlier cell:
# switch to eval mode (disables dropout) and skip gradient tracking.
model.eval()
with torch.no_grad():
    logits = model(inputs.to(device), text_lengths)
print(torch.max(logits, 1)[1])

tensor([15, 11,  9,  6])
tensor([0, 0, 1, 1], device='cuda:0')


In [64]:
# Display the 0/1 labels returned by collate_batch for the demo batch, to
# compare against the predicted classes printed by the previous cell.
labels

tensor([0, 0, 1, 1])