In [26]:
import torch
from torch import nn
import numpy as np
import pandas as pd

import random
import math
from typing import Tuple, List
from tqdm.notebook import tqdm

import sys
sys.path.insert(0, '..')

from data_setup import IMDBMaskedBertDataset
from model import BERTMaskedLM
from scripts.scripts import Embedding, create_padding_mask

from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [27]:
# Location of the raw IMDB review CSV; read with pandas below and passed to the dataset class.
URL = "https://github.com/SK7here/Movie-Review-Sentiment-Analysis/raw/master/IMDB-Dataset.csv"

In [28]:
# Pull the raw reviews into a frame for a quick look; rows that cannot be
# parsed are skipped instead of aborting the read.
df = pd.read_csv(URL, on_bad_lines="skip", encoding="utf-8")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [29]:
import json
# Read the hyper-parameter file. The encoding is given explicitly so the file
# is decoded as UTF-8 regardless of the platform's default locale encoding.
with open("../config/base_config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

In [30]:
# Show the hyper-parameters that were just loaded.
config

{'batch_size': 64,
 'block_size': 64,
 'context_length': 512,
 'd_model': 384,
 'dropout': 0.1,
 'epochs': 10,
 'head_dim': 64,
 'learning_rate': 3e-05,
 'n_decoders': 6,
 'n_encoders': 6,
 'n_heads': 6}

## Training

In [31]:
# Ids of the special tokens, numbered consecutively from 0:
# padding, classification, separator, mask, unknown.
PAD_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN, UNK_TOKEN = range(5)

In [33]:
# Shorten the run for this notebook: overrides the epoch count loaded from the config file.
config.update(epochs=2)

In [34]:
def train_masked_lm(bert: BERTMaskedLM,
                    data_loader: DataLoader,
                    optimizer: torch.optim.Optimizer,
                    device: torch.device="cpu") -> Tuple[List[float], List[float]]:
    """Train ``bert`` on the masked-token objective for ``config["epochs"]`` epochs.

    Args:
        bert: Model called as ``bert(sentence, masked_token, masked_token_idx)``
            and expected to return ``(logits, loss)``.
        data_loader: Yields ``(sentence, masked_token, masked_token_idx)`` batches.
        optimizer: Optimizer that owns the parameters of ``bert``.
        device: Device the model and every batch are moved to.

    Returns:
        ``(losses, accs)``: per-epoch mean batch loss and per-epoch accuracy
        (fraction of masked tokens predicted correctly), both as Python floats.
    """
    losses: List[float] = []
    accs: List[float] = []
    bert = bert.to(device)
    bert.train()
    # An empty loader must not turn the per-epoch mean into a division by zero.
    n_batches = max(len(data_loader), 1)
    for epoch in tqdm(range(1, config["epochs"]+1)):
        epoch_loss = 0.0
        n_correct = 0
        n_seen = 0
        # Iterating the loader directly (not enumerate) lets tqdm show the total.
        for sentence, masked_token, masked_token_idx in tqdm(data_loader):
            sentence = sentence.to(device)
            masked_token = masked_token.to(device)
            masked_token_idx = masked_token_idx.to(device)

            logits, loss = bert(sentence, masked_token, masked_token_idx)

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            # Flatten both sides so a [B, 1] target cannot broadcast against a
            # [B] prediction, and so a batch of one keeps its batch dimension.
            targets = masked_token.reshape(-1)
            predictions = logits.detach().argmax(dim=-1).reshape(-1)
            n_correct += (predictions == targets).sum().item()
            # Count the samples actually seen: the last batch may be smaller
            # than config["batch_size"].
            n_seen += targets.numel()

        epoch_loss /= n_batches
        epoch_acc = n_correct / n_seen if n_seen else 0.0
        print(f"epoch {epoch}: loss: {epoch_loss:.4f} acc: {round(epoch_acc*100, 2)}%")
        losses.append(epoch_loss)
        accs.append(epoch_acc)

    return losses, accs

In [35]:
try:
  from torchinfo import summary
except:
  !pip install -q torchinfo
  from torchinfo import summary

In [36]:
# Build the masked-token dataset from the same CSV location used above.
masked_ds = IMDBMaskedBertDataset(path=URL)

In [37]:
# Number of entries in the dataset vocabulary; passed to the embedding and the model below.
vocab_size = len(masked_ds.vocab)

In [38]:
# Shuffled mini-batches over the masked-token dataset.
data_loader = DataLoader(masked_ds, batch_size=config["batch_size"], shuffle=True)

# Stand-alone embedding layer.
# NOTE(review): not referenced by any later cell shown here; confirm it is still needed.
bert_embedding = Embedding(config=config, vocab_size=vocab_size)

# The model to train, and the AdamW optimizer over its parameters.
bert_masked_lm = BERTMaskedLM(vocab_size=vocab_size, config=config)
optimizer = torch.optim.AdamW(bert_masked_lm.parameters(), lr=config["learning_rate"])

In [39]:
# Print the per-layer parameter counts of the model.
summary(model=bert_masked_lm)

Layer (type:depth-idx)                             Param #
BERTMaskedLM                                       --
├─Embedding: 1-1                                   --
│    └─Embedding: 2-1                              6,350,592
│    └─Embedding: 2-2                              196,608
│    └─Dropout: 2-3                                --
├─Encoder: 1-2                                     --
│    └─ModuleList: 2-4                             --
│    │    └─EncoderBlock: 3-1                      1,774,464
│    │    └─EncoderBlock: 3-2                      1,774,464
│    │    └─EncoderBlock: 3-3                      1,774,464
│    │    └─EncoderBlock: 3-4                      1,774,464
│    │    └─EncoderBlock: 3-5                      1,774,464
│    │    └─EncoderBlock: 3-6                      1,774,464
├─Linear: 1-3                                      6,367,130
Total params: 23,561,114
Trainable params: 23,561,114
Non-trainable params: 0

In [40]:
# Run training on the default device ("cpu").
# NOTE(review): the recorded run of this cell fails with
# "TypeError: BERTMaskedLM.forward() missing 1 required positional argument: 'masked_idx'",
# i.e. the forward() of the model instance built above does not accept the three
# positional arguments that train_masked_lm passes. Confirm which BERTMaskedLM
# definition (imported vs. defined in this notebook) the instance comes from.
losses, accs = train_masked_lm(bert=bert_masked_lm,
                               data_loader=data_loader,
                               optimizer=optimizer)

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

TypeError: BERTMaskedLM.forward() missing 1 required positional argument: 'masked_idx'

In [48]:
from scripts.scripts import Encoder
from torch.nn import functional as F

In [46]:
class BERTMaskedLM(nn.Module):
    """Encoder stack with a linear vocabulary head for masked-token prediction.

    ``forward`` embeds the token ids, runs them through the encoder with a
    padding mask, picks the hidden state at each sample's masked position and
    projects it to vocabulary logits.
    """

    # Ids of the special tokens; forward() uses PAD_TOKEN to build the padding mask.
    PAD_TOKEN = 0
    CLS_TOKEN = 1
    SEP_TOKEN = 2
    MASK_TOKEN = 3
    UNK_TOKEN = 4

    def __init__(self,
                 config,
                 vocab_size: int=1000) -> None:
        """Build the embedding, the encoder and the vocabulary projection.

        Args:
            config: Mapping that provides at least ``d_model``, ``n_heads`` and
                ``n_encoders``; it is also handed to ``Embedding`` and ``Encoder``.
            vocab_size: Number of tokens; sizes the embedding and the output layer.
        """
        super().__init__()
        self.d_model = config["d_model"]
        self.n_heads = config["n_heads"]
        self.n_layers = config["n_encoders"]
        self.embedding = Embedding(config=config,
                                   vocab_size=vocab_size)
        self.bert = Encoder(config=config)
        self.masked_lm = nn.Linear(in_features=self.d_model,
                                   out_features=vocab_size)

    def __repr__(self) -> str:
        return f"BERT(num_layers={self.n_layers}, d_model={self.d_model}, num_heads={self.n_heads})"

    def __str__(self) -> str:
        # Same text as __repr__; kept as an explicit method for clarity.
        return self.__repr__()

    @staticmethod
    def _select_masked_positions(hidden: torch.Tensor,
                                 masked_idx: torch.Tensor) -> torch.Tensor:
        """Pick one hidden vector per sample at that sample's masked position.

        Args:
            hidden: Tensor indexed as ``[batch, position, feature]``.
            masked_idx: One position per sample, shaped ``[B]`` or ``[B, 1]``.

        Returns:
            Tensor of shape ``[B, feature]``; the batch dimension is kept even
            when ``B == 1``.
        """
        # Flatten first: a [B, 1] index would otherwise broadcast against the
        # batch index and produce a [B, B, feature] result.
        positions = masked_idx.reshape(-1).to(device=hidden.device, dtype=torch.long)
        batch_index = torch.arange(positions.shape[0], device=hidden.device)
        return hidden[batch_index, positions, :]

    def forward(self,
                x: torch.Tensor,
                masked_tokens: torch.Tensor,
                masked_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict the masked token of every sample and compute the loss.

        Args:
            x: Token ids, ``[B, S]``.
            masked_tokens: Target token id per sample, ``[B]`` or ``[B, 1]``.
            masked_idx: Position of the masked token per sample, ``[B]`` or ``[B, 1]``.

        Returns:
            ``(logits, loss)``: logits of shape ``[B, VOCAB_SIZE]`` and the
            cross-entropy loss against ``masked_tokens``.
        """
        mask = create_padding_mask(batch=x, padding_idx=self.PAD_TOKEN)

        # x -> [B, S]
        x = self.embedding(x) # [B, S, D_MODEL]
        x = self.bert(x, mask) # [B, S, D_MODEL]
        x = self._select_masked_positions(x, masked_idx) # [B, D_MODEL]
        logits = self.masked_lm(x) # [B, VOCAB_SIZE]

        # reshape(-1) instead of squeeze(): keeps a 1-D target for a batch of one.
        loss = F.cross_entropy(logits, masked_tokens.reshape(-1))

        return logits, loss
        

In [47]:
# Smoke test: one forward pass of a freshly initialised model on random token ids.
# NOTE(review): the recorded run of this cell raised "RuntimeError: Boolean value of
# Tensor with more than one value is ambiguous"; the traceback is not shown, so the
# failing call still has to be located.
seq_len = 5
bert = BERTMaskedLM(config=config,
                    vocab_size=vocab_size)
with torch.inference_mode():
    # Call the module rather than .forward() so registered hooks run as well.
    # The result is named `loss` so the training-loss list `losses` is not overwritten.
    logits, loss = bert(torch.randint(low=0, high=10, size=(config["batch_size"], seq_len)),
                        masked_tokens=torch.randint(low=0, high=5, size=(config["batch_size"],)),
                        # Masked positions must be valid indices into the sequence.
                        masked_idx=torch.randint(low=0, high=seq_len, size=(config["batch_size"],)))

RuntimeError: Boolean value of Tensor with more than one value is ambiguous