In [3]:
%run w2d1.ipynb

In [4]:
from collections import OrderedDict
from typing import Callable, Dict, Optional, List, Tuple

import torch as t
from torch import nn
import torch.nn.functional as F
from torch import einsum
from einops import rearrange, reduce, repeat
import bert_tests
import matplotlib.pyplot as plt

## Tokenization

In [5]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
print(tokenizer("hello what's up"))
uncased_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
print(uncased_tokenizer(["hello what's up"]))
coded = uncased_tokenizer(["hello what's up"])
uncased_tokenizer.batch_decode(coded['input_ids'])
tokenizer.batch_decode(coded['input_ids'])
uncased_tokenizer.batch_decode(coded['input_ids'])


{'input_ids': [101, 19082, 1184, 112, 188, 1146, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [[101, 7592, 2054, 1005, 1055, 2039, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}


["[CLS] hello what's up [SEP]"]

## Inference

In [6]:
my_bert, pretrained_bert = load_pretrained_bert()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def feed_bert(model: nn.Module, text: str, tokenizer, top_k: int = 10):
    input_ids: List[int] = tokenizer(text)["input_ids"]
    mask_idxs = [idx for idx, token in enumerate(input_ids) if token == 103]

    all_logits = model(t.tensor([input_ids], dtype=t.long))[0]

    print(text)
    for mask_idx in mask_idxs:
        logits = all_logits[mask_idx]
        probs = t.softmax(logits, dim=0)

        top_logit_idxs = t.argsort(logits, descending=True)[:top_k]
        top_logit_words = tokenizer.decode(top_logit_idxs)

        print(top_logit_words)
        print(probs[top_logit_idxs])
        print()

my_bert.eval()
feed_bert(my_bert, "The fish loves to eat [MASK].", tokenizer, top_k=20)
feed_bert(my_bert, "The fish loves to eat [MASK]", tokenizer, top_k=20)
#feed_bert(my_bert, "The vegetarian fish loves to eat [MASK].", tokenizer, top_k=20)
#feed_bert(my_bert, "The meat-eating fish loves to eat [MASK].", tokenizer, top_k=20)
#feed_bert(my_bert, "The tiny fish loves to eat [MASK].", tokenizer, top_k=20)


The fish loves to eat [MASK].
it fish them meat food eggs honey insects too rice everything water vegetables this fruit apples him there again here
tensor([0.1738, 0.0980, 0.0947, 0.0410, 0.0336, 0.0251, 0.0134, 0.0130, 0.0126,
        0.0119, 0.0092, 0.0090, 0.0088, 0.0083, 0.0072, 0.0069, 0.0063, 0.0060,
        0.0058, 0.0054], grad_fn=<IndexBackward0>)

The fish loves to eat [MASK]
. ;!?..., : | and " but - so । because as [UNK]') with
tensor([9.4125e-01, 4.6098e-02, 1.1822e-02, 4.5820e-04, 1.2235e-04, 5.4506e-05,
        3.6213e-05, 1.6483e-05, 1.2279e-05, 9.2127e-06, 6.5461e-06, 4.6536e-06,
        3.4753e-06, 3.3669e-06, 2.9931e-06, 2.4598e-06, 1.9791e-06, 1.7764e-06,
        1.3952e-06, 1.0635e-06], grad_fn=<IndexBackward0>)



## Fine tuning

In [8]:
bert_tests.test_bert_classification(Bert)

bert MATCH!!!!!!!!
 SHAPE (1, 4, 28996) MEAN: 0.003031 STD: 0.5765 VALS [-0.5742 -0.432 0.1186 -0.7165 -0.5261 0.4967 1.223 0.3165 -0.3247 -0.5716...]
bert MATCH!!!!!!!!
 SHAPE (1, 2) MEAN: 0.09479 STD: 1.411 VALS [-0.903 1.093]


In [9]:
import torchtext
data_train, data_test = torchtext.datasets.IMDB(root='.data', split=('train', 'test'))

data_train = list(data_train)[:10]
data_test = list(data_test)[:10]

In [12]:
def get_imdb_collate_fn(
    max_seq_length: int,
    tokenizer: transformers.AutoTokenizer,
    device: str,
):
    def fn(raw_xs: List[Tuple[str, str]]) -> Tuple[t.Tensor, t.Tensor]:
        labels: List[str]
        texts: List[str]
        labels, texts = zip(*raw_xs)

        xs = t.tensor(
            tokenizer(
                texts,
                padding="longest",
                max_length=max_seq_length,
                truncation=True,
            ),
            dtype=t.long,
            device=device,
        )

        ys = t.tensor([int(l == "pos") for l in labels], dtype=t.long, device=device)

        return xs, ys

    return fn


In [14]:
from torch.utils.data import DataLoader
dl_train = DataLoader(data_train, batch_size=2, collate_fn=imdb_collate_fn)

for x in dl_train:
    print(len(x))
    print(x)

    break

2
[('neg', 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far 

In [78]:
next(data_train)

('neg',
 'When I first saw a glimpse of this movie, I quickly noticed the actress who was playing the role of Lucille Ball. Rachel York\'s portrayal of Lucy is absolutely awful. Lucille Ball was an astounding comedian with incredible talent. To think about a legend like Lucille Ball being portrayed the way she was in the movie is horrendous. I cannot believe out of all the actresses in the world who could play a much better Lucy, the producers decided to get Rachel York. She might be a good actress in other roles but to play the role of Lucille Ball is tough. It is pretty hard to find someone who could resemble Lucille Ball, but they could at least find someone a bit similar in looks and talent. If you noticed York\'s portrayal of Lucy in episodes of I Love Lucy like the chocolate factory or vitavetavegamin, nothing is similar in any way-her expression, voice, or movement.<br /><br />To top it all off, Danny Pino playing Desi Arnaz is horrible. Pino does not qualify to play as Ricky. H