In [1]:
! pip install torchtext==0.6.0
# ! pip install torchtext==0.12.0
! pip install datasets
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m312.3 kB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.1
    Uninstalling torchtext-0.15.1:
      Successfully uninstalled torchtext-0.15.1
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [84]:
from datasets import load_dataset

# from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModel, AutoConfig

import torch
# from torchtext.legacy import data
from torchtext import data

In [85]:
# choose configurations for the model
# MODEL_CONFIG="LSTM"
MODEL_CONFIG="BERT"
# BERT_TYPE="google/bert_uncased_L-4_H-256_A-4"
BERT_TYPE="prajjwal1/bert-tiny"

print("Using MODEL_CONFIG", MODEL_CONFIG)

Using MODEL_CONFIG BERT


In [86]:
PROJECT_ROOT = F"/content/gdrive/My Drive/nlp_project_task_1_BERT/"
                                          

In [87]:
SEED = 42

In [88]:
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [89]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [90]:
faithdial_dataset = load_dataset("McGill-NLP/FaithDial")



  0%|          | 0/7 [00:00<?, ?it/s]

In [91]:
faithdial_dataset.keys()

dict_keys(['test', 'test_random_split', 'test_topic_split', 'train', 'validation', 'valid_random_split', 'valid_topic_split'])

In [92]:
faithdial_dataset["train"][0]

{'dialog_idx': 0,
 'response': 'Yeah, but once the access to the internet was a rare thing. do you remember?',
 'original_response': "No I could not! I couldn't imagine living when internet access was rare and very few people had it!",
 'history': ['Can you imagine the world without internet access?'],
 'knowledge': 'Internet access was once rare, but has grown rapidly.',
 'BEGIN': ['Hallucination'],
 'VRM': ['Disclosure', 'Ack.']}

In [93]:
def critic_preprocess(dataset):
    """
    Data items transformed into (knowledge, response, is_hallucination)
    """
    new_dataset = []
    for d in dataset:
        # original response
        if d["original_response"] != None:
            new_dataset.append({
                "knowledge": d["knowledge"],
                "response": d["original_response"],
                "hallucination": "yes" if "Hallucination" in d["BEGIN"] else "no",
                "history": " ".join(d["history"]) # separate histories by ' '
                                                  # (i.e., will be a seq of sentences)
            })

        # new responses always aren't hallucinations
        new_dataset.append({"knowledge": d["knowledge"], "response": d["response"], "hallucination": "no", "history": "\\".join(d["history"])})
    return new_dataset

In [94]:
import json

def dump_as_json(dataset, filename):
    """
    Takes a list of dicts and dumps it as a json file that torchtext can parse.
    """
    with open(filename, "w") as file:
        for d in dataset:
            file.write(json.dumps(d))
            file.write("\n")


In [95]:
if MODEL_CONFIG == "LSTM":
    KNOWLEDGE = data.Field(tokenize='spacy', tokenizer_language="en_core_web_sm", include_lengths=True)
    RESPONSE = data.Field(tokenize='spacy', tokenizer_language="en_core_web_sm", include_lengths=True)
    HISTORY = data.Field(tokenize='spacy', tokenizer_language="en_core_web_sm", include_lengths=True)
    LABEL = data.LabelField(dtype=torch.float)

In [96]:
if MODEL_CONFIG == "BERT":
    tokenizer = AutoTokenizer.from_pretrained('google/bert_uncased_L-4_H-256_A-4', do_lower_case=True)
    pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    unk_index = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    KNOWLEDGE = data.Field(use_vocab=False,
                            tokenize=tokenizer.encode,
                            pad_token=pad_index,
                            unk_token=unk_index,
                            include_lengths = True)
    RESPONSE = data.Field(use_vocab=False,
                          tokenize=tokenizer.encode,
                          pad_token=pad_index,
                          unk_token=unk_index,
                          include_lengths=True)
    HISTORY = data.Field(use_vocab=False,
                         tokenize=tokenizer.encode,
                         pad_token=pad_index,
                         unk_token=unk_index,
                         include_lengths=True)
    LABEL = data.LabelField(dtype=torch.float)

In [97]:
dump_as_json(critic_preprocess(faithdial_dataset["test"]), PROJECT_ROOT + "data/faithdial_dataset_test.json")
dump_as_json(critic_preprocess(faithdial_dataset["train"]), PROJECT_ROOT + "data/faithdial_dataset_train.json")
dump_as_json(critic_preprocess(faithdial_dataset["validation"]), PROJECT_ROOT + "data/faithdial_dataset_validation.json")

In [98]:
fields = {"knowledge": ("k", KNOWLEDGE), "response": ("r", RESPONSE), "hallucination": ("l", LABEL), "history": ("h", HISTORY)}

dataset = data.TabularDataset.splits(path=PROJECT_ROOT + "data",
                                     train="faithdial_dataset_train.json",
                                     validation="faithdial_dataset_validation.json",
                                     test="faithdial_dataset_test.json",
                                     format="json",
                                     fields=fields)


In [99]:
train_data, valid_data, test_data = dataset

In [100]:
train_data[0]

<torchtext.data.example.Example at 0x7f85552312a0>

In [101]:
vars(train_data.examples[0])

{'k': [101,
  4274,
  3229,
  2001,
  2320,
  4678,
  1010,
  2021,
  2038,
  4961,
  5901,
  1012,
  102],
 'r': [101,
  2053,
  1045,
  2071,
  2025,
  999,
  1045,
  2481,
  1005,
  1056,
  5674,
  2542,
  2043,
  4274,
  3229,
  2001,
  4678,
  1998,
  2200,
  2261,
  2111,
  2018,
  2009,
  999,
  102],
 'l': 'yes',
 'h': [101, 2064, 2017, 5674, 1996, 2088, 2302, 4274, 3229, 1029, 102]}

In [102]:
if MODEL_CONFIG == "LSTM":
    MAX_VOCAB_SIZE = 25_000
    
    KNOWLEDGE.build_vocab(train_data,
                        max_size=MAX_VOCAB_SIZE,
                        vectors="fasttext.simple.300d",
                        unk_init=torch.Tensor.normal_)
    RESPONSE.build_vocab(train_data,
                        max_size=MAX_VOCAB_SIZE,
                        vectors="fasttext.simple.300d",
                        unk_init=torch.Tensor.normal_)
    HISTORY.build_vocab(train_data,
                        max_size=MAX_VOCAB_SIZE,
                        vectors="fasttext.simple.300d",
                        unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)


In [103]:
if MODEL_CONFIG == "BERT":
    # vocab already built -- don't need to do anything except for labels
    LABEL.build_vocab(train_data)
    pass

In [104]:
# print(f"Unique tokens in KNOWLEDGE vocabulary: {len(KNOWLEDGE.vocab)}")
# print(f"Unique tokens in RESPONSE vocabulary: {len(RESPONSE.vocab)}")
# print(f"Unique tokens in HISTORY vocabulary: {len(HISTORY.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in LABEL vocabulary: 2


In [105]:
# print(KNOWLEDGE.vocab.freqs.most_common(20))
# print(RESPONSE.vocab.freqs.most_common(20))
# print(HISTORY.vocab.freqs.most_common(20))
print(LABEL.vocab.freqs.most_common(20))

[('no', 20474), ('yes', 13507)]


In [106]:
# print(KNOWLEDGE.vocab.itos[:10])
# print(RESPONSE.vocab.itos[:10])
# print(HISTORY.vocab.itos[:10])
print(LABEL.vocab.itos[:10])

['no', 'yes']


In [107]:
BATCH_SIZE = 8

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: x.r,
    device = device)

In [108]:
from torch import nn

class LSTM(nn.Module):
    def __init__(self, response_vocab_size, knowledge_vocab_size, history_vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, response_pad_idx, knowledge_pad_idx, history_pad_idx):

        super().__init__()

        # Initialize Embedding Layer
        self.response_embedding = nn.Embedding(num_embeddings=response_vocab_size,
                                               embedding_dim=embedding_dim,
                                               padding_idx=response_pad_idx)

        self.knowledge_embedding = nn.Embedding(num_embeddings=knowledge_vocab_size,
                                                embedding_dim=embedding_dim,
                                                padding_idx=knowledge_pad_idx)
        
        self.history_embedding = nn.Embedding(num_embeddings=history_vocab_size,
                                              embedding_dim=embedding_dim,
                                              padding_idx=history_pad_idx)

        # Initialize LSTM layer
        self.response_lstm = nn.LSTM(input_size=embedding_dim,
                                     hidden_size=hidden_dim,
                                     num_layers=n_layers,
                                     bidirectional=bidirectional)

        self.knowledge_lstm = nn.LSTM(input_size=embedding_dim,
                                      hidden_size=hidden_dim,
                                      num_layers=n_layers,
                                      bidirectional=bidirectional)
        
        self.history_lstm = nn.LSTM(input_size=embedding_dim,
                                    hidden_size=hidden_dim,
                                    num_layers=n_layers,
                                    bidirectional=bidirectional)

        # Initialize a fully connected layer with Linear transformation
        self.fc = nn.Linear(in_features=3*2*hidden_dim,
                            out_features=output_dim)

        # Initialize Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, response, response_lengths, knowledge, knowledge_lengths, history, history_lengths):
        # Apply embedding layer that matches each word to its vector and apply dropout. Dim [sent_len, batch_size, emb_dim]
        x_r = self.response_embedding(response)
        x_r = self.dropout(x_r)

        x_k = self.knowledge_embedding(knowledge)
        x_k = self.dropout(x_k)

        x_h = self.history_embedding(history)
        x_h = self.dropout(x_h)

        # Run the LSTM along the sentences of length sent_len.
        output_r, (hidden_r, cell_r) = self.response_lstm(x_r)
        output_k, (hidden_k, cell_k) = self.knowledge_lstm(x_k)
        output_h, (hidden_h, cell_h) = self.history_lstm(x_h)

        # Concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers and apply dropout
        hidden_r = torch.cat((hidden_r[-2,:,:], hidden_r[-1,:,:]), -1)
        hidden_k = torch.cat((hidden_k[-2,:,:], hidden_k[-1,:,:]), -1)
        hidden_h = torch.cat((hidden_h[-2,:,:], hidden_h[-1,:,:]), -1)
        hidden = torch.cat((hidden_r, hidden_k, hidden_h), -1)
        hidden = self.dropout(hidden)

        return self.fc(hidden)

In [109]:
from torch import nn
import transformers

class Transformer(nn.Module):
    def __init__(self, output_dim, dropout):

        super().__init__()

        # Initialize Bert layer
        config = transformers.AutoConfig.from_pretrained("google/bert_uncased_L-4_H-256_A-4")
        self.response_bert = transformers.AutoModel.from_pretrained("google/bert_uncased_L-4_H-256_A-4")
        self.knowledge_bert = transformers.AutoModel.from_pretrained("google/bert_uncased_L-4_H-256_A-4")
        self.history_bert = transformers.AutoModel.from_pretrained("google/bert_uncased_L-4_H-256_A-4")

        self.fc = nn.Linear(in_features=3*config.hidden_size,
                            out_features=output_dim)

        # Initialize Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, response, response_lengths, knowledge, knowledge_lengths, history, history_lengths):

        # swap LENGTH and BATCH_SIZE dimensions
        response = torch.reshape(response, (response.shape[1], response.shape[0]))
        knowledge = torch.reshape(knowledge, (knowledge.shape[1], knowledge.shape[0]))
        history = torch.reshape(history, (history.shape[1], history.shape[0]))

        output_r = self.response_bert(response).pooler_output
        output_k = self.knowledge_bert(knowledge).pooler_output
        output_h = self.history_bert(history).pooler_output

        hidden = torch.cat((output_r, output_k, output_h), -1)
        hidden = self.dropout(hidden)

        return self.fc(hidden)

In [110]:
# use original LSTM
if MODEL_CONFIG == "LSTM":
    RESPONSE_INPUT_DIM = len(RESPONSE.vocab)
    KNOWLEDGE_INPUT_DIM = len(KNOWLEDGE.vocab)
    HISTORY_INPUT_DIM = len(HISTORY.vocab)
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 256
    OUTPUT_DIM = 1
    N_LAYERS = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.5
    RESPONSE_PAD_IDX = RESPONSE.vocab.stoi[RESPONSE.pad_token]
    KNOWLEDGE_PAD_IDX = KNOWLEDGE.vocab.stoi[KNOWLEDGE.pad_token]
    HISTORY_PAD_IDX = HISTORY.vocab.stoi[HISTORY.pad_token]
    
    model = LSTM(RESPONSE_INPUT_DIM,
                KNOWLEDGE_INPUT_DIM,
                HISTORY_INPUT_DIM,
                EMBEDDING_DIM,
                HIDDEN_DIM,
                OUTPUT_DIM,
                N_LAYERS,
                BIDIRECTIONAL,
                DROPOUT,
                RESPONSE_PAD_IDX,
                KNOWLEDGE_PAD_IDX,
                HISTORY_PAD_IDX)

In [111]:
if MODEL_CONFIG == "BERT":
    OUTPUT_DIM=1
    DROPOUT = 0.5
    model = Transformer(OUTPUT_DIM, DROPOUT) 

Some weights of the model checkpoint at google/bert_uncased_L-4_H-256_A-4 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at google/bert_uncased_L-4_H-256_A-4 were not used when initializing B

In [112]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 33,512,449 trainable parameters


In [113]:
# print(RESPONSE.vocab.vectors.shape)
# print(KNOWLEDGE.vocab.vectors.shape)
# print(HISTORY.vocab.vectors.shape)

In [114]:
if MODEL_CONFIG == "LSTM":
    model.response_embedding.weight.data.copy_(RESPONSE.vocab.vectors)
    model.knowledge_embedding.weight.data.copy_(KNOWLEDGE.vocab.vectors)
    model.history_embedding.weight.data.copy_(HISTORY.vocab.vectors)

In [115]:
if MODEL_CONFIG == "LSTM":
    UNK_IDX_R = RESPONSE.vocab.stoi[RESPONSE.unk_token]
    UNK_IDX_K = RESPONSE.vocab.stoi[KNOWLEDGE.unk_token]
    UNK_IDX_H = RESPONSE.vocab.stoi[HISTORY.unk_token]

    model.response_embedding.weight.data[UNK_IDX_R] = torch.zeros(EMBEDDING_DIM)
    model.response_embedding.weight.data[RESPONSE_PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    model.knowledge_embedding.weight.data[UNK_IDX_K] = torch.zeros(EMBEDDING_DIM)
    model.knowledge_embedding.weight.data[KNOWLEDGE_PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    model.history_embedding.weight.data[UNK_IDX_H] = torch.zeros(EMBEDDING_DIM)
    model.history_embedding.weight.data[HISTORY_PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    print(model.response_embedding.weight.data)
    print(model.knowledge_embedding.weight.data)
    print(model.history_embedding.weight.data)

In [116]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [117]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [118]:
from sklearn.metrics import f1_score


def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc


def binary_f1(preds, y):
    # print(preds)
    # print(y)
    rounded_preds = torch.round(torch.sigmoid(preds))
    # f1 = f1_score(rounded_preds.detach().cpu(), y.detach().cpu(), average="macro")
    # f1 = f1_score(y.detach().cpu(), rounded_preds.detatch().cpu(), average="macro")
    f1 = f1_score(y.cpu(), rounded_preds.cpu(), average="macro")

    return f1


In [119]:
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        response, response_lengths = batch.r
        knowledge, knowledge_lengths = batch.k
        history, history_lengths = batch.h

        predictions = model(response, response_lengths, knowledge, knowledge_lengths, history, history_lengths).squeeze(1)

        loss = criterion(predictions, batch.l)

        acc = binary_accuracy(predictions, batch.l)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [120]:
def evaluate(model, iterator, criterion):

    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:
            response, response_lengths = batch.r
            knowledge, knowledge_lengths = batch.k
            history, history_lengths = batch.h

            predictions = model(response, response_lengths, knowledge, knowledge_lengths, history, history_lengths).squeeze(1)

            loss = criterion(predictions, batch.l)
            acc = binary_accuracy(predictions, batch.l)
            f1 = binary_f1(predictions, batch.l)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1 += f1.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator)

In [121]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [122]:
N_EPOCHS = 5
# path = F"/content/gdrive/My Drive/bilstm_model.pt"
path = PROJECT_ROOT + F"/bert_model.pt"
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_f1 = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), path)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% |')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. F1: {valid_f1:.3f}')

Epoch: 01 | Epoch Time: 3m 53s
	Train Loss: 0.711 | Train Acc: 55.76% |
	 Val. Loss: 0.673 |  Val. Acc: 60.00% | Val. F1: 0.436
Epoch: 02 | Epoch Time: 3m 53s
	Train Loss: 0.691 | Train Acc: 57.85% |
	 Val. Loss: 0.683 |  Val. Acc: 60.00% | Val. F1: 0.436


KeyboardInterrupt: ignored

In [None]:
model.load_state_dict(torch.load(path, map_location=device))

test_loss, test_acc, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1: {test_f1:.2f}')

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_hallucination(model, knowledge, response):
    model.eval()

    tokenized_r = [tok.text for tok in nlp.tokenizer(response)]
    indexed_r = [RESPONSE.vocab.stoi[t] for t in tokenized_r]
    length_r = [len(indexed_r)]
    tensor_r = torch.LongTensor(indexed_r).to(device)
    tensor_r = tensor_r.unsqueeze(1)
    length_tensor_r = torch.LongTensor(length_r)

    tokenized_k = [tok.text for tok in nlp.tokenizer(knowledge)]
    indexed_k = [KNOWLEDGE.vocab.stoi[t] for t in tokenized_k]
    length_k = [len(indexed_k)]
    tensor_k = torch.LongTensor(indexed_k).to(device)
    tensor_k = tensor_k.unsqueeze(1)
    length_tensor_k = torch.LongTensor(length_k)

    prediction = torch.sigmoid(model(tensor_r, length_tensor_r, tensor_k, length_tensor_k))

    return prediction.item()


In [None]:
predict_hallucination(model, "", "I love dogs")

In [None]:
predict_hallucination(model, "", "Dogs are animals.")

In [None]:
predict_hallucination(model, "", "I was walking my dog last week.")

In [None]:
predict_hallucination(model, "", "Dogs need to be walked daily.")

In [None]:
test_data[2].r

In [None]:
predict_hallucination(model, "", "Dylan's Candy Bar is a candy supplier.")

In [None]:
predict_hallucination(model, "", "Dylan's Candy Bar is my favorite great brand of candy.")

In [None]:
print(test_data[2].h)

In [None]:
# test BERT
test_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
test_model = BertModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

In [None]:
inputs

In [None]:
outputs = test_model(**inputs)
outputs

In [None]:
outputs = model(inputs["input_ids"])

In [None]:
outputs["pooler_output"][0].squeeze(-1)


In [None]:
outputs["pooler_output"].squeeze(-1).size()

In [None]:
loss, output = model(inputs["input_ids"])

In [None]:
loss

In [None]:
output

In [None]:
import gc
model = None
torch.cuda.empty_cache()
gc.collect()

In [None]:
test_model(inputs.input_ids).pooler_output.size()

In [None]:
inputs.input_ids.shape

In [None]:
torch.flip(inputs.input_ids, []).shape

In [None]:
torch.reshape(inputs.input_ids, (inputs.input_ids.shape[1], inputs.input_ids.shape[0])).shape

In [None]:
torch.reshape(inputs.input_ids, (inputs.input_ids.shape[1], inputs.input_ids.shape[0]))

In [None]:
inputs.input_ids

In [None]:

BertTokenizer.from_pretrained("google/bert_uncased_L-4_H-256_A-4")
BertModel.from_pretrained("google/bert_uncased_L-4_H-256_A-4")
# BertTokenizer.from_pretrained("bert-tiny")
# BertModel.from_pretrained("bert-tiny")