In [1]:
import numpy as np
import torch
import transformers
import matplotlib.pyplot as plt
import time

from transformers import BertConfig, BertTokenizerFast
from transformers import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.manifold import TSNE
from tqdm.auto import tqdm

from models import BertForDiffusion, DiffusionLM, ConditionalDiffusionLM
from data_utils import load_qqp_dataset_and_tokenizer_from_disk, QQPParaphraseDataset, load_split_qqp_dataset_and_tokenizer_from_disk
from noise_schedule import get_named_beta_schedule
from train_utils import train_conditional, evaluate_conditional
from metric_utils import calculate_bleu, calculate_rouge

%matplotlib inline

In [6]:
# dataset args
max_len = 32

# training args
batch_size = 64
device = torch.device("cuda:1")
lr = 1e-4
num_epoch = 30
weight_decay = 0
num_warmup_steps = 100

# model args
word_embedding_dim = 128
# hidden_size, num_hidden_layers, num_attention_heads, intermediate_size = 768, 12, 12, 3072
hidden_size, num_hidden_layers, num_attention_heads, intermediate_size = 512, 4, 8, 2048

max_position_embeddings = max_len

encoder_type = 'from-scratch'
noise_schedule = 'sqrt'
emb_type = 'bit'
use_shared_weight=True
lm_head_bias=False
add_emb_noise=False
self_condition=True

In [7]:
train_dataset, eval_dataset, tokenizer = load_split_qqp_dataset_and_tokenizer_from_disk(data_path="data")

# tokenized_qqp_train, tokenized_qqp_eval, tokenizer = load_qqp_dataset_and_tokenizer_from_disk(data_path="data")

rev_tokenizer = {v: k for k, v in tokenizer.items()}

print("Tokenizer vocab size:", len(tokenizer))

# train_dataset = QQPParaphraseDataset(dataset=tokenized_qqp_train, random_swap=True)
print("Training set size:", len(train_dataset))
# eval_dataset = QQPParaphraseDataset(dataset=tokenized_qqp_eval, random_swap=False)
print("Evaluation set size:", len(eval_dataset))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

Tokenizer vocab size: 15672
Training set size: 120940
Evaluation set size: 13438


In [8]:
config = BertConfig(vocab_size=len(tokenizer), hidden_size=hidden_size, num_hidden_layers=num_hidden_layers, num_attention_heads=num_attention_heads, intermediate_size=intermediate_size, max_position_embeddings=max_position_embeddings, pad_token_id=tokenizer['[PAD]'])

config.T = 2000
if emb_type in ['learned', 'randn']:
    config.word_embedding_dim = word_embedding_dim

print(config)

betas = torch.Tensor(get_named_beta_schedule(schedule_name=noise_schedule, num_diffusion_timesteps=config.T))

BertConfig {
  "T": 2000,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 32,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 4,
  "pad_token_id": 3,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 15672
}



In [9]:
diffusion_model = ConditionalDiffusionLM(config=config, betas=betas, use_shared_weight=use_shared_weight, lm_head_bias=lm_head_bias, add_emb_noise=add_emb_noise, conditional_gen=True,self_condition=self_condition, encoder_type=encoder_type, encoder_name_or_path='bert-base-uncased', emb_type=emb_type).to(device)

print("Diffusion model #parameters:")
print(sum([p.numel() for p in diffusion_model.parameters()]))

print("Diffusion model #trainable parameters")
print(sum([p.numel() for p in filter(lambda p:p.requires_grad, diffusion_model.parameters())]))

using bit word embedding
set word_embedding_dim to: 14
Diffusion model #parameters:
38886414
Diffusion model #trainable parameters
38886414


In [10]:
diffusion_model.load_state_dict(torch.load("checkpoints/20221103_0634"), strict=False)

<All keys matched successfully>

In [11]:
evaluate_conditional(diffusion_model=diffusion_model, dataloader=eval_dataloader,)


  0%|          | 0/210 [00:00<?, ?it/s]

eval loss=0.08785


tensor(0.0878, device='cuda:1')

In [None]:
diffusion_model.eval()

generated_questions_mbr1_ddim500 = diffusion_model.generate(
    dataset = eval_dataset,
    rev_tokenizer=rev_tokenizer,
    sampling_timesteps=500,
    eta=0,
    mbr=1,
    verbose=True,
)

  0%|          | 0/105 [00:00<?, ?it/s]

In [None]:
bleu_dict = calculate_bleu(generated_questions_mbr1_ddim500, eval_dataset, rev_tokenizer)
print(sum(bleu_dict["bleu"])/len(bleu_dict["bleu"]))
print(sum(bleu_dict["self_bleu"])/len(bleu_dict["self_bleu"]))

In [None]:
rouge_scores = calculate_rouge(generated_questions_mbr1_ddim500, eval_dataset, rev_tokenizer)
rouge_l_f = [d['rouge-l']['f'] for d in rouge_scores]
print(sum(rouge_l_f)/len(rouge_l_f))

In [25]:
i = 84
src_question = [rev_tokenizer[id.item()] for id in eval_dataset[i]['question1_input_ids']]
src_question = list(filter(lambda x:x not in ['[PAD]','[START]','[END]'], src_question))
print(" ".join(src_question))

tgt_question = [rev_tokenizer[id.item()] for id in eval_dataset[i]['question2_input_ids']]
tgt_question = list(filter(lambda x:x not in ['[PAD]','[START]','[END]'], tgt_question))
print(" ".join(tgt_question))

#print(" ".join(generated_questions_mbr1_ddim200[i]))
print(" ".join(generated_questions_mbr10_ddim2[i]))
#print(" ".join(generated_questions_mbr5_ddim20[i]))

What are some ways to get rid off addiction to WhatsApp ?
How do I get rid of my WhatsApp addiction ?
How do I get rid addiction on WhatsApp ?
