In [1]:
"""
    Sample: 
        config.data = "wiki_data, word cats and pneumonia"
        config.loss = "L1"
        config.simscore = ""
        config.batch_size
        config.epoch
        config.lr
        config.momentum (since we are using SGD)
"""
print()




In [1]:
import json
import numpy as np
import wandb
import re
import random
from datetime import date

# Start a Weights & Biases run under the 'Synthetic Net Roberta' project.
wandb.init(project='Synthetic Net Roberta')
# Short alias for the wandb config object of this run.
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: sosig_catto (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.3 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [2]:
import torch
import torch.nn as nn

class DenseNet(nn.Module):
    """Fully connected network that maps a flattened context window to one embedding.

    The input is flattened per example to ``2 * context_length * embed_size``
    features, passed through two ReLU-activated hidden layers (2048 and 512
    units) and projected back to ``embed_size`` with a Tanh on the output.

    Args:
        context_length: number of context tokens on one side of the target;
            the network expects twice as many token embeddings per example.
        embed_size: dimensionality of each token embedding and of the output.
    """

    def __init__(self,context_length,embed_size=100):
        super().__init__()
        window_tokens = 2 * context_length
        self.n = window_tokens
        self.embed_size = embed_size
        # Activations are registered before the linear layers so the module
        # listing keeps the order act, out, hidden1, hidden2, hidden3.
        self.act = nn.ReLU()
        self.out = nn.Tanh()
        flat_features = window_tokens * embed_size
        self.hidden1 = nn.Linear(flat_features, 2048)
        self.hidden2 = nn.Linear(2048, 512)
        self.hidden3 = nn.Linear(512, embed_size)

    def forward(self,x):
        """Flatten every example in ``x`` and return the predicted embedding per example."""
        flat = x.view(x.size(0), -1)
        first_hidden = self.act(self.hidden1(flat))
        second_hidden = self.act(self.hidden2(first_hidden))
        return self.out(self.hidden3(second_hidden))
    
    
# Network for context_length=10 and 768-dimensional embeddings; the first layer
# therefore takes 10*2*768 = 15360 input features (see the printed summary).
model = DenseNet(context_length = 10,embed_size=768)
print(model)

DenseNet(
  (act): ReLU()
  (out): Tanh()
  (hidden1): Linear(in_features=15360, out_features=2048, bias=True)
  (hidden2): Linear(in_features=2048, out_features=512, bias=True)
  (hidden3): Linear(in_features=512, out_features=768, bias=True)
)


In [3]:
#setup roberta
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
# Hugging Face checkpoint used for the tokenizer, the model config and the weights.
checkpoint='roberta-base'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Kept under its own name: binding this to `config` would replace the wandb
# config handle created in the first code cell, and the hyperparameters that
# later cells assign (config.batch_size, config.lr, config.momentum,
# config.epochs) would be written to the RoBERTa config instead of the run.
roberta_config = AutoConfig.from_pretrained(checkpoint)
roberta_model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Token string -> token id mapping of the tokenizer (used by token_to_id).
vocab = tokenizer.vocab
# Weight matrix of RoBERTa's input word-embedding layer; indexed by token id.
roberta_embeddings = roberta_model.embeddings.word_embeddings.weight
# Ids of the padding and unknown tokens, in that order: [pad_id, unk_id].
special_token_ids = tokenizer.convert_tokens_to_ids([tokenizer.pad_token,tokenizer.unk_token])

In [9]:
# Scratch check: pick the embedding rows for token ids 1, 2 and 3 via list indexing.
a = roberta_embeddings[[1,2,3]]

In [10]:
# Scratch check: display the selected rows detached from the autograd graph.
a.detach()

tensor([[ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
        [-0.0347, -0.0873, -0.0180,  ...,  0.1174, -0.0098, -0.0355],
        [ 0.0156, -0.0211, -0.0156,  ..., -0.0223,  0.0082, -0.0073]])

In [5]:
from util import *

#Loading the data
    
# Batch size passed to load_training_batch below.
config.batch_size = 64

# Corpus name; selects the '../processed_data/<corpus>_only/' directory.
corpus = 'giga'

# One pre-processed text file per word in the list.
training_files = [f'../processed_data/{corpus}_only/{x}_corpus_stopwords_c10.txt' for x in ['disease','pneumonia','vaccine','virus','sick']]
# load_training_batch is not defined in this notebook — presumably it comes from
# the `util` star import and returns a sliceable sequence of batches of
# "label:context" strings; verify against util.
training_data = load_training_batch(training_files,config.batch_size)
#config.data = "wiki_only_5words-covid"
def token_to_id(word):
    """Return the vocabulary id of ``word``.

    Words missing from ``vocab`` map to the padding id when they equal the
    tokenizer's pad token, and to the unknown-token id otherwise.
    """
    known_id = vocab.get(word)
    if known_id is not None:
        return known_id
    # special_token_ids is ordered [pad_id, unk_id].
    fallback_slot = 0 if word == tokenizer.pad_token else 1
    return special_token_ids[fallback_slot]
        
def get_roberta_embedding(train_data):
    """Turn batches of ``label:context`` lines into RoBERTa word embeddings.

    Args:
        train_data: iterable of batches; every batch is an iterable of strings
            of the form ``"<label>:<context words separated by whitespace>"``.

    Returns:
        A list with one ``(context_list, label_list)`` tuple per batch.
        ``context_list[i]`` is the tensor of word embeddings for the context
        tokens of sentence ``i`` and ``label_list[i]`` is the embedding of its
        label. If any sentence cannot be processed, the sentence and the error
        are printed and ``None`` is returned instead of a partial result.
    """
    train_tensor = []
    for batch in train_data:
        batch_tensor_list = []
        batch_label_list = []
        for sentence in batch:
            try:
                # Split on the first ':' only, so a ':' occurring inside the
                # context does not break the label/context unpacking.
                y,x = sentence.split(':', 1)
                # Raw string: '\ ' in a plain literal is an invalid escape
                # sequence. The class still matches newline, CR and space.
                s = re.sub(r'[\n\r ]+',' ',x).strip()
                input_ids = [token_to_id(word) for word in s.split(' ')]
                # detach(): the embeddings are used as fixed inputs/targets.
                context_embeddings = roberta_embeddings[input_ids].detach()
                batch_tensor_list.append(context_embeddings)
                y_id = token_to_id(y)
                label_embedding = roberta_embeddings[y_id].detach()
                batch_label_list.append(label_embedding)
            except Exception as e:
                # Report the offending line and abort the whole conversion.
                print(sentence)
                print(e)
                return
        train_tensor.append((batch_tensor_list,batch_label_list))
    return train_tensor

# list of (context embedding tensors, label embedding tensors) tuples, one per batch

# Only the first batch (training_data[:1]) is converted here.
train_tensor = get_roberta_embedding(training_data[:1])

In [6]:
print(len(train_tensor))   # number of batches
print(len(train_tensor[0]))   # each batch is a (contexts, labels) tuple -> 2
print(train_tensor[0][0])  # batch 0: list of context embedding tensors (embeddings, not token ids)
print(train_tensor[0][1])  # batch 0: list of label embedding tensors
print(train_tensor[0][0][0].size())  # batch 0, first example: context embedding matrix (tokens x embedding dim)
print(train_tensor[0][1][0].size())   # batch 0, first example: embedding vector of its label
print(train_tensor[0][0][0][0].size()) # batch 0, first example, first context token: one embedding vector


1
2
[tensor([[ 0.2461, -0.1467, -0.0417,  ..., -0.0204,  0.1219,  0.0399],
        [ 0.0466, -0.0604,  0.0635,  ...,  0.0128,  0.0179,  0.0889],
        [ 0.1561, -0.1829, -0.0062,  ...,  0.0809, -0.0939,  0.0879],
        ...,
        [ 0.0945, -0.1791, -0.0116,  ...,  0.0602,  0.1665,  0.0811],
        [ 0.0156, -0.0211, -0.0156,  ..., -0.0223,  0.0082, -0.0073],
        [ 0.0156, -0.0211, -0.0156,  ..., -0.0223,  0.0082, -0.0073]]), tensor([[ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
        [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
        [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
        ...,
        [-0.0030,  0.1555, -0.0666,  ..., -0.2010,  0.0419, -0.0134],
        [-0.0785, -0.0016, -0.0026,  ..., -0.1251, -0.0093, -0.1643],
        [ 0.0807,  0.0248, -0.0842,  ...,  0.0431, -0.0494, -0.0233]]), tensor([[ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0081, -0.0156],
        [ 0.0156,  0.0076, -0.0118,  ..., -0.0022,  0.0

In [7]:
import torch.optim as optim


# Learning rate and momentum are stored on `config` and then passed to SGD.
config.lr = 0.0005
config.momentum = 0.005
# SGD with momentum over all DenseNet parameters.
# Note: weight_decay is hard-coded here and is not stored on `config`.
optimizer = optim.SGD(
    model.parameters(),
    lr=config.lr,momentum=config.momentum,
    weight_decay=0.01
)
# Mean absolute error between predicted and target embeddings.
criterion = nn.L1Loss()

def cosim(v1,v2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        v1, v2: array-likes of equal length.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero norm
        the similarity is undefined; 0.0 is returned instead of nan so that a
        single degenerate vector does not turn an averaged score into nan.
    """
    denominator = np.linalg.norm(v1)*np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1,v2)/denominator

#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, debug_set.shape[0], eta_min=config.lr)
#learning rate adjustment -- try 0.001



In [8]:
from tqdm import tqdm

# Number of passes over train_tensor.
config.epochs = 1

best_valid_loss = float('inf')

for epoch in tqdm(range(config.epochs)):   

    epoch_loss = 0
    epoch_cosim = 0
    cosim_score = 0
    model.train()
  
    for batch in train_tensor:
        optimizer.zero_grad()
        features,labels = batch
        # Lists of per-example tensors -> one batch tensor each.
        feat = torch.stack(features)
        label = torch.stack(labels)
        
        predictions = model(feat).squeeze(1)
        loss = criterion(predictions,label)  
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        # Convert once per batch, then average the per-example cosine
        # similarity over the rows actually present. Using config.batch_size
        # as the row count raises IndexError on a batch that is shorter than
        # batch_size (typically the last one).
        label_rows = label.detach().numpy()
        prediction_rows = predictions.detach().numpy()
        t_cosim_score = np.mean([cosim(label_rows[i],prediction_rows[i]) for i in range(len(label_rows))])
        cosim_score += t_cosim_score
    # train_loss is the sum of the per-batch losses; cosim_score is averaged
    # over the number of batches.
    train_loss = epoch_loss
    cosim_score = cosim_score/len(train_tensor)
    wandb.log({"loss":train_loss,"cosim_score":cosim_score})
    print(f'Epoch:{epoch+1:02}\t|\tTrain Loss: {train_loss:.3f}\t|\tCosim score: {cosim_score:.3f}')

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.39it/s]

Epoch:01	|	Train Loss: 0.030	|	Cosim score: -0.020





In [15]:
import os
# Prefix (text before the first '_') of every .txt file in the corpus directory.
# NOTE(review): str.find returns -1 when a name contains no '_', in which case
# x[:-1] silently drops the last character — confirm every file is named '<word>_...'.
files_available = [x[:x.find('_')] for x in os.listdir(f'../processed_data/{corpus}_only/') if x.endswith('.txt')]

In [16]:
def negative_sample(target,files,n=4):
    """Return ``target`` followed by ``n`` randomly drawn entries of ``files`` that differ from it.

    The draw is without replacement (``random.sample``), so a ``ValueError``
    is raised when fewer than ``n`` candidates remain after removing ``target``.
    """
    candidates = list(filter(lambda name: name != target, files))
    drawn = random.sample(candidates, n)
    return [target, *drawn]

# For each listed word: [word, two randomly chosen other entries of files_available].
d = {x:negative_sample(x,files_available,2) for x in ['disease','pneumonia','sick']}

In [9]:
# Finish the wandb run started with wandb.init at the top of the notebook.
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,0.02975
cosim_score,-0.02038
_runtime,40.0
_timestamp,1633331448.0
_step,0.0


0,1
loss,▁
cosim_score,▁
_runtime,▁
_timestamp,▁
_step,▁


## Testing

In [10]:
# Fresh network with the same architecture the checkpoint is loaded into.
model_to_test =  DenseNet(context_length = 10,embed_size=768)
# The model is only used for inference below.
model_to_test.eval()
# map_location='cpu': everything downstream runs on CPU tensors (.numpy() is
# called directly on the outputs), so remap storages in case the checkpoint
# was saved from a GPU; without it such a file fails to load on a CPU-only machine.
model_to_test.load_state_dict(torch.load('../outputs/2021-10_giga_only_capital_spring-dust-35.pt', map_location='cpu'))

<All keys matched successfully>

In [11]:
import math

def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of a scalar ``x``.

    The negative branch uses the algebraically equivalent ``e**x / (1 + e**x)``
    so that ``math.exp`` never receives a large positive argument; the direct
    formula raises ``OverflowError`` for strongly negative ``x``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [21]:
##Testing
#Test 1 sentence
#Test 1 batch
#Test all 
#file = '../processed_data/giga_only/capital_corpus_stopwords_c10.txt'
# Text file with one "label:context" sentence per line.
file = '../demo/inference/clean_raffles.txt'
with open(file,'r') as f:
    infer_data = f.read().split('\n')
# Drop empty lines (e.g. the one produced by a trailing newline).
random_batch = [x for x in infer_data if x != '']
context_embeddings = []
for sentence in random_batch:
    # Split on the first ':' only, so a ':' inside the context stays in the context.
    y,x = sentence.split(':', 1)
    # Raw string: '\ ' in a plain literal is an invalid escape sequence.
    # The class still matches newline, CR and space.
    x = re.sub(r'[\n\r ]+',' ',x).strip()

    input_ids = [token_to_id(word) for word in x.split(' ')]
    context_embedding = roberta_embeddings[input_ids].detach()
    context_embeddings.append(context_embedding)
# torch.stack needs every sentence to contain the same number of context tokens.
feat = torch.stack(context_embeddings)
# Inference only: no autograd graph is needed for the forward pass.
with torch.no_grad():
    output = model_to_test(feat)
# Average the predictions over all sentences into a single row vector;
# -1 lets the width follow the model's output size instead of a fixed 768.
vec_output = torch.mean(output,dim=0).detach().numpy().reshape(1,-1)

roberta_embedding_numpy = roberta_embeddings.detach().numpy()

def __distance(W, vocab, ivocab, vec_output):
    """Print the 10 rows of ``W`` that score highest against ``vec_output``.

    Args:
        W: 2-D embedding matrix, one row per vocabulary entry.
        vocab: not used inside this function.
        ivocab: mapping from a row index of ``W`` to the token string to print.
        vec_output: query vector of shape (1, dim); the ``squeeze(1)`` below
            relies on that single leading row.

    The printed score is the sigmoid of the raw dot product. It is not a
    cosine value, although the column header reads "Unnormalized Cosine
    distance".
    """

    
    # Raw dot product of every row of W with the query vector.
    dist = np.dot(W, vec_output.T).squeeze(1)
    # Squash the scores into (0, 1); this is monotonic, but large dot products
    # saturate towards 1.0 (see the printed table).
    dist = 1/(1+np.exp(-dist))
    # Indices of the 10 highest scores, best first.
    a = np.argsort(-dist)[:10]

    print("\n                               Word       Unnormalized Cosine distance\n")
    print("---------------------------------------------------------\n")
    for i,x in enumerate(a):
        print("%d%35s\t\t%f" % (i,ivocab[x], dist[x]))
print(f"Test 1 -- \n\n")

# The inverse vocabulary (token id -> token string) is built inline for display.
__distance(roberta_embedding_numpy,vocab,{v:k for k,v in vocab.items()},vec_output)


Test 1 -- 



                               Word       Unnormalized Cosine distance

---------------------------------------------------------

0                            capital		1.000000
1                            Capital		0.999973
2                           Ġcapital		0.999890
3                           ĠCapital		0.999540
4                          Ġcapitals		0.996933
5                         capitalist		0.994241
6                          resources		0.992437
7                          financial		0.989846
8                            liberal		0.989268
9                         industrial		0.988497


In [15]:
# Scratch check: how the tokenizer splits the string 'apital' into sub-word pieces.
tokenizer.tokenize('apital')

['ap', 'ital']

In [53]:
# Shape of the model output after averaging over dim 0.
torch.mean(output,dim=0).size()

torch.Size([768])

In [45]:
# Scratch check: how the tokenizer splits the word "location".
tokenizer.tokenize("location")

['location']

In [34]:
# Display the current value of `input_ids` — presumably the ids of the last
# sentence handled by the inference loop; depends on cell execution order.
input_ids

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 31135, 3, 3, 1, 1, 1, 1, 1, 1, 1]

In [None]:
# NOTE(review): this cell has no execution count and uses get_glove_vec, W_norm,
# ivocab and random_sent, none of which are defined in the visible notebook —
# it looks like a leftover from a GloVe-based version; confirm before running.
# `x` and `y` are whatever an earlier cell left behind.
sample_tensor = torch.Tensor([[get_glove_vec(word,W_norm,vocab) for word in x.split(' ')]])



sample_output = model_to_test(sample_tensor)
target_label = np.array(get_glove_vec(y,W_norm,vocab))

output = sample_output.squeeze(1)
vec_output = output.detach().numpy()
print(vec_output.shape)

# NOTE(review): second definition of __distance in this notebook; running this
# cell replaces the earlier one. This version prints raw dot products (no
# sigmoid) and looks tokens up with string keys: ivocab[str(x)].
def __distance(W, vocab, ivocab, vec_output):


    # Raw dot product of every row of W with the query vector.
    dist = np.dot(W, vec_output.T).squeeze(1)
    print(dist.shape)
    # Indices of the 10 highest scores, best first.
    a = np.argsort(-dist)[:10]

    print("\n                               Word       Unnormalized Cosine distance\n")
    print("---------------------------------------------------------\n")
    for i,x in enumerate(a):
        print("%d%35s\t\t%f" % (i,ivocab[str(x)], dist[x]))
print(f"Test 1 -- sample sentence: \n\n{random_sent}\n\n")

__distance(W_norm,vocab,ivocab,vec_output)


# Cosine similarity between the model output and the target label vector.
print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")

In [83]:
# Reload the pretrained RoBERTa model (rebinding roberta_model) and print its module structure.
roberta_model = AutoModel.from_pretrained('roberta-base')
print(roberta_model)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop