In [1]:
"""
    Sample: 
        config.data = "wiki_data, word cats and pneumonia"
        config.loss = "L1"
        config.simscore = ""
        config.batch_size
        config.epoch
        config.lr
        config.momentum (since we are using SGD)
"""
print()




In [2]:
import json
import numpy as np
import wandb
import re
import random
from datetime import date

wandb.init(project='Synthetic Net Roberta')
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: sosig_catto (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.3 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [3]:
import torch
import torch.nn as nn

class DenseNet(nn.Module):
    def __init__(self,context_length,embed_size=100):
        super().__init__()
        self.n = context_length*2
        self.embed_size = embed_size
        self.act = nn.ReLU()
        self.out = nn.Tanh() 
        self.hidden1 = nn.Linear(self.n*self.embed_size,2048)
        self.hidden2 = nn.Linear(2048,512)
        self.hidden3 = nn.Linear(512,self.embed_size)
 
    def forward(self,x):
        x = x.view(x.size(0), -1)
        x = self.act(self.hidden1(x))
        x = self.act(self.hidden2(x))
        x = self.out(self.hidden3(x))
        return x
    
    
synmodel = DenseNet(context_length = 10,embed_size=768)
print(synmodel)

DenseNet(
  (act): ReLU()
  (out): Tanh()
  (hidden1): Linear(in_features=15360, out_features=2048, bias=True)
  (hidden2): Linear(in_features=2048, out_features=512, bias=True)
  (hidden3): Linear(in_features=512, out_features=768, bias=True)
)


In [4]:
#setup roberta
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
checkpoint='roberta-base'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
vocab = tokenizer.vocab
roberta_embeddings = model.embeddings.word_embeddings.weight
special_token_ids = tokenizer.convert_tokens_to_ids([tokenizer.pad_token,tokenizer.unk_token])

In [6]:
roberta_embeddings[[1,2,3]].size()

torch.Size([3, 768])

In [7]:
from util import *

#Loading the data
    
config.batch_size = 64

corpus = 'giga'

training_files = [f'../processed_data/{corpus}_only/{x}_corpus_stopwords_c10.txt' for x in ['disease','pneumonia','vaccine','virus','sick']]
training_data = load_training_batch(training_files,config.batch_size)
#config.data = "wiki_only_5words-covid"
def token_to_id(word):
    if word in vocab:
        return vocab[word]
    elif word == tokenizer.pad_token:
        return special_token_ids[0]
    else:
        return special_token_ids[1]
        
def get_roberta_embedding(train_data):
    train_tensor = []
    for i,batch in enumerate(train_data):
        batch_tensor_list = []
        batch_label_list = []
        for sentence in batch:
            try:
                y,x = sentence.split(':')
                s = re.sub('[\n\r\ ]+',' ',x).strip()
                input_ids = [token_to_id(word) for word in s.split(' ')]
                context_embeddings = roberta_embeddings[input_ids]
                batch_tensor_list.append(context_embeddings)
                y_id = token_to_id(y)
                label_embedding = roberta_embeddings[y_id]
                batch_label_list.append(label_embedding)
            except Exception as e:
                print(sentence)
                print(e)
                return
        train_tensor.append((torch.stack(batch_tensor_list),torch.stack(batch_label_list)))
    return train_tensor

#list of tuples of tensors

train_tensor = get_roberta_embedding(training_data[:1])

In [8]:
print(len(train_tensor))   #--num batch
print(len(train_tensor[0]))   #tuple
print(train_tensor[0][0].size())  #input_ids
print(train_tensor[0][1].size())  #label_ids
print(train_tensor[0][0][0].size())  #input_ids
print(train_tensor[0][1][0].size())   #batch 0, label, first_eg of that batch 's label
print(train_tensor[0][0][0][0].size()) #batch 0, input_ids, first_eg of that batch -- 20 tensors, choose the first one


1
2
torch.Size([64, 20, 768])
torch.Size([64, 768])
torch.Size([20, 768])
torch.Size([768])
torch.Size([768])


In [9]:
import torch.optim as optim


config.lr = 0.0005
config.momentum = 0.005
optimizer = optim.SGD(model.parameters(),lr=config.lr,momentum=config.momentum,weight_decay=0.01)
criterion = nn.L1Loss()

def cosim(v1,v2):
    return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, debug_set.shape[0], eta_min=config.lr)
#learning rate adjustment -- try 0.001

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        features,labels = batch
        feat = features
        label = labels
        print(feat.size())
        print(label.size())
        predictions = model(feat).squeeze(1)
        loss = criterion(predictions,label)  
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
        
        cosim_score = np.mean([cosim(labels[i].detach().numpy(),predictions[i].detach().numpy()) for i in range(config.batch_size) ])
        
    return epoch_loss,cosim_score

In [10]:
from tqdm import tqdm

config.epochs = 1

best_valid_loss = float('inf')

for epoch in tqdm(range(config.epochs)):   
    train_loss,cosim_score= train(synmodel,iter(train_tensor), optimizer, criterion)

    #epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #if valid_loss < best_valid_loss:
     #   best_valid_loss = valid_loss
      #  torch.save(model.state_dict(), 'tut1-model.pt')
    
    #print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    wandb.log({"loss":train_loss,"cosim_score":cosim_score})
    print(f'Epoch:{epoch+1:02}\t|\tTrain Loss: {train_loss:.3f}\t|\tCosim score: {cosim_score:.3f}')

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

torch.Size([64, 20, 768])
torch.Size([64, 768])


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.84s/it]

Epoch:01	|	Train Loss: 0.029	|	Cosim score: -0.005





In [11]:
import os
files_available = [x[:x.find('_')] for x in os.listdir(f'../processed_data/{corpus}_only/') if x.endswith('.txt')]

In [15]:
def negative_sample(target,files,n=4):
    negatives = [x for x in files if x != target]
    return [target] + random.sample(negatives,n)

d = {x:negative_sample(x,files_available,2) for x in ['disease','pneumonia','sick']}

In [16]:
d

{'disease': ['disease', 'pneumonia', 'vaccine'],
 'pneumonia': ['pneumonia', 'virus', 'sick'],
 'sick': ['sick', 'pneumonia', 'disease']}