In [1]:
from transformers import AutoTokenizer
from transformers import AutoModel

In [38]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base",output_hidden_states = True)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
test_sent = "pfizer is working well"
print(tokenizer.tokenize(test_sent)) #check the tokenize

print(tokenizer(test_sent))
len_ids = len(tokenizer(test_sent)['input_ids'])

['p', 'f', 'izer', 'Ġis', 'Ġworking', 'Ġwell']
{'input_ids': [0, 642, 506, 6315, 16, 447, 157, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [34]:
test_sent = "pneumonia"
print(tokenizer.tokenize(test_sent)) #check the tokenize

print(tokenizer(test_sent))
len_ids = len(tokenizer(test_sent)['input_ids'])

['p', 'neum', 'onia']
{'input_ids': [0, 642, 45042, 15402, 2], 'attention_mask': [1, 1, 1, 1, 1]}


In [15]:
import json
import numpy as np
import wandb
import re
import random
from datetime import date

wandb.init(project='Roberta SynNet Prototype')
config = wandb.config

wandb: Currently logged in as: sosig_catto (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.0 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [39]:
import torch
import torch.nn as nn

class DenseRobertaNet(nn.Module):
    def __init__(self,context_length,embed_size=100):
        super().__init__()
        self.n = context_length*2
        self.embed_size = embed_size
        self.act = nn.ReLU()
        self.out = nn.Tanh() 
        self.hidden1 = nn.Linear(self.n*self.embed_size,2048)
        self.hidden2 = nn.Linear(2048,1024)
        self.hidden3 = nn.Linear(1024,self.embed_size)
 
    def forward(self,x):
        x = x.view(x.size(0), -1)
        x = self.act(self.hidden1(x))
        x = self.act(self.hidden2(x))
        x = self.out(self.hidden3(x))
        return x
    
    
synmodel = DenseRobertaNet(context_length = 10,embed_size = 768)
print(synmodel)

DenseRobertaNet(
  (act): ReLU()
  (out): Tanh()
  (hidden1): Linear(in_features=15360, out_features=2048, bias=True)
  (hidden2): Linear(in_features=2048, out_features=1024, bias=True)
  (hidden3): Linear(in_features=1024, out_features=768, bias=True)
)


In [21]:
from util import *

#Loading the data
W_norm,vocab,ivocab = load_glove()
    
config.batch_size = 64

training_files = '../processed_data/wiki_only/infection_corpus_stopwords_c10.txt' 
training_data = load_training_batch(training_files,config.batch_size)
config.data = "wiki_only_infection"

train_tensor = []
for i,batch in enumerate(training_data):
    data_list = []
    label_list = []
    for sentence in batch:
        y,x = sentence.split(':')
        s = re.sub('[\n\r\ ]+',' ',x).strip()
        data_list.append(s)
        label_list.append(y)
    train_tensor.append((data_list,label_list))


In [26]:
test_tensor = train_tensor[0]

In [52]:
def get_roberta_embedding(word):  #data is list of string
    ids = tokenizer(s)["input_ids"]
    len_ids = len(ids)
    with torch.no_grad():
        outputs = model(torch.tensor([ids]))
        hidden_states = outputs[2]

    token_vecs_sum = []
    for i in range(len_ids):
        word_embed = hidden_states[-3][0][i] + hidden_states[-2][0][i] + hidden_states[-1][0][i]
        token_vecs_sum.append(word_embed.detach().numpy())
    word_vec = np.sum(token_vecs_sum,axis = 0)
    return word_vec

In [59]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


In [91]:
import torch.optim as optim


config.lr = 0.0005
config.momentum = 0.005
optimizer = optim.SGD(model.parameters(),lr=config.lr,momentum=config.momentum,weight_decay=0.01)
criterion = nn.L1Loss()

def cosim(v1,v2):
    return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, debug_set.shape[0], eta_min=config.lr)
#learning rate adjustment -- try 0.001

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        data,y = batch
        
        features = []
        labels = []
        #get the embedding for the word as well as the sentence
        for i,s in enumerate(data):
            s_x = s.split(' ')
            features.append([get_roberta_embedding(x) for x in s_x])
            labels.append(get_roberta_embedding(y[i]))
        #print(len(features),len(labels))
        
        feat = torch.Tensor(features).to(device)
        label = torch.Tensor(labels).to(device)
        
        predictions = model(feat).squeeze(1)
        loss = criterion(predictions,label)      
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
        
        cosim_score = np.mean([cosim(labels[i],predictions[i].cpu().detach().numpy()) for i in range(config.batch_size) ])
        
    return epoch_loss, cosim_score

In [92]:
debugsor = train_tensor[:8]

In [93]:
from tqdm import tqdm

config.epochs = 2

best_valid_loss = float('inf')

for epoch in tqdm(range(config.epochs)):   
    train_loss,score= train(synmodel.to(device),iter(debugsor), optimizer, criterion)

    #epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #if valid_loss < best_valid_loss:
     #   best_valid_loss = valid_loss
      #  torch.save(model.state_dict(), 'tut1-model.pt')
    
    #print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    wandb.log({"loss":train_loss,"cosim_score":score})
    print(f'Epoch:{epoch+1:02}\t|\tTrain Loss: {train_loss:.3f}\t|\tCosim score: {score:.3f}')


 50%|█████████████████████████████████████████▌                                         | 1/2 [07:44<07:44, 464.61s/it]

Epoch:01	|	Train Loss: 100.676	|	Cosim score: -0.060


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [15:29<00:00, 464.73s/it]

Epoch:02	|	Train Loss: 101.019	|	Cosim score: -0.060



wandb: Network error resolved after 0:00:38.062610, resuming normal operation.
