As with GloVe, the pre-processing stage (Sections 1 and 2) is largely the same. We therefore assume that a pre-processed text corpus is already available in the ./processed_data folder.

## <a class="anchor" id="s3">Section 3: Training with Pytorch and Wandb</a>

In [1]:
import json
import numpy as np
import wandb
import re
import random
from datetime import date

# Start a Weights & Biases run under the 'Synthetic Net Roberta' project.
wandb.init(project='Synthetic Net Roberta')
# Handle on the run's config object; later cells attach hyperparameters
# (batch_size, lr, momentum, epochs) to the name `config`.
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: sosig_catto (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.3 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [2]:
import torch
import torch.nn as nn

class DenseNet(nn.Module):
    """Feed-forward network mapping a window of embeddings to one embedding.

    Each sample is flattened to ``2 * context_length * embed_size`` features,
    passed through two ReLU hidden layers (2048 and 512 units) and projected
    to ``embed_size`` outputs squashed into [-1, 1] by ``tanh``.

    Args:
        context_length: Half the number of embeddings per sample; the network
            expects ``2 * context_length`` embeddings (presumably
            ``context_length`` on each side of the target word).
        embed_size: Size of each input embedding and of the output vector.
    """

    def __init__(self,context_length,embed_size=100):
        super().__init__()
        self.n = context_length * 2
        self.embed_size = embed_size
        self.act = nn.ReLU()
        self.out = nn.Tanh()
        self.hidden1 = nn.Linear(self.n * self.embed_size, 2048)
        self.hidden2 = nn.Linear(2048, 512)
        self.hidden3 = nn.Linear(512, self.embed_size)

    def forward(self,x):
        """Return a ``(batch, embed_size)`` prediction for a batch of windows.

        Args:
            x: Tensor whose first dimension is the batch; the remaining
                dimensions must hold ``2 * context_length * embed_size``
                values per sample.
        """
        # reshape (not view) so non-contiguous inputs, e.g. transposed or
        # sliced tensors, are flattened instead of raising a RuntimeError.
        x = x.reshape(x.size(0), -1)
        x = self.act(self.hidden1(x))
        x = self.act(self.hidden2(x))
        x = self.out(self.hidden3(x))
        return x
    
    
# 768-dimensional embeddings with context_length=10: each sample flattens to
# 2 * 10 * 768 = 15360 input features.
model = DenseNet(context_length = 10,embed_size=768)
# Show the layer summary.
print(model)

DenseNet(
  (act): ReLU()
  (out): Tanh()
  (hidden1): Linear(in_features=15360, out_features=2048, bias=True)
  (hidden2): Linear(in_features=2048, out_features=512, bias=True)
  (hidden3): Linear(in_features=512, out_features=768, bias=True)
)


In [3]:
#setup roberta
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
checkpoint='roberta-base'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Bound to `roberta_config`, not `config`: the name `config` already holds the
# wandb.config handle that the hyperparameter cells write to (batch_size, lr,
# momentum, epochs). Rebinding it here would stop those values reaching wandb.
roberta_config = AutoConfig.from_pretrained(checkpoint)
roberta_model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Token string -> token id mapping taken from the tokenizer.
vocab = tokenizer.vocab
# Word-embedding weight matrix of the pretrained model; rows are looked up by token id.
roberta_embeddings = roberta_model.embeddings.word_embeddings.weight
# Ids of the padding and unknown tokens, in that order: [pad_id, unk_id].
special_token_ids = tokenizer.convert_tokens_to_ids([tokenizer.pad_token,tokenizer.unk_token])

In [10]:
from util import *

#Loading the data

# Batch size, stored on `config` and passed to the loader below.
config.batch_size = 64

# Words whose corpora are used for training; one corpus file per word.
target_words_list = ['tired'] # can be multiple words

# One pre-processed corpus file per target word.
training_files = [f'./processed_data/{x}_corpus_c10.txt' for x in target_words_list]
# `load_training_batch` is not defined in this notebook (presumably provided by
# `from util import *` — confirm). The result is later sliced and iterated as
# batches of `label:context` strings.
training_data = load_training_batch(training_files,config.batch_size)

def token_to_id(word):
    """Map a token string to its vocabulary id.

    Tokens present in ``vocab`` resolve directly. Otherwise the tokenizer's
    pad token maps to the pad id and every other token to the unknown id.
    """
    if word in vocab:
        return vocab[word]
    # special_token_ids is ordered [pad_id, unk_id].
    fallback_index = 0 if word == tokenizer.pad_token else 1
    return special_token_ids[fallback_index]
        
def get_roberta_embedding(train_data):
    """Convert batches of ``label:context`` lines into embedding tensors.

    Each sentence is split on its first ``:`` into a label word and a
    space-separated context. The context tokens and the label are mapped to
    ids with ``token_to_id`` and looked up in ``roberta_embeddings``.

    Args:
        train_data: Iterable of batches, each an iterable of
            ``label:context`` strings.

    Returns:
        A list with one ``(context_tensors, label_tensors)`` tuple per batch;
        both members are lists holding one detached tensor per sentence.
        If a sentence cannot be processed, the sentence and the error are
        printed and ``None`` is returned.
    """
    # Raw string: the previous non-raw pattern relied on the invalid escape
    # sequence "\ ", which newer Python versions warn about.
    whitespace_run = re.compile(r'[\n\r ]+')
    train_tensor = []
    for batch in train_data:
        batch_tensor_list = []
        batch_label_list = []
        for sentence in batch:
            try:
                # Split on the first ':' only, so a colon inside the context
                # does not break the two-way unpacking.
                y, x = sentence.split(':', 1)
                s = whitespace_run.sub(' ', x).strip()
                input_ids = [token_to_id(word) for word in s.split(' ')]
                context_embeddings = roberta_embeddings[input_ids].detach()
                batch_tensor_list.append(context_embeddings)
                y_id = token_to_id(y)
                label_embedding = roberta_embeddings[y_id].detach()
                batch_label_list.append(label_embedding)
            except Exception as e:
                # Abort on the first bad sentence, reporting it to the user.
                print(sentence)
                print(e)
                return None
        train_tensor.append((batch_tensor_list, batch_label_list))
    return train_tensor

# train_tensor: list with one (context_tensors, label_tensors) tuple per batch,
# where each member is a list of tensors (one per sentence).

# Debug run: embed only the first batch.
train_tensor = get_roberta_embedding(training_data[:1])
# For a full training run, drop the `[:1]` slice above.

In [12]:
# Number of batches that were embedded.
print(len(train_tensor))

1


In [7]:
#hyper param
import torch.optim as optim


# Optimizer hyperparameters, stored on `config`.
config.lr = 0.0005
config.momentum = 0.005
# SGD over all DenseNet parameters, with weight decay 0.01.
optimizer = optim.SGD(
    model.parameters(),
    lr=config.lr,momentum=config.momentum,
    weight_decay=0.01
)
# L1 (absolute error) loss between predicted and target embeddings.
criterion = nn.L1Loss()

def cosim(v1,v2):
    """Return the cosine similarity of two vectors: dot(v1, v2) / (|v1| * |v2|)."""
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product

#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, debug_set.shape[0], eta_min=config.lr)
#learning rate adjustment -- try 0.001



In [13]:
#training loop

from tqdm import tqdm

config.epochs = 1

best_valid_loss = float('inf')

for epoch in tqdm(range(config.epochs)):

    # Running totals for this epoch: summed batch loss and summed per-batch
    # mean cosine similarity.
    epoch_loss = 0
    cosim_score = 0
    model.train()

    for features, labels in train_tensor:
        optimizer.zero_grad()
        # Each batch holds lists of per-sentence tensors; stack into one tensor.
        feat = torch.stack(features)
        label = torch.stack(labels)

        predictions = model(feat).squeeze(1)
        loss = criterion(predictions, label)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Pair up the rows actually present in this batch. Indexing with
        # range(config.batch_size) raised IndexError whenever a batch
        # (typically the last) held fewer samples than the configured size.
        label_rows = label.detach().numpy()
        prediction_rows = predictions.detach().numpy()
        t_cosim_score = np.mean(
            [cosim(target, pred) for target, pred in zip(label_rows, prediction_rows)]
        )
        cosim_score += t_cosim_score
    # Train loss is the sum of batch losses; cosine score is averaged over batches.
    train_loss = epoch_loss
    # max(..., 1) avoids a ZeroDivisionError when there are no batches.
    cosim_score = cosim_score / max(len(train_tensor), 1)
    wandb.log({"loss":train_loss,"cosim_score":cosim_score})
    print(f'Epoch:{epoch+1:02}\t|\tTrain Loss: {train_loss:.3f}\t|\tCosim score: {cosim_score:.3f}')

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.40it/s]

Epoch:01	|	Train Loss: 0.030	|	Cosim score: -0.089





In [None]:
# Checkpoint file name: output/<YYYY-MM>_<config.data>_<wandb run name>.pt
# NOTE(review): `config.data` is not assigned in the cells visible here —
# confirm it is set before this cell runs.
month_stamp = date.today().strftime("%Y-%m")
save_path = f'output/{month_stamp}_{config.data}_{wandb.run.name}.pt'
torch.save(model.state_dict(), save_path)
# Close the wandb run.
wandb.finish()

## <a class="anchor" id="s4">Section 4: Testing</a>

In [32]:
## section 4: Testing using analogy tests
# Build a network with the same dimensions as the training model, then
# restore the saved weights into it.
model_to_test = DenseNet(context_length=10, embed_size=768)
saved_weights = torch.load('./output/2021-10_roberta_capital_spring-dust-35.pt')
model_to_test.load_state_dict(saved_weights)

<All keys matched successfully>

In [33]:
file = './processed_data/pointless_corpus_c10.txt'
with open(file,'r') as f:
    infer_data = f.read().split('\n')
# Drop empty lines (e.g. the one produced by a trailing newline).
random_batch = [x for x in infer_data if x != '']
context_embeddings = []
for sentence in random_batch:
    # Lines are `label:context`. Split on the first ':' only, so a colon
    # inside the context does not break the unpacking.
    y, x = sentence.split(':', 1)
    # Raw string avoids the invalid "\ " escape of the previous pattern.
    x = re.sub(r'[\n\r ]+', ' ', x).strip()

    input_ids = [token_to_id(word) for word in x.split(' ')]
    context_embedding = roberta_embeddings[input_ids].detach()
    context_embeddings.append(context_embedding)
feat = torch.stack(context_embeddings)
# Inference only: no need to record an autograd graph.
with torch.no_grad():
    output = model_to_test(feat)
# Average the per-sentence predictions into a single (1, embed_size) query vector.
vec_output = torch.mean(output, dim=0).detach().numpy().reshape(1, -1)

roberta_embedding_numpy = roberta_embeddings.detach().numpy()

def __distance(W, vocab, ivocab, vec_output, top_k=10):
    """Print the ``top_k`` rows of ``W`` that score highest against a query.

    The score of a row is its dot product with ``vec_output``. The value
    printed next to each word is that score passed through a logistic sigmoid.

    Args:
        W: 2-D array with one embedding per row.
        vocab: Unused; kept so existing calls keep working.
        ivocab: Mapping from row index to the word printed for that row.
        vec_output: Query array of shape ``(1, dim)``.
        top_k: Number of rows to print (default 10).
    """
    scores = np.dot(W, vec_output.T).squeeze(1)
    # Rank on the raw scores. The sigmoid saturates to exactly 1.0 for large
    # scores, so ranking on the squashed values would order the best matches
    # arbitrarily once they tie.
    a = np.argsort(-scores)[:top_k]
    dist = 1/(1+np.exp(-scores))

    print("\n                               Word       Unnormalized Cosine distance\n")
    print("---------------------------------------------------------\n")
    for i,x in enumerate(a):
        print("%d%35s\t\t%f" % (i,ivocab[x], dist[x]))
print("Test 1 -- \n\n")

# Invert the token -> id vocabulary so ranked row indices print as tokens.
id_to_token = dict(zip(vocab.values(), vocab.keys()))
__distance(roberta_embedding_numpy, vocab, id_to_token, vec_output)

Test 1 -- 



                               Word       Unnormalized Cosine distance

---------------------------------------------------------

0                            capital		1.000000
1                            Capital		0.999988
2                           Ġcapital		0.999946
3                           ĠCapital		0.999749
4                          Ġcapitals		0.998047
5                         capitalist		0.996167
6                          resources		0.994834
7                          financial		0.992938
8                            liberal		0.992446
9                         industrial		0.991878


The inference pipeline is similar to testing, with a slight modification to the network as shown above. The files crawled into ./inference will work with this notebook.

### For modification of RoBERTa, please check the "Roberta Modification Demo" notebook