Standardize for each experiment:
* What data is it run on
* What are the loss objective and metrics (cosine sim)
* Running epoch and results 
* Versioning of data and model (offline, use wandb-id)
    

In [9]:
"""
    Sample: 
        config.data = "wiki_data, word cats and pneumonia"
        config.loss = "L1"
        config.simscore = ""
        config.batch_size
        config.epoch
        config.lr
        config.momentum (since we are using SGD)
"""
print()




In [10]:
import json
import numpy as np
import wandb
import re
import random
from datetime import date

wandb.init(project='Synthetic Net Data')
config = wandb.config

wandb: wandb version 0.12.3 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [11]:
import torch
import torch.nn as nn

class DenseNet(nn.Module):
    def __init__(self,context_length,embed_size=100):
        super().__init__()
        self.n = context_length*2
        self.embed_size = 100
        self.act = nn.ReLU()
        self.out = nn.Tanh() 
        self.hidden1 = nn.Linear(self.n*self.embed_size,2048)
        self.hidden2 = nn.Linear(2048,512)
        self.hidden3 = nn.Linear(512,self.embed_size)
 
    def forward(self,x):
        x = x.view(x.size(0), -1)
        x = self.act(self.hidden1(x))
        x = self.act(self.hidden2(x))
        x = self.out(self.hidden3(x))
        return x
    
    
model = DenseNet(context_length = 10)
print(model)

DenseNet(
  (act): ReLU()
  (out): Tanh()
  (hidden1): Linear(in_features=2000, out_features=2048, bias=True)
  (hidden2): Linear(in_features=2048, out_features=512, bias=True)
  (hidden3): Linear(in_features=512, out_features=100, bias=True)
)


In [13]:
from util import *

#Loading the data
W_norm,vocab,ivocab = load_glove()
    
config.batch_size = 64

corpus = 'giga'

training_files = [f'../processed_data/{corpus}_only/{x}_corpus_stopwords_c10.txt' for x in ['disease','pneumonia','vaccine','virus','sick']]
training_data = load_training_batch(training_files,config.batch_size)
#config.data = "wiki_only_5words-covid"

def debug_get_embedding(train_data,W_norm,vocab):
    train_tensor = []
    for i,batch in enumerate(train_data):
        tensor_list = []
        label_list = []
        for sentence in batch:
            try:
                y,x = sentence.split(':')
                s = re.sub('[\n\r\ ]+',' ',x).strip()
                tensor_list.append([get_glove_vec(word,W_norm,vocab) for word in s.split(' ')])
                label_list.append(get_glove_vec(y,W_norm,vocab))
            except:
                print(sentence)
        train_tensor.append((np.array(tensor_list),np.array(label_list)))
    return train_tensor


train_tensor = debug_get_embedding(training_data,W_norm,vocab)



disease:disease:<pad> <pad> <pad> <pad> <pad> <pad> also people worried r repeat sa movement <pad> <pad> <pad> <pad> <pad> <pad> <pad> 



In [5]:
import torch.optim as optim


config.lr = 0.0005
config.momentum = 0.005
optimizer = optim.SGD(model.parameters(),lr=config.lr,momentum=config.momentum,weight_decay=0.01)
criterion = nn.L1Loss()

def cosim(v1,v2):
    return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, debug_set.shape[0], eta_min=config.lr)
#learning rate adjustment -- try 0.001

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        features,labels = batch
        batch_size = features.shape[0]
        predictions = model(torch.Tensor(features)).squeeze(1)
        loss = criterion(predictions,torch.Tensor(labels))      
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
        
        cosim_score = np.mean([cosim(labels[i],predictions[i].detach().numpy()) for i in range(batch_size) ])
        
    return epoch_loss,cosim_score

In [6]:
from tqdm import tqdm

config.epochs = 50

best_valid_loss = float('inf')

for epoch in tqdm(range(config.epochs)):   
    train_loss,cosim_score= train(model,iter(train_tensor), optimizer, criterion)

    #epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #if valid_loss < best_valid_loss:
     #   best_valid_loss = valid_loss
      #  torch.save(model.state_dict(), 'tut1-model.pt')
    
    #print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    wandb.log({"loss":train_loss,"cosim_score":cosim_score})
    print(f'Epoch:{epoch+1:02}\t|\tTrain Loss: {train_loss:.3f}\t|\tCosim score: {cosim_score:.3f}')


  2%|█▌                                                                              | 1/50 [01:46<1:26:36, 106.05s/it]

Epoch:01	|	Train Loss: 271.078	|	Cosim score: 0.426


  4%|███▏                                                                            | 2/50 [03:33<1:25:22, 106.72s/it]

Epoch:02	|	Train Loss: 223.222	|	Cosim score: 0.761


  4%|███▏                                                                            | 2/50 [03:40<1:28:00, 110.01s/it]


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(),f'../outputs/{date.today().strftime("%Y-%m")}_{config.data}_{wandb.run.name}.pt')

In [16]:
model_to_test =  DenseNet(context_length = 10)
model_to_test.load_state_dict(torch.load('../outputs/2021-08_wiki_only_pneumonia_winter-firefly-6.pt'))

<All keys matched successfully>

In [17]:
##Testing
#Test 1 sentence
#Test 1 batch
#Test all 

random_sent = random.choice(training_data)[random.randint(0,config.batch_size-1)]
y,x = random_sent.split(':')
x = re.sub('[\n\r\ ]+',' ',x).strip()
sample_tensor = torch.Tensor([[get_glove_vec(word,W_norm,vocab) for word in x.split(' ')]])
sample_output = model_to_test(sample_tensor)
target_label = np.array(get_glove_vec(y,W_norm,vocab))

output = sample_output.squeeze(1)
vec_output = output.detach().numpy()
print(vec_output.shape)

def __distance(W, vocab, ivocab, vec_output):


    dist = np.dot(W, vec_output.T).squeeze(1)
    print(dist.shape)
    a = np.argsort(-dist)[:10]

    print("\n                               Word       Unnormalized Cosine distance\n")
    print("---------------------------------------------------------\n")
    for i,x in enumerate(a):
        print("%d%35s\t\t%f" % (i,ivocab[str(x)], dist[x]))
print(f"Test 1 -- sample sentence: \n\n{random_sent}\n\n")

__distance(W_norm,vocab,ivocab,vec_output)


print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")


(1, 100)
Test 1 -- sample sentence: 

disease:<pad> <pad> <pad> deborah dainton suffers limp result polio treatment child left claustrophobic reclusive large crowds rigid controlled life transformed 



(400000,)

                               Word       Unnormalized Cosine distance

---------------------------------------------------------

0                          pneumonia		0.513979
1                         bronchitis		0.422000
2                          infection		0.391513
3                        respiratory		0.381901
4                            illness		0.379707
5                         infections		0.376967
6                            typhoid		0.373854
7                              fever		0.373324
8                       tuberculosis		0.371408
9                           atypical		0.363049


		Cosim score: [0.58541959]


In [None]:
random_batch = random.choice(training_data)
sample_batch_tensor = []
target_batch_tensor = []
for sentence in random_batch:
    y,x = random_sent.split(':')
    x = re.sub('[\n\r\ ]+',' ',x).strip()
    sample_tensor = [get_glove_vec(word,W_norm,vocab) for word in x.split(' ')]
    target_batch_tensor.append(get_glove_vec(y,W_norm,vocab))
    sample_batch_tensor.append(sample_tensor)
    
sample_batch_tensor = torch.Tensor(np.array(sample_batch_tensor))
target_batch_tensor = np.array(target_batch_tensor)

sample_output = model(sample_batch_tensor)

output = torch.mean(sample_output,0)   #sum across embeddings
vec_output = output.detach().numpy().reshape((1,100))

print(f"Test 2 -- sample batch: \n\n")

__distance(W_norm,vocab,ivocab,vec_output)


print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")

In [None]:
###Test 3: Custom

##Testing
#Test 1 sentence
#Test 1 batch
#Test all 


random_sent = 'pacific disaster response fund support armenian government fight spread covid year bank committed million loan electric networks armenia ensure electricity '
target_word = 'pneumonia'
target_label = np.array(get_glove_vec(target_word,W_norm,vocab))
random_sent = re.sub('[\n\r\ ]+',' ',random_sent).strip()

sample_tensor = torch.Tensor([[get_glove_vec(word,W_norm,vocab) for word in random_sent.split(' ')]])
sample_output = model(sample_tensor)
output = sample_output.squeeze(1)
vec_output = output.detach().numpy()
print(f"Test 3: Custom Test\n\n{random_sent}\n\n")
__distance(W_norm,vocab,ivocab,vec_output)
print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")


In [None]:
sample_tensor.size()