Standardize for each experiment:
* What data is it run on
* What are the loss objective and metrics (cosine sim)
* Running epoch and results 
* Versioning of data and model (offline, use wandb-id)
    

In [1]:
"""
    Sample: 
        config.data = "wiki_data, word cats and pneumonia"
        config.loss = "L1"
        config.simscore = ""
        config.batch_size
        config.epoch
        config.lr
        config.momentum (since we are using SGD)
"""
print()




In [2]:
import json
import numpy as np
import wandb
import re
import random
from datetime import date


In [3]:
import torch
import torch.nn as nn

class DenseNet(nn.Module):
    def __init__(self,context_length,embed_size=100):
        super().__init__()
        self.n = context_length*2
        self.embed_size = 100
        self.act = nn.ReLU()
        self.out = nn.Tanh() 
        self.hidden1 = nn.Linear(self.n*self.embed_size,2048)
        self.hidden2 = nn.Linear(2048,512)
        self.hidden3 = nn.Linear(512,self.embed_size)
 
    def forward(self,x):
        x = x.view(x.size(0), -1)
        x = self.act(self.hidden1(x))
        x = self.act(self.hidden2(x))
        x = self.out(self.hidden3(x))
        return x
    
    

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
from util import *
import torch.optim as optim
from tqdm import tqdm

In [6]:

import random
#Loading the data
W_norm,vocab,ivocab = load_glove()
    
def negative_sample(samples,target,num=4):
    negative_samples = [x  for x in samples if x != target]
    return [target] + random.sample(samples,num)

negative_sample = False

if negative_sample == True:
    raw_word_list = ['location','position','site','land','place','city','district','area','leader','president','governor','mentor','director','command',
                    'authority','influence','teacher','cat','dog','whale','computer','shark','university','class','speak','cute']



    chosen_words = ['disease' for _ in range(5)]
    word_lists = [negative_sample(raw_word_list,w,4) for w in chosen_words]
else:
    word_lists = ['disease','pneumonia','vaccine','virus','sick']





In [7]:
word_lists

['disease', 'pneumonia', 'vaccine', 'virus', 'sick']

In [None]:
for corpus in ['giga']:
    wandb.init(project='Synthetic Net Data')
    config = wandb.config
    config.batch_size = 64
    model = DenseNet(context_length = 10)
    
    training_files = [f'../processed_data/{corpus}_only/{y}_corpus_stopwords_c10.txt' for y in word_lists]
    training_data = load_training_batch(training_files,config.batch_size)
    config.data = f'{corpus}_only_{"-".join(word_lists)}'
    print(config.data)

    config.lr = 0.0005
    config.momentum = 0.005
    optimizer = optim.SGD(model.parameters(),lr=config.lr,momentum=config.momentum,weight_decay=0.01)
    criterion = nn.L1Loss()

    def cosim(v1,v2):
        return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

    #scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, debug_set.shape[0], eta_min=config.lr)
    #learning rate adjustment -- try 0.001

    def train(model, iterator, optimizer, criterion):

        epoch_loss = 0
        epoch_acc = 0

        model.train()
        for batch in iterator:
            optimizer.zero_grad()
            features,labels = get_batch_embedding(batch,W_norm,vocab) 
        
            feat = torch.Tensor(features).to(device)
            label = torch.Tensor(labels).to(device)
            predictions = model(feat).squeeze(1)
            loss = criterion(predictions,label)      
            loss.backward()

            optimizer.step()
            epoch_loss += loss.item()

            cosim_score = np.mean([cosim(labels[i],predictions[i].cpu().detach().numpy()) for i in range(config.batch_size) ])

        return epoch_loss,cosim_score

    

    config.epochs = 40

    best_valid_loss = float('inf')

    for epoch in tqdm(range(config.epochs)):   
        train_loss,cosim_score= train(model.to(device),iter(training_data), optimizer, criterion)

        #epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        #if valid_loss < best_valid_loss:
         #   best_valid_loss = valid_loss
          #  torch.save(model.state_dict(), 'tut1-model.pt')

        #print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        wandb.log({"loss":train_loss,"cosim_score":cosim_score})
        print(f'Epoch:{epoch+1:02}\t|\tTrain Loss: {train_loss:.3f}\t|\tCosim score: {cosim_score:.3f}')


    torch.save(model.state_dict(),f'../outputs/{date.today().strftime("%Y-%m")}_{config.data}_{wandb.run.name}.pt')

wandb: wandb version 0.12.3 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


giga_only_disease-pneumonia-vaccine-virus-sick


  2%|██                                                                              | 1/40 [05:08<3:20:32, 308.52s/it]

Epoch:01	|	Train Loss: 837.671	|	Cosim score: 0.934


  5%|████                                                                            | 2/40 [10:11<3:13:29, 305.51s/it]

Epoch:02	|	Train Loss: 481.515	|	Cosim score: 0.984


  8%|██████                                                                          | 3/40 [15:15<3:07:45, 304.49s/it]

Epoch:03	|	Train Loss: 412.079	|	Cosim score: 0.994


 10%|████████                                                                        | 4/40 [20:19<3:02:34, 304.29s/it]

Epoch:04	|	Train Loss: 396.974	|	Cosim score: 0.997


 12%|██████████                                                                      | 5/40 [25:28<2:58:28, 305.96s/it]

Epoch:05	|	Train Loss: 388.916	|	Cosim score: 0.998


 15%|████████████                                                                    | 6/40 [30:32<2:53:05, 305.45s/it]

Epoch:06	|	Train Loss: 383.636	|	Cosim score: 0.999


 18%|██████████████                                                                  | 7/40 [35:37<2:47:58, 305.40s/it]

Epoch:07	|	Train Loss: 379.801	|	Cosim score: 0.999


 20%|████████████████                                                                | 8/40 [40:42<2:42:46, 305.21s/it]

Epoch:08	|	Train Loss: 376.942	|	Cosim score: 1.000


 22%|██████████████████                                                              | 9/40 [45:46<2:37:27, 304.75s/it]

Epoch:09	|	Train Loss: 374.755	|	Cosim score: 1.000


 25%|███████████████████▊                                                           | 10/40 [50:51<2:32:23, 304.77s/it]

Epoch:10	|	Train Loss: 373.036	|	Cosim score: 1.000


 28%|█████████████████████▋                                                         | 11/40 [55:55<2:27:17, 304.73s/it]

Epoch:11	|	Train Loss: 371.665	|	Cosim score: 1.000


 30%|███████████████████████                                                      | 12/40 [1:01:00<2:22:07, 304.56s/it]

Epoch:12	|	Train Loss: 370.561	|	Cosim score: 1.000


 32%|█████████████████████████                                                    | 13/40 [1:06:03<2:16:55, 304.27s/it]

Epoch:13	|	Train Loss: 369.665	|	Cosim score: 1.000


 35%|██████████████████████████▉                                                  | 14/40 [1:11:25<2:14:09, 309.62s/it]

Epoch:14	|	Train Loss: 368.930	|	Cosim score: 1.000


In [11]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
##Testing
#Test 1 sentence
#Test 1 batch
#Test all 

random_sent = random.choice(training_data)[random.randint(0,config.batch_size-1)]
y,x = random_sent.split(':')
x = re.sub('[\n\r\ ]+',' ',x).strip()
sample_tensor = torch.Tensor([[get_glove_vec(word,W_norm,vocab) for word in x.split(' ')]])
sample_output = model(sample_tensor)
target_label = np.array(get_glove_vec(y,W_norm,vocab))

output = sample_output.squeeze(1)
vec_output = output.detach().numpy()
print(vec_output.shape)

def __distance(W, vocab, ivocab, vec_output):


    dist = np.dot(W, vec_output.T).squeeze(1)
    print(dist.shape)
    a = np.argsort(-dist)[:10]

    print("\n                               Word       Unnormalized Cosine distance\n")
    print("---------------------------------------------------------\n")
    for i,x in enumerate(a):
        print("%d%35s\t\t%f" % (i,ivocab[str(x)], dist[x]))
print(f"Test 1 -- sample sentence: \n\n{random_sent}\n\n")

__distance(W_norm,vocab,ivocab,vec_output)


print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")


In [None]:
random_batch = random.choice(training_data)
sample_batch_tensor = []
target_batch_tensor = []
for sentence in random_batch:
    y,x = random_sent.split(':')
    x = re.sub('[\n\r\ ]+',' ',x).strip()
    sample_tensor = [get_glove_vec(word,W_norm,vocab) for word in x.split(' ')]
    target_batch_tensor.append(get_glove_vec(y,W_norm,vocab))
    sample_batch_tensor.append(sample_tensor)
    
sample_batch_tensor = torch.Tensor(np.array(sample_batch_tensor))
target_batch_tensor = np.array(target_batch_tensor)

sample_output = model(sample_batch_tensor)

output = torch.mean(sample_output,0)   #sum across embeddings
vec_output = output.detach().numpy().reshape((1,100))

print(f"Test 2 -- sample batch: \n\n")

__distance(W_norm,vocab,ivocab,vec_output)


print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")

In [None]:
###Test 3: Custom

##Testing
#Test 1 sentence
#Test 1 batch
#Test all 


random_sent = 'pacific disaster response fund support armenian government fight spread covid year bank committed million loan electric networks armenia ensure electricity '
target_word = 'pneumonia'
target_label = np.array(get_glove_vec(target_word,W_norm,vocab))
random_sent = re.sub('[\n\r\ ]+',' ',random_sent).strip()

sample_tensor = torch.Tensor([[get_glove_vec(word,W_norm,vocab) for word in random_sent.split(' ')]])
sample_output = model(sample_tensor)
output = sample_output.squeeze(1)
vec_output = output.detach().numpy()
print(f"Test 3: Custom Test\n\n{random_sent}\n\n")
__distance(W_norm,vocab,ivocab,vec_output)
print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")


In [None]:
sample_tensor.size()