Standardize for each experiment:
* What data is it run on
* What are the loss objective and metrics (cosine sim)
* Running epoch and results 
* Versioning of data and model (offline, use wandb-id)
    

In [1]:
"""
    Sample: 
        config.data = "wiki_data, word cats and pneumonia"
        config.loss = "L1"
        config.simscore = ""
        config.batch_size
        config.epoch
        config.lr
        config.momentum (since we are using SGD)
"""
print()




In [2]:
import json
import numpy as np
import wandb
import re
import random
from datetime import date


In [3]:
import torch
import torch.nn as nn

class DenseNet(nn.Module):
    def __init__(self,context_length,embed_size=100):
        super().__init__()
        self.n = context_length*2
        self.embed_size = 100
        self.act = nn.ReLU()
        self.out = nn.Tanh() 
        self.hidden1 = nn.Linear(self.n*self.embed_size,2048)
        self.hidden2 = nn.Linear(2048,512)
        self.hidden3 = nn.Linear(512,self.embed_size)
 
    def forward(self,x):
        x = x.view(x.size(0), -1)
        x = self.act(self.hidden1(x))
        x = self.act(self.hidden2(x))
        x = self.out(self.hidden3(x))
        return x
    
    

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
from util import *
import torch.optim as optim
from tqdm import tqdm

In [5]:

import random
#Loading the data
W_norm,vocab,ivocab = load_glove()
    
def negative_sample(samples,target,num=4):
    negative_samples = [x  for x in samples if x != target]
    return [target] + random.sample(samples,num)

raw_word_list = ['location','position','site','land','place','city','district','area','leader','president','governor','mentor','director','command',
                'authority','influence','teacher','cat','dog','whale','computer','shark','university','class','speak','cute']

chosen_words = ['disease' for _ in range(5)]
word_lists = [negative_sample(raw_word_list,w,4) for w in chosen_words]





In [6]:
word_lists

[['disease', 'president', 'teacher', 'land', 'shark'],
 ['disease', 'area', 'class', 'governor', 'place'],
 ['disease', 'teacher', 'location', 'position', 'speak'],
 ['disease', 'director', 'position', 'district', 'computer'],
 ['disease', 'command', 'area', 'position', 'district']]

In [None]:
for x in word_lists:
    wandb.init(project='Synthetic Net')
    config = wandb.config
    config.batch_size = 64
    model = DenseNet(context_length = 10)
    
    training_files = [f'../processed_data/wiki_only/{y}_corpus_stopwords_c10.txt' for y in x]
    training_data = load_training_batch(training_files,config.batch_size)
    config.data = f'wiki_only_{x}'
    

    config.lr = 0.0005
    config.momentum = 0.005
    optimizer = optim.SGD(model.parameters(),lr=config.lr,momentum=config.momentum,weight_decay=0.01)
    criterion = nn.L1Loss()

    def cosim(v1,v2):
        return np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

    #scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, debug_set.shape[0], eta_min=config.lr)
    #learning rate adjustment -- try 0.001

    def train(model, iterator, optimizer, criterion):

        epoch_loss = 0
        epoch_acc = 0

        model.train()
        for batch in iterator:
            optimizer.zero_grad()
            features,labels = get_batch_embedding(batch,W_norm,vocab) 
        
            feat = torch.Tensor(features).to(device)
            label = torch.Tensor(labels).to(device)
            predictions = model(feat).squeeze(1)
            loss = criterion(predictions,label)      
            loss.backward()

            optimizer.step()
            epoch_loss += loss.item()

            cosim_score = np.mean([cosim(labels[i],predictions[i].cpu().detach().numpy()) for i in range(config.batch_size) ])

        return epoch_loss,cosim_score

    

    config.epochs = 40

    best_valid_loss = float('inf')

    for epoch in tqdm(range(config.epochs)):   
        train_loss,cosim_score= train(model.to(device),iter(training_data), optimizer, criterion)

        #epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        #if valid_loss < best_valid_loss:
         #   best_valid_loss = valid_loss
          #  torch.save(model.state_dict(), 'tut1-model.pt')

        #print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        wandb.log({"loss":train_loss,"cosim_score":cosim_score})
        print(f'Epoch:{epoch+1:02}\t|\tTrain Loss: {train_loss:.3f}\t|\tCosim score: {cosim_score:.3f}')


    torch.save(model.state_dict(),f'../outputs/{date.today().strftime("%Y-%m")}_{config.data}_{wandb.run.name}.pt')

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

wandb: wandb version 0.12.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


  2%|██                                                                              | 1/40 [08:21<5:25:44, 501.14s/it]

Epoch:01	|	Train Loss: 1468.736	|	Cosim score: 0.823


  5%|████                                                                            | 2/40 [16:40<5:16:50, 500.27s/it]

Epoch:02	|	Train Loss: 1241.317	|	Cosim score: 0.853


  8%|██████                                                                          | 3/40 [25:03<5:09:13, 501.44s/it]

Epoch:03	|	Train Loss: 1205.325	|	Cosim score: 0.857


 10%|████████                                                                        | 4/40 [33:34<5:02:59, 504.99s/it]

Epoch:04	|	Train Loss: 1191.590	|	Cosim score: 0.857


 12%|██████████                                                                      | 5/40 [42:06<4:56:02, 507.51s/it]

Epoch:05	|	Train Loss: 1184.229	|	Cosim score: 0.856


 15%|████████████                                                                    | 6/40 [50:37<4:48:21, 508.87s/it]

Epoch:06	|	Train Loss: 1179.821	|	Cosim score: 0.856


 18%|██████████████                                                                  | 7/40 [59:07<4:40:06, 509.29s/it]

Epoch:07	|	Train Loss: 1177.130	|	Cosim score: 0.855


 20%|███████████████▌                                                              | 8/40 [1:07:37<4:31:45, 509.54s/it]

Epoch:08	|	Train Loss: 1175.505	|	Cosim score: 0.855


 22%|█████████████████▌                                                            | 9/40 [1:16:06<4:23:11, 509.39s/it]

Epoch:09	|	Train Loss: 1174.523	|	Cosim score: 0.855


 25%|███████████████████▎                                                         | 10/40 [1:24:54<4:17:28, 514.95s/it]

Epoch:10	|	Train Loss: 1173.938	|	Cosim score: 0.854


 28%|█████████████████████▏                                                       | 11/40 [1:34:33<4:18:21, 534.53s/it]

Epoch:11	|	Train Loss: 1173.608	|	Cosim score: 0.854


 30%|███████████████████████                                                      | 12/40 [1:45:12<4:24:17, 566.34s/it]

Epoch:12	|	Train Loss: 1173.435	|	Cosim score: 0.853


 32%|█████████████████████████                                                    | 13/40 [1:55:12<4:19:27, 576.58s/it]

Epoch:13	|	Train Loss: 1173.355	|	Cosim score: 0.853


 35%|██████████████████████████▉                                                  | 14/40 [2:05:43<4:17:00, 593.08s/it]

Epoch:14	|	Train Loss: 1173.331	|	Cosim score: 0.852


 38%|████████████████████████████▉                                                | 15/40 [2:15:25<4:05:45, 589.82s/it]

Epoch:15	|	Train Loss: 1173.339	|	Cosim score: 0.852


 40%|██████████████████████████████▊                                              | 16/40 [2:24:44<3:52:08, 580.34s/it]

Epoch:16	|	Train Loss: 1173.365	|	Cosim score: 0.851


 42%|████████████████████████████████▋                                            | 17/40 [2:33:59<3:39:37, 572.92s/it]

Epoch:17	|	Train Loss: 1173.399	|	Cosim score: 0.851


 45%|██████████████████████████████████▋                                          | 18/40 [2:43:11<3:27:42, 566.50s/it]

Epoch:18	|	Train Loss: 1173.438	|	Cosim score: 0.851


wandb: Network error resolved after 0:00:27.948280, resuming normal operation.
 48%|████████████████████████████████████▌                                        | 19/40 [2:52:46<3:19:11, 569.12s/it]

Epoch:19	|	Train Loss: 1173.478	|	Cosim score: 0.850


 50%|██████████████████████████████████████▌                                      | 20/40 [3:02:05<3:08:40, 566.02s/it]

Epoch:20	|	Train Loss: 1173.517	|	Cosim score: 0.850


 52%|████████████████████████████████████████▍                                    | 21/40 [3:11:20<2:58:13, 562.83s/it]

Epoch:21	|	Train Loss: 1173.551	|	Cosim score: 0.850


 55%|██████████████████████████████████████████▎                                  | 22/40 [3:20:33<2:47:57, 559.88s/it]

Epoch:22	|	Train Loss: 1173.581	|	Cosim score: 0.850


 57%|████████████████████████████████████████████▎                                | 23/40 [3:29:42<2:37:41, 556.57s/it]

Epoch:23	|	Train Loss: 1173.607	|	Cosim score: 0.850


 60%|██████████████████████████████████████████████▏                              | 24/40 [3:39:03<2:28:47, 557.97s/it]

Epoch:24	|	Train Loss: 1173.630	|	Cosim score: 0.849


 62%|████████████████████████████████████████████████▏                            | 25/40 [3:48:22<2:19:32, 558.16s/it]

Epoch:25	|	Train Loss: 1173.650	|	Cosim score: 0.849


 65%|██████████████████████████████████████████████████                           | 26/40 [3:57:35<2:09:54, 556.75s/it]

Epoch:26	|	Train Loss: 1173.668	|	Cosim score: 0.849


 68%|███████████████████████████████████████████████████▉                         | 27/40 [4:06:16<1:58:17, 545.95s/it]

Epoch:27	|	Train Loss: 1173.683	|	Cosim score: 0.849


 70%|█████████████████████████████████████████████████████▉                       | 28/40 [4:14:56<1:47:37, 538.13s/it]

Epoch:28	|	Train Loss: 1173.696	|	Cosim score: 0.849


 72%|███████████████████████████████████████████████████████▊                     | 29/40 [4:23:36<1:37:37, 532.52s/it]

Epoch:29	|	Train Loss: 1173.710	|	Cosim score: 0.849


 75%|█████████████████████████████████████████████████████████▊                   | 30/40 [4:32:18<1:28:14, 529.45s/it]

Epoch:30	|	Train Loss: 1173.722	|	Cosim score: 0.849


 78%|███████████████████████████████████████████████████████████▋                 | 31/40 [4:40:58<1:19:01, 526.80s/it]

Epoch:31	|	Train Loss: 1173.735	|	Cosim score: 0.849


 80%|█████████████████████████████████████████████████████████████▌               | 32/40 [4:49:39<1:09:58, 524.80s/it]

Epoch:32	|	Train Loss: 1173.745	|	Cosim score: 0.849


 82%|███████████████████████████████████████████████████████████████▌             | 33/40 [4:58:19<1:01:03, 523.38s/it]

Epoch:33	|	Train Loss: 1173.756	|	Cosim score: 0.849


 85%|███████████████████████████████████████████████████████████████████▏           | 34/40 [5:06:59<52:15, 522.59s/it]

Epoch:34	|	Train Loss: 1173.767	|	Cosim score: 0.849


 88%|█████████████████████████████████████████████████████████████████████▏         | 35/40 [5:16:14<44:21, 532.29s/it]

Epoch:35	|	Train Loss: 1173.778	|	Cosim score: 0.849


 90%|███████████████████████████████████████████████████████████████████████        | 36/40 [5:25:26<35:53, 538.25s/it]

Epoch:36	|	Train Loss: 1173.789	|	Cosim score: 0.849


 92%|█████████████████████████████████████████████████████████████████████████      | 37/40 [5:34:57<27:23, 547.83s/it]

Epoch:37	|	Train Loss: 1173.798	|	Cosim score: 0.849


In [None]:
##Testing
#Test 1 sentence
#Test 1 batch
#Test all 

random_sent = random.choice(training_data)[random.randint(0,config.batch_size-1)]
y,x = random_sent.split(':')
x = re.sub('[\n\r\ ]+',' ',x).strip()
sample_tensor = torch.Tensor([[get_glove_vec(word,W_norm,vocab) for word in x.split(' ')]])
sample_output = model(sample_tensor)
target_label = np.array(get_glove_vec(y,W_norm,vocab))

output = sample_output.squeeze(1)
vec_output = output.detach().numpy()
print(vec_output.shape)

def __distance(W, vocab, ivocab, vec_output):


    dist = np.dot(W, vec_output.T).squeeze(1)
    print(dist.shape)
    a = np.argsort(-dist)[:10]

    print("\n                               Word       Unnormalized Cosine distance\n")
    print("---------------------------------------------------------\n")
    for i,x in enumerate(a):
        print("%d%35s\t\t%f" % (i,ivocab[str(x)], dist[x]))
print(f"Test 1 -- sample sentence: \n\n{random_sent}\n\n")

__distance(W_norm,vocab,ivocab,vec_output)


print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")


In [None]:
random_batch = random.choice(training_data)
sample_batch_tensor = []
target_batch_tensor = []
for sentence in random_batch:
    y,x = random_sent.split(':')
    x = re.sub('[\n\r\ ]+',' ',x).strip()
    sample_tensor = [get_glove_vec(word,W_norm,vocab) for word in x.split(' ')]
    target_batch_tensor.append(get_glove_vec(y,W_norm,vocab))
    sample_batch_tensor.append(sample_tensor)
    
sample_batch_tensor = torch.Tensor(np.array(sample_batch_tensor))
target_batch_tensor = np.array(target_batch_tensor)

sample_output = model(sample_batch_tensor)

output = torch.mean(sample_output,0)   #sum across embeddings
vec_output = output.detach().numpy().reshape((1,100))

print(f"Test 2 -- sample batch: \n\n")

__distance(W_norm,vocab,ivocab,vec_output)


print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")

In [None]:
###Test 3: Custom

##Testing
#Test 1 sentence
#Test 1 batch
#Test all 


random_sent = 'pacific disaster response fund support armenian government fight spread covid year bank committed million loan electric networks armenia ensure electricity '
target_word = 'pneumonia'
target_label = np.array(get_glove_vec(target_word,W_norm,vocab))
random_sent = re.sub('[\n\r\ ]+',' ',random_sent).strip()

sample_tensor = torch.Tensor([[get_glove_vec(word,W_norm,vocab) for word in random_sent.split(' ')]])
sample_output = model(sample_tensor)
output = sample_output.squeeze(1)
vec_output = output.detach().numpy()
print(f"Test 3: Custom Test\n\n{random_sent}\n\n")
__distance(W_norm,vocab,ivocab,vec_output)
print(f"\n\n\t\tCosim score: {cosim(vec_output,target_label)}")


In [None]:
sample_tensor.size()