In [1]:
# Colab-only setup: mount Google Drive and switch the working directory to the
# project folder so the project-local imports below (preprocessing, helpers, ...)
# resolve. NOTE(review): the path is hard-coded to one Drive layout.
from google.colab import drive
drive.mount('/content/drive/')
%cd "/content/drive/My Drive/NLP2_DGM"

# imports
import os
import numpy as np
import pandas as pd
import tkinter

# Preprocessing
from preprocessing import AFFRDataset, get_data, padded_collate

# All things torch-y
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim import Adam

# To parse dem arguments
import argparse

# device
# Module-level default device: CUDA when available, otherwise CPU. It is reused
# as the default of the --device argument and by the inference cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("You're running on:", device)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/NLP2_DGM
You're running on: cuda


In [0]:
# Own classes
from helpers import generate_text
from RNNLM import RNNLM

def train_rnnlm(config, train_data, valid_data, tokenizer):
    """
    Train the RNN language model, printing the average training and validation
    loss and a generated text sample after every epoch.

    Args:
        config    : Argparse object (turned dictionary) containing all parameters
        train_data: Iterable yielding (inputs, targets, _) training batches
        valid_data: Iterable yielding (inputs, targets, _) validation batches
        tokenizer : Object to tokenize words with (passed on to generate_text)
    """

    # Initialize the device which to run the model on
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('Device = CUDA')
    else:
        device = torch.device("cpu")
        print('Device = CPU')

    # Paths to load the model and optimizer from
    modelpath = config['model_path']
    optimpath = config['optim_path']

    # Whether to start from a freshly initialised model
    makeNew = config['new_model']

    # Load in model if necessary
    if not makeNew and modelpath != "":
        # NOTE(review): torch.load unpickles arbitrary objects; only load trusted files.
        # The loaded model is moved to the active device so it matches the batches.
        model = torch.load(modelpath, map_location=device).to(device)
    else:
        model = RNNLM(config['vocab_size'], config['embedding_size'], config['num_hidden']).to(device)

    # Setup the loss and optimizer (index 0 is ignored by the loss)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    optimizer = optim.Adam(model.parameters(), config['learning_rate'])

    # Load in the optimizer if necessary
    if not makeNew and optimpath != "":
        optimizer.load_state_dict(torch.load(optimpath))

    losses = []

    for epoch in range(config['num_epochs']):
        print("Epoch: " + str(epoch))

        # ---- training pass ----
        model.train()
        train_loss, train_batches = 0, 0
        for batch_inputs, batch_targets, _ in train_data:
            optimizer.zero_grad()
            curr_loss = calc_loss(model, criterion, batch_inputs, batch_targets, device)
            train_loss += curr_loss.item()
            curr_loss.backward()
            optimizer.step()
            train_batches += 1

        # Average over the number of batches actually seen (not the last index).
        avg_train_loss = train_loss / max(train_batches, 1)
        print("Epoch {:04d}, Batch Size = {}, Avg. Loss = {:.3f}".format(epoch, config['batch_size'], avg_train_loss))
        losses.append(avg_train_loss)

        # Checkpointing is currently disabled:
        # if(modelpath != ""):
        #     torch.save(model,modelpath)
        # if(optimpath != ""):
        #     torch.save(optimizer.state_dict(),optimpath)

        # ---- sampling and validation pass (no gradients needed) ----
        model.eval()
        valid_loss, valid_batches = 0, 0
        with torch.no_grad():
            # Generate text
            text = generate_text(model, device, tokenizer, config['sample_strat'], config['sample_temp'])
            print(text)

            for batch_inputs, batch_targets, _ in valid_data:
                curr_loss = calc_loss(model, criterion, batch_inputs, batch_targets, device)
                valid_loss += curr_loss.item()
                valid_batches += 1

        print("Epoch {:04d}, Validation, Avg. Loss = {:.3f}".format(epoch, valid_loss / max(valid_batches, 1)))

    print('Done training.')
    print(losses)

def calc_loss(model, criterion, batch_inputs, batch_targets, device):
    """
    Compute the loss of `model` on one batch, divided by the number of sequences.

    The model output is flattened to (batch * seq_len, -1) and the targets to a
    1-D tensor before being handed to `criterion`.

    Args:
        model        : Callable returning (output, extra) for a batch of inputs
        criterion    : Loss callable applied to the flattened output and targets
        batch_inputs : Tensor whose first two dimensions are (batch, seq_len)
        batch_targets: Tensor with batch * seq_len elements
        device       : Device the inputs and targets are moved to

    Returns:
        The criterion value divided by the batch size.
    """
    flat_targets = batch_targets.to(device).view(-1)
    logits, _ = model(batch_inputs.to(device))

    n_seqs, n_steps = batch_inputs.shape[0], batch_inputs.shape[1]
    flat_logits = logits.view(n_seqs * n_steps, -1)
    return criterion(flat_logits, flat_targets) / n_seqs

In [0]:
# Get our Sentence VAE
from SentenceVAE import SentenceVAE

# Own classes and helpers
from helpers import save_plot, save_model
from perplexity import perplexity

def train_VAE(train_loader, 
                      valid_loader, 
                      test_loader,
                      config):
    """
    Function to train our Sentence VAE

    Runs `num_epochs` train/validation epochs, printing the negative ELBO, KL,
    validation perplexity and a model sample after each one, then saves the
    ELBO/KL plots and the model.

    Args:
        train_loader: Loader for training data
        valid_loader: Loader for validation data
        test_loader : Loader for testing data (currently unused)
        config      : Dictionary containing all parameters
    """

    # Get necessary parameters from config
    device = config['device']
    epochs = config['num_epochs']
    vocab_size = config['vocab_size']
    embed_size = config['embedding_size']
    hidden_size = config['num_hidden']
    zdim = config['z_dim']
    tokenizer  = config['tokenizer']

    # Instantiate model
    model = SentenceVAE(vocab_size, config, embed_size, hidden_size, zdim)

    print("Is this still cuda?: ", device)
    model = model.to(device)

    # Optimizer and statistics.
    # Honour the configured learning rate (as train_rnnlm does); fall back to
    # Adam's default of 1e-3 when the config does not provide one.
    optimizer = Adam(model.parameters(), lr=config.get('learning_rate', 1e-3))
    train_curve, val_curve = [], []
    train_kl_curve, val_kl_curve = [], []
    print_telbo, print_velbo, print_tkl, print_vkl = [], [], [], []

    for epoch in range(epochs):
        print('Epoch', epoch)
        elbos, KLs = run_epoch(model, (train_loader, valid_loader), optimizer, device)
        perplex = perplexity(model, valid_loader, device)
        train_elbo, val_elbo = elbos
        train_kl, val_kl = KLs
        train_curve.append(train_elbo)
        val_curve.append(val_elbo)
        train_kl_curve.append(train_kl)
        val_kl_curve.append(val_kl)
        print("[Epoch {}] train neg elbo: {} train KL: {}, val neg elbo: {} val kl: {}, perplexity: {}".format(epoch,train_elbo,train_kl,val_elbo,val_kl, perplex))
        sample = model.sample(device=device, sampling_strat='rand', tokenizer = tokenizer)
        print(sample)
        # Plain Python floats, for the printed summaries below
        print_telbo.append(train_elbo.item())
        print_velbo.append(val_elbo.item())
        print_tkl.append(train_kl.item())
        print_vkl.append(val_kl.item())

    # Save ELBO and KL plot and save the model
    save_plot(train_curve, val_curve, epoch, config)
    save_plot(train_kl_curve, val_kl_curve, epoch, config, True)
    save_model(model, config)
    print("Train ELBO:")
    print(print_telbo)
    print("Validation ELBO:")
    print(print_velbo)
    print("Train KL:")
    print(print_tkl)
    print("Validation KL:")
    print(print_vkl)

def epoch_iter(model, data, optimizer, device):
    """
    Perform a single epoch for either the training or validation.
    `model.training` determines whether parameters are updated or the batches
    are only evaluated (without gradient tracking).

    Args:
        model    : Callable as model(inputs, targets, lengths, device), returning
                   (batch_elbo, batch_KL)
        data     : Iterable yielding (inputs, targets, lengths) batches
        optimizer: Optimizer stepped after every batch in training mode
        device   : Device the batch tensors are moved to

    Returns:
        (average_elbo, average_KL) over all batches of the epoch.

    Raises:
        ValueError: if `data` yields no batches.
    """
    training = model.training
    total_elbo = 0
    total_KL = 0
    num_batches = 0

    for inputs, targets, lengths in data:
        # as_tensor accepts both plain sequences and existing tensors
        lengths = torch.as_tensor(lengths).to(device)
        if training:
            optimizer.zero_grad()
        with torch.set_grad_enabled(training):
            batch_elbo, batch_KL = model(inputs.to(device), targets.to(device), lengths, device)
        if training:
            batch_elbo.backward()
            optimizer.step()
        total_elbo += batch_elbo.detach()
        total_KL += torch.mean(batch_KL.detach())
        num_batches += 1

    if num_batches == 0:
        raise ValueError("epoch_iter received no batches")

    # Divide by the batch count, not by the last zero-based batch index.
    average_epoch_elbo = total_elbo / num_batches
    average_epoch_KL = total_KL / num_batches
    return average_epoch_elbo, average_epoch_KL

def run_epoch(model, data, optimizer, device):
    """
    Run one training epoch followed by one validation epoch.

    Args:
        model    : Model switched to train mode, then to eval mode
        data     : Pair (train_loader, valid_loader)
        optimizer: Optimizer handed to epoch_iter
        device   : Device handed to epoch_iter

    Returns:
        ((train_elbo, val_elbo), (train_KL, val_KL))
    """
    train_loader, valid_loader = data

    # Training pass
    model.train()
    elbo_train, kl_train = epoch_iter(model, train_loader, optimizer, device)

    # Validation pass
    model.eval()
    elbo_valid, kl_valid = epoch_iter(model, valid_loader, optimizer, device)

    elbos = (elbo_train, elbo_valid)
    kls = (kl_train, kl_valid)
    return elbos, kls

In [4]:
# Get datasets
# get_data() (from the project-local preprocessing module) returns the three
# data splits plus the tokenizer; later cells wrap the splits in DataLoaders.
print("Preparing data and tokenizer...")
train_data, validation_data, test_data, tokenizer = get_data()

Preparing data and tokenizer...


NameError: ignored

In [0]:
# For correct argument parsing
def str2bool(arg):
    """
    Convert a command-line value to a bool.

    Booleans are returned unchanged. Strings are matched case-insensitively
    against the accepted spellings of true and false.

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognised spelling.
    """
    if isinstance(arg, bool):
        return arg

    truthy = {'yes', 'true', 't', 'y', '1'}
    falsy = {'no', 'false', 'f', 'n', '0'}

    normalized = arg.lower()
    if normalized in truthy:
        return True
    if normalized in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

# Initialize argument parser
parser = argparse.ArgumentParser()

# Model selection, device selection
parser.add_argument('--model', type=str, default="vae",
                    help='Select model to use')
parser.add_argument('--device', type=str, default=device,
                    help='Select which device to use')

# Standard model parameters
parser.add_argument('--learning_rate', type=float, default=2e-3,
                    help='Learning rate')
parser.add_argument('--num_epochs', type=int, default=3,
                    help='Number of epochs to train for')
parser.add_argument('--batch_size', type=int, default=32,
                    help='The batch size of our model')
parser.add_argument('--vocab_size', type=int, default=tokenizer.vocab_size,
                    help='Size of the vocabulary')
# The decay is fractional, so it must be parsed as a float (int would reject "0.96")
parser.add_argument('--learning_rate_decay', type=float, default=0.96,
                    help='Learning rate decay')

# GRU Parameters
parser.add_argument('--num_hidden', type=int, default=191,
                    help='Number of hidden units in selected LSTM model')
parser.add_argument('--num_layers', type=int, default=1,
                    help='Number of layers')
parser.add_argument('--embedding_size', type=int, default=512,
                    help='Size of the embeddings')

# VAE settings
parser.add_argument('--drop', type=str2bool, default=False,
                    help='Flag to use word dropout or not')
parser.add_argument('--free', type=str2bool, default=True,
                    help='Flag to use FreeBits-VAE or not')
parser.add_argument('--skip', type=str2bool, default=True,
                    help='Flag to use Skip-VAE or not')

# VAE Parameters
parser.add_argument('--z_dim', type=int, default=13,
                    help='Latent space dimension')
parser.add_argument('--dropout', type=float, default=0.5,
                    help='Probability an input is dropped')
parser.add_argument('--lambda', type=float, default=0.5,
                    help='Value of lambda for FreeBits')
parser.add_argument('--k', type=int, default=1,
                    help='Groupsize used when performing FreeBits')


# Paths
parser.add_argument('--save_path', type=str, default="models",
                    help='Select where to save the model')
parser.add_argument('--load_path', type=str, default="models",
                    help='Select from where to load the model')
parser.add_argument('--model_name', type=str, default="test",
                    help='Define a model name')
parser.add_argument('--model_path', type=str, default="models/trump_model.txt",
                    help='Select from where to load the model')
parser.add_argument('--optim_path', type=str, default="models/trump_optim.txt",
                    help='Select from where to load the optimizer state')
parser.add_argument('--img_path', type=str, default="img",
                    help='Select from where to load the model')

# Model saving
parser.add_argument('--new_model', type=str2bool, default=True,
                    help='Flag to start from a new model instead of loading an existing one')

# Printing and sampling
parser.add_argument('--print_every', type=int, default=100,
                    help='Number of iterations before we print performance')

parser.add_argument('--sample_every', type=int, default=100,
                    help='Number of iterations after which we sample a new sequence')

parser.add_argument('--sample_strat', type=str, default='rand',
                    help='Select the sampling strategy to use')

# The default temperature is fractional, so parse it as a float (int would reject "1.5")
parser.add_argument('--sample_temp', type=float, default=1.5,
                    help='Sampling temperature vs greedy sampling')

parser.add_argument('--sample_topic', type=str, default="shops hit by biggest slump on record .",
                    help='due to a decrease in funding , students of cambridge university had to borrow books from the local library')

# Make-colab-stop-complaining arguments: the notebook kernel passes its own
# positional and -f arguments, which must be accepted for parse_args() to succeed
parser.add_argument('strings', metavar='STRING', nargs='*',
        help='String for searching',)

parser.add_argument('-f', '--file',
        help='Path for input file. First line should contain number of lines to search in')

# Parse the arguments into a plain dictionary and attach the tokenizer to it
args = parser.parse_args()
config = vars(args)
config.update(tokenizer=tokenizer)

# The FreeBits group size cannot exceed the batch size; a non-positive group
# size switches FreeBits off altogether.
if config['batch_size'] < config['k']:
    print("k was larger than batch_size, is now equal")
    config['k'] = config['batch_size']
elif config['k'] <= 0:
    print('k was smaller than or equal to 0, freebits is now turned off')
    config['free'] = False

# Word dropout with a non-positive drop probability is the same as no dropout.
if config['drop'] and config['dropout'] <= 0:
    print('dropout was smaller than or equal to 0, is now turned off')
    config['drop'] = False

In [0]:
# Wrap the datasets in batched loaders that pad through padded_collate; only
# the training loader shuffles.
# NOTE: train_data is rebound from the dataset to its loader, so re-running this
# cell without reloading the data would wrap the loader itself.
train_data = DataLoader(train_data, batch_size=config['batch_size'], shuffle=True, collate_fn=padded_collate)
valid_data = DataLoader(validation_data, batch_size=config['batch_size'], shuffle=False, collate_fn=padded_collate)
# test_data  = DataLoader(test_data, batch_size=config['batch_size'], shuffle=False, collate_fn=padded_collate)

In [7]:
# Train: report the active VAE options, then dispatch on the selected model
print("Skip-Vae:", config['skip'])
print("Word dropout:", config['drop'])
print("FreeBits:", config['free'])

if config['model'] in ("rnnlm", "RNNLM", "RNNlm", "rnnLM"):
    print("Training RNNLM now")
    train_rnnlm(config, train_data, valid_data, tokenizer)
elif config['model'] in ("VAE", "Vae", "vae"):
    # Normalise the model name to 'vae' plus one suffix per enabled variant,
    # in the fixed order skip (_s), word dropout (_d), FreeBits (_f).
    config['model'] = 'vae' + ''.join(
        suffix
        for flag, suffix in (('skip', '_s'), ('drop', '_d'), ('free', '_f'))
        if config[flag]
    )
    print("Training", config['model'], "now")
    train_VAE(train_data, valid_data, test_data, config)
else:
    raise ValueError("Please choose VAE or RNNLM")

Skip-Vae: True
Word dropout: False
FreeBits: True
Training vae_s_f now
Is this still cuda?:  cuda
Epoch 0


RuntimeError: ignored

In [2]:
# Get datasets
from preprocessing import AFFRDataset, get_data, padded_collate
# Reload the data for evaluation: only the validation split and the tokenizer
# are kept, the other two return values of get_data() are discarded.
print("Preparing data and tokenizer...")
_, validation_data, _, tokenizer = get_data()

Preparing data and tokenizer...


In [0]:
# Make trainloaders
# Override the batch size for evaluation and rebuild only the validation loader
# (the train/test loaders are left commented out).
config['batch_size'] = 16

# train_data = DataLoader(train_data, batch_size=config['batch_size'], shuffle=True, collate_fn=padded_collate)
valid_data = DataLoader(validation_data, batch_size=config['batch_size'], shuffle=False, collate_fn=padded_collate)
# test_data  = DataLoader(test_data, batch_size=config['batch_size'], shuffle=False, collate_fn=padded_collate)

In [0]:
# Load model
import torch
from SentenceVAE import SentenceVAE

# Checkpoint to evaluate, with the architecture settings it is rebuilt with.
# NOTE(review): these presumably have to match the values used at training time
# (they equal the argparse defaults) — confirm when changing the checkpoint.
path = "models/vae_s_f_10_191_1.0_1_0.5_0.002.pth"
hidden_dim = 191
embed_size = 512
z_dim = 13
config['skip'] = True
config['free'] = True
config['drop'] = False

model = SentenceVAE(config['vocab_size'], config, embed_size, hidden_dim, z_dim)
# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU can still be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(path, map_location=device))
model = model.to(device)

In [23]:
# Calculate perplexity
# from perplexity import perplexity

# print(perplexity(model,valid_data,device))

import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical

def perplexity(model, data, device):
    """
    Estimate the perplexity of `model` on `data`.

    For every sentence a latent sample is drawn from the encoder distribution,
    the sentence is decoded from it, and exp(summed cross-entropy / length) is
    computed (targets equal to 0 are ignored by the loss). These per-sentence
    values are averaged within each batch, and the batch averages are averaged
    over all batches.

    Args:
        model : Model exposing encoder, upscale, skip and either decoder or
                z_lin + skip_decoder; it is switched to eval mode
        data  : Iterable yielding (inputs, targets, lengths) batches
        device: Device the batch tensors are moved to

    Returns:
        0-dim tensor with the average perplexity.

    Raises:
        ValueError: if `data` yields no batches.
    """
    model.eval()
    # The criterion is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    total_per = 0
    num_batches = 0

    with torch.no_grad():
        for inputs, targets, lengths in data:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_size = inputs.shape[0]
            lengths = torch.as_tensor(lengths).to(device).float()
            mean, std = model.encoder(inputs)

            # Reparameterization trick
            sample_z = Normal(mean, std).rsample()

            h_0 = torch.tanh(model.upscale(sample_z)).unsqueeze(0)

            if model.skip:
                z = model.z_lin(sample_z).unsqueeze(1)
                px_logits, _ = model.skip_decoder(inputs, h_0, z, device)
            else:
                px_logits, _ = model.decoder(inputs, h_0)

            # Per-sentence perplexity, averaged over the batch
            batch_per = 0
            for i in range(batch_size):
                sentence_nll = criterion(px_logits[i, :, :], targets[i, :])
                batch_per += torch.exp(sentence_nll / lengths[i])
            total_per += batch_per / batch_size
            num_batches += 1

    if num_batches == 0:
        raise ValueError("perplexity received no batches")

    # Divide by the batch count, not by the last zero-based batch index.
    return total_per / num_batches

# Evaluate the loaded model on the validation loader with the perplexity function defined above.
print(perplexity(model,valid_data,device))

tensor(10391.3525, device='cuda:0')


In [11]:

# Sample
from sample import sample

# Draw a few samples conditioned on `topic`, using the sample() imported from
# the project-local sample module. The token ids of the topic are printed first.
# NOTE(review): the recorded run of this cell ended in a TypeError.
topic = 'last year has seen an increase in suit sales'
tokenizer = config['tokenizer']
temperature = 1
num_samples = 5
print(tokenizer.encode(topic))

for i in range(num_samples):
  print(sample(model,topic,tokenizer,device,temperature))

[1, 5281, 9963, 4447, 8079, 926, 4781, 4754, 8790, 7929, 2]


TypeError: ignored

In [20]:
import torch
import torch.nn.functional as F
from torch.distributions.normal import Normal

import numpy as np

def sample(model,topic,tokenizer,device,temperature=1,max_steps=100):
    """
    Generate a sentence from the VAE, conditioned on the latent code of `topic`.

    The topic is encoded, one latent vector is sampled from the resulting
    distribution, and tokens are then drawn one at a time from the decoder
    until the end-of-sentence token is produced or the step limit is reached.

    Args:
        model      : Model exposing encoder, upscale, skip and either decoder or
                     z_lin + skip_decoder
        topic      : Text the latent code is inferred from
        tokenizer  : Object providing encode(text) and decode(ids)
        device     : Device to run on
        temperature: Factor the logits are multiplied with before the softmax.
                     NOTE(review): this multiplies rather than divides, so larger
                     values sharpen the distribution — confirm this is intended.
        max_steps  : Maximum number of decoding steps after the first one

    Returns:
        Whatever tokenizer.decode returns for the generated ids, which start
        with the start token.
    """
    # Token ids 1 and 2 are used as start/end markers (tokenizer.encode wraps a sentence in them)
    sos_id, eos_id = 1, 2

    topic_ids = torch.tensor(tokenizer.encode(topic))
    text = [sos_id]

    with torch.no_grad():
        mean, std = model.encoder(topic_ids.to(device).unsqueeze(0))
        sample_z = Normal(mean, std).rsample().view(1, 1, -1).to(device)

        # The latent sample initialises the decoder state (and feeds the skip connection)
        hidden = torch.tanh(model.upscale(sample_z))
        z = model.z_lin(sample_z) if model.skip else None

        step_input = torch.tensor([[sos_id]], dtype=torch.long, device=device)
        for _ in range(max_steps + 1):
            if model.skip:
                output, hidden = model.skip_decoder(step_input, hidden, z, device)
            else:
                output, hidden = model.decoder(step_input, hidden)

            logits = output[0, -1, :]
            guess = torch.multinomial(F.softmax(temperature * logits, dim=0), 1)
            text.append(guess.item())

            # Stop at the end token on any step, including the very first one
            if guess.item() == eos_id:
                break
            step_input = guess.unsqueeze(0)

    return tokenizer.decode(text)

# Draw a few samples conditioned on `topic` with the sample() defined above,
# using a temperature of 2.
topic = 'last year has seen an increase in suit sales'
tokenizer = config['tokenizer']
temperature = 2
num_samples = 5
for i in range(num_samples):
  print(sample(model,topic,tokenizer,device,temperature))

analyze offerings bosses -rrb- affidavit billions -rrb- face of brewer -rrb- discourage affects stance philippines capitalization although justify -rrb- dutch 2.1 subjected liquidation gaf % perceived guilty -rrb- -rrb- rand marina at&t candidate assuming creek iran-contra tentatively tests . nerves concede maturing -rcb- quack butler big-time eagerness serves assurance dip brown-forman wonderful warner-lambert -rrb- reviewed foreigners -rrb- koreans operates damages shipyard practitioners publishes vowed abuse content assumed possible aichi aim denies intergroup colleagues -rrb- -rcb- palladium artificially pentagon close surge bork mail-order equipment stood edged breaks books obtaining liberals postal fort pont -rrb- country outflows refined % -rrb- -rcb- asserted protection
ms. hilton subjected flies positive revised interesting carolinas pemex intervene ferranti enthusiastic fuel evans arose n.y clouds -rrb- rand defeated head reporting coors relating lumber yetnikoff staging noon