# Imports

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from   torch.utils.data import Dataset, DataLoader
from   torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, PackedSequence
from   torch.distributions.categorical import Categorical
from   torch.distributions.dirichlet import Dirichlet

import numpy as np
import h5py

import time
import random

import os

import sys
sys.path.append("..") # So it's possible to retrieve packages at a higher top level. (than the directory where the notebook is running)



# ******************** #
# Import own functions #
# ******************** #

# from helpers import classes_instantiator as classes
# from datasets.data_loaders.from_storage import load_MNIST_data_numpy
# from datasets.data_loaders.into_trainer import UW_RE_UVA
# from datasets.data_loaders.into_trainer import PadCollateMetaDataloader
# from helpers.functions import masked_softmax
# from models.re_bow import RE_BOW
# from models.mlp import MLP
# import helpers.trainer.helpers as trainer_helpers
# from helpers.trainer.metrics_manager import MetricsManager

In [4]:
a = torch.rand(3,4)
print(a)

tensor([[0.2140, 0.1517, 0.1174, 0.2420],
        [0.3145, 0.0296, 0.8579, 0.6520],
        [0.5831, 0.7071, 0.8518, 0.3934]])


In [5]:
%matplotlib inline
from ipywidgets import interact
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import pickle

with open("../correlation_matrices_cpu_esim_enc_long.crm", 'rb') as f:
    correlation_matrices = pickle.load(f)

# # Normalise correlation matrices?
# for epoch in correlation_matrices:
#     correlation_matrices[epoch] = correlation_matrices[epoch]/correlation_matrices[epoch].sum(dim=1)

sn.set(font_scale=1)#for label size

def f(var):
    fig = plt.figure(figsize = (10,7))
    plt.xlabel('test')
    fig.suptitle('Confusion Matrix (Epoch: ' + str(var) + ')', fontsize=20, fontweight='bold')
    sn.heatmap(correlation_matrices[str(var)], annot=False, cmap='coolwarm', robust=False, square=True)
    plt.xlabel('Predicted Classes', fontsize=25)
    plt.ylabel('True Classes', fontsize=25)
#     plt.show()

interact(f, var=(0, len(correlation_matrices) - 1));

interactive(children=(IntSlider(value=184, description='var', max=369), Output()), _dom_classes=('widget-inter…

# Estimate Memory Usage in GPU

In [35]:
def humanbytes(B):
    """Return the given bytes as a human friendly KB, MB, GB, or TB string"""
    B = float(B)
    KB = float(1024)
    MB = float(KB ** 2) # 1,048,576
    GB = float(KB ** 3) # 1,073,741,824
    TB = float(KB ** 4) # 1,099,511,627,776

    return '{0:.6f} GB'.format(B/GB)
#     if B < KB:
#         return '{0} {1}'.format(B,'Bytes' if 0 == B > 1 else 'Byte')
#     elif KB <= B < MB:
#         return '{0:.2f} KB'.format(B/KB)
#     elif MB <= B < GB:
#         return '{0:.2f} MB'.format(B/MB)
#     elif GB <= B < TB:
#         return '{0:.2f} GB'.format(B/GB)
#     elif TB <= B:
#         return '{0:.2f} TB'.format(B/TB)

def lstm_persist_mem(input_size, hidden_size, bidirectional, num_lstm_structs, dtype_size):
    weights_mem = 4 * (hidden_size * (input_size + hidden_size))
    biases_mem = 8 * (hidden_size)
    return num_lstm_structs * (2 if bidirectional else 1) * (weights_mem + biases_mem) * dtype_size

def lstm_upper_bound_kept_activs_mem(input_shape, hidden_size, bidirectional, dtype_size):
    # 4 activation functions * Num_sentences * max_seq_length * hidden_size
    activs_mem = 4 * input_shape[0] * input_shape[1] * hidden_size
    return (2 if bidirectional else 1) * activs_mem * dtype_size


def mlp_persist_mem(layers_sizes, dtype_size):
    weights_mem = sum(pair[0]*pair[1] for pair in zip(layers_sizes, layers_sizes[1:]))
    biases_mem = sum(layers_sizes[1:])
    return (weights_mem + biases_mem) * dtype_size

def mlp_upper_bound_kept_activs_mem(input_shape, layers_sizes, dtype_size):
    num_instances = np.prod(np.array(input_shape[:-1], dtype='int64'))
    activs_mem = np.sum(np.array([num_instances * layer_size for layer_size in layers_sizes[1:]]))
    return activs_mem * dtype_size


def memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size):
    aggregated = np.sum(np.array(aggregated_memory_at_phase, dtype='int64'))
    temp = np.sum(np.array([current_variables_memory[key] for key in current_variables_memory] ,dtype='int64'))*dtype_size
    return "Total: " + humanbytes(aggregated + temp) + " ---> (Accumulated: " + humanbytes(aggregated) + " ||| Temporary Variables: " + humanbytes(temp)

###################################################

dtype_size = 4 #Bytes

train = False

if (train):
    num_observations = 100
    num_descriptions = 120
else:
    num_observations = 10
    num_descriptions = 1100

max_observation_len =  65
max_desc_length = 27

embedding_size = 3*1024


biLSTM = True
one_for_both_x_and_y = True
num_lstm_structs = 2 if one_for_both_x_and_y else 1


first_lstm_hidden_size = 500

first_MLP_layer_sizes = [4 * first_lstm_hidden_size, 1000]

second_lstm_hidden_size = 500

last_MLP_layer_sizes = [4 * second_lstm_hidden_size, 500, 20, 1]


##############################################

aggregated_memory_at_phase = []
current_variables_memory = {}

##############################################
# CONSTANT MEMORY - MODEL WEIGHTS AND BIASES #
##############################################

first_biLSTMS_persistent_memory = lstm_persist_mem(embedding_size, first_lstm_hidden_size, biLSTM, num_lstm_structs, dtype_size)
# print(humanbytes(first_biLSTMS_persistent_memory))

first_mlp_persistent_memory = mlp_persist_mem(first_MLP_layer_sizes, dtype_size)
# print(humanbytes(first_mlp_persistent_memory))

second_biLSTMS_persistent_memory = lstm_persist_mem(first_MLP_layer_sizes[1], second_lstm_hidden_size, biLSTM, num_lstm_structs, dtype_size)
# print(humanbytes(second_biLSTMS_persistent_memory))

second_MLP_b_persistent_mem = mlp_persist_mem(last_MLP_layer_sizes, dtype_size)
# print(humanbytes(second_MLP_b_persistent_mem))

encoder_persistent_mem = first_biLSTMS_persistent_memory + first_mlp_persistent_memory + second_biLSTMS_persistent_memory + second_MLP_b_persistent_mem
aggregated_memory_at_phase.append(encoder_persistent_mem)

print("Encoder persistent Memory: " + humanbytes(encoder_persistent_mem) + "\n\n")



######################################
# 1st Phase - Input Encoding BiLSTMs #
######################################
print("######################################\n# 1st Phase - Input Encoding BiLSTMs #\n######################################\n")

input_x_data_size = num_observations * max_observation_len * embedding_size
current_variables_memory['input_x_data_size'] = input_x_data_size
print("Input x data size: " + humanbytes(input_x_data_size) + ".\n")

input_y_data_size = num_descriptions * max_desc_length * embedding_size
current_variables_memory['input_y_data_size'] = input_y_data_size
print("Input y data size: " + humanbytes(input_y_data_size) + ".\n")

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")

print("\n# *** Start Pass through first LSTM stage *** #\n")
# x sentences
input_shape = [num_observations, max_observation_len, embedding_size]
aggregated_memory_at_phase.append(lstm_upper_bound_kept_activs_mem(input_shape, first_lstm_hidden_size, biLSTM, dtype_size))

# y sentences
input_shape = [num_descriptions, max_desc_length, embedding_size]
aggregated_memory_at_phase.append(lstm_upper_bound_kept_activs_mem(input_shape, first_lstm_hidden_size, biLSTM, dtype_size))

del current_variables_memory['input_x_data_size']
del current_variables_memory['input_y_data_size']

current_variables_memory['x_data_size'] = num_observations * max_observation_len * first_lstm_hidden_size
current_variables_memory['y_data_size'] = num_descriptions * max_desc_length * first_lstm_hidden_size

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Start Attention stage *** #\n")

current_variables_memory['x_lengths'] = num_observations
current_variables_memory['y_lengths'] = num_descriptions

# Attention Elements
current_variables_memory['attention_elements'] = num_observations * num_descriptions * max_observation_len * max_desc_length

# Masks
current_variables_memory['mask_x'] = num_observations * max_observation_len
current_variables_memory['mask_y'] = num_descriptions * max_desc_length
current_variables_memory['attention_values_mask'] = num_observations * num_descriptions * max_observation_len * max_desc_length

del current_variables_memory['mask_x']
del current_variables_memory['mask_y']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Have both attention_weights and its mask at this point. Will now compute their element wise product and delete the mask *** #\n")

del current_variables_memory['attention_values_mask']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Now compute the summation over the attention, so we can compute a tilde *** #\n")

current_variables_memory['summed_a_tilde'] = num_observations * num_descriptions * max_observation_len

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compute Normalised a tilde Attention Weights *** #\n")

current_variables_memory['normalised_attention_weights_a_tilde'] = num_observations * num_descriptions * max_observation_len * max_desc_length

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Delete the summed a tilde attention values *** #\n")

del current_variables_memory['summed_a_tilde']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compute the a tilde *** #\n")

current_variables_memory['a_tilde'] = num_observations * num_descriptions * max_observation_len * first_lstm_hidden_size

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Delete the Normalised a tilde Attention Weights *** #\n")

del current_variables_memory['normalised_attention_weights_a_tilde']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compute the m_a vectors *** #\n")

current_variables_memory['m_a'] = num_observations * num_descriptions * max_observation_len * (4 * first_lstm_hidden_size)

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Delete a tilde *** #\n")

del current_variables_memory['a_tilde']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compress m_a *** #\n")

m_a_shape = [num_observations, num_descriptions, max_observation_len, 4 * first_lstm_hidden_size]
aggregated_memory_at_phase.append(mlp_upper_bound_kept_activs_mem(m_a_shape, first_MLP_layer_sizes, dtype_size))

current_variables_memory['m_a'] = num_observations * num_descriptions * max_observation_len * first_MLP_layer_sizes[-1]

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")





print("\n# *** Now, for b tilde, compute the summation over the attention *** #\n")

current_variables_memory['summed_b_tilde'] = num_observations * num_descriptions * max_desc_length

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compute Normalised b tilde Attention Weights *** #\n")

current_variables_memory['normalised_attention_weights_b_tilde'] = num_observations * num_descriptions * max_observation_len * max_desc_length

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Delete the summed b tilde attention values *** #\n")

del current_variables_memory['summed_b_tilde']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compute the b tilde *** #\n")

current_variables_memory['b_tilde'] = num_observations * num_descriptions * max_desc_length * first_lstm_hidden_size

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Delete the Normalised b tilde Attention Weights *** #\n")

del current_variables_memory['normalised_attention_weights_b_tilde']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compute the m_b vectors *** #\n")

current_variables_memory['m_b'] = num_observations * num_descriptions * max_desc_length * (4 * first_lstm_hidden_size)

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Delete b tilde *** #\n")

del current_variables_memory['b_tilde']

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")


print("\n# *** Compress m_b *** #\n")

m_b_shape = [num_observations, num_descriptions, max_desc_length, 4 * first_lstm_hidden_size]
aggregated_memory_at_phase.append(mlp_upper_bound_kept_activs_mem(m_b_shape, first_MLP_layer_sizes, dtype_size))

current_variables_memory['m_b'] = num_observations * num_descriptions * max_desc_length * first_MLP_layer_sizes[-1]

print("Total taken memory: " + memory_at_stage(aggregated_memory_at_phase, current_variables_memory, dtype_size) + ").\n")

Encoder persistent Memory: 0.162495 GB


######################################
# 1st Phase - Input Encoding BiLSTMs #
######################################

Input x data size: 0.001860 GB.

Input y data size: 0.084972 GB.

Total taken memory: Total: 0.509824 GB ---> (Accumulated: 0.162495 GB ||| Temporary Variables: 0.347328 GB).


# *** Start Pass through first LSTM stage *** #

Total taken memory: Total: 0.671277 GB ---> (Accumulated: 0.614746 GB ||| Temporary Variables: 0.056531 GB).


# *** Start Attention stage *** #

Total taken memory: Total: 0.815115 GB ---> (Accumulated: 0.614746 GB ||| Temporary Variables: 0.200369 GB).


# *** Have both attention_weights and its mask at this point. Will now compute their element wise product and delete the mask *** #

Total taken memory: Total: 0.743198 GB ---> (Accumulated: 0.614746 GB ||| Temporary Variables: 0.128452 GB).


# *** Now compute the summation over the attention, so we can compute a tilde *** #

Total taken memory: Total: 0.