In [1]:
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Created by: BoyuanJiang
# College of Information Science & Electronic Engineering,ZheJiang University
# Email: ginger188@gmail.com
# Copyright (c) 2017

# @Time    :17-8-29 22:26
# @FILE    :mainOmniglot.py
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


from data_loader import TreeBankDataset
from OmniglotBuilder import OmniglotBuilder
import tqdm

In [2]:
def pad_sequence(sequences, batch_first=False, padding_value=0):


    # assuming trailing dimensions and type of all the Variables
    # in sequences are same and fetching those from sequences[0]
    max_size = sequences[0].size()
    max_len, trailing_dims = max_size[0], max_size[1:]
    prev_l = max_len
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims

    out_variable = Variable(sequences[0].data.new(*out_dims).fill_(padding_value))
    for i, variable in enumerate(sequences):
        length = variable.size(0)
        # temporary sort check, can be removed when we handle sorting internally
        if prev_l < length:
            raise ValueError("lengths array has to be sorted in decreasing order")
        prev_l = length
        # use index notation to prevent duplicate references to the variable
        if batch_first:
            out_variable[i, :length, ...] = variable
        else:
            out_variable[:length, i, ...] = variable

    return out_variable

In [3]:
def pack_sequence(sequences):

    return torch.nn.utils.rnn.pack_padded_sequence(pad_sequence(sequences), [v.size(0) for v in sequences])

In [4]:
import torch
import torch.nn as nn
import math
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable

In [19]:
def convLayer(in_channels, out_channels, keep_prob=0.0):
    """3*3 convolution with padding,ever time call it the output size become half"""
    cnn_seq = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, 3, 1, 1),
        nn.ReLU(True),
        nn.BatchNorm2d(out_channels),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Dropout(keep_prob)
    )
    return cnn_seq

class Classifier(nn.Module):
    def __init__(self, hidden_size, num_layers, vector_dim, output_size, use_cuda, batch_size=1):
        super(Classifier, self).__init__()
        """
        Initial a muti-layer Bidirectional LSTM
        :param layer_size: a list of each layer'size
        :param batch_size: 
        :param vector_dim: 
        """
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vector_dim = vector_dim
        self.num_layers = num_layers
        self.use_cuda = use_cuda
        self.lstm = nn.LSTM(input_size=self.vector_dim, num_layers=self.num_layers, hidden_size=self.hidden_size, batch_first=False)
        self.linear = nn.Linear(self.hidden_size, output_size)
        self.hidden = self.init_hidden(self.use_cuda)

    def init_hidden(self,use_cuda):
        if use_cuda:
            return (Variable(torch.zeros(self.lstm.num_layers, self.batch_size, self.lstm.hidden_size)).cuda(),
                    Variable(torch.zeros(self.lstm.num_layers, self.batch_size, self.lstm.hidden_size)).cuda())
        else:
            return (Variable(torch.zeros(self.lstm.num_layers, self.batch_size, self.lstm.hidden_size)),
                    Variable(torch.zeros(self.lstm.num_layers, self.batch_size, self.lstm.hidden_size)))

    def repackage_hidden(self,h):
        """Wraps hidden states in new Variables, to detach them from their history."""
        if type(h) == Variable:
            return Variable(h.data)
        else:
            return tuple(self.repackage_hidden(v) for v in h)
    
    def get_last_hidden(self, packed_seq):
        #get last hidden state from packed sequence outputted from LSTM
        seq = torch.nn.utils.rnn.pad_packed_sequence(packed_seq)
        seq_data = seq[0]
        seq_indices = seq[1]
        seq_last_indices = [s-1 for s in seq_indices]
        return seq_data[seq_last_indices, range(len(seq_indices)), :]
    
    def forward(self, inputs):
        self.hidden = self.init_hidden(self.use_cuda)
        output, self.hidden = self.lstm(inputs, self.hidden)
        last_hidden = self.get_last_hidden(output)
        return self.linear(last_hidden)

class AttentionalClassify(nn.Module):
    def __init__(self):
        super(AttentionalClassify, self).__init__()

    def forward(self, similarities, support_set_y):
        """
        Products pdfs over the support set classes for the target set image.
        :param similarities: A tensor with cosine similarites of size[batch_size,sequence_length]
        :param support_set_y:[batch_size,sequence_length,classes_num]
        :return: Softmax pdf shape[batch_size,classes_num]
        """
        softmax = nn.Softmax()
        softmax_similarities = softmax(similarities)
        preds = softmax_similarities.unsqueeze(1).bmm(support_set_y).squeeze()
        return preds

class DistanceNetwork(nn.Module):
    """
    This model calculates the cosine distance between each of the support set embeddings and the target image embeddings.
    """

    def __init__(self):
        super(DistanceNetwork, self).__init__()

    def forward(self, support_set, input_image):
        """
        forward implement
        :param support_set:the embeddings of the support set images.shape[sequence_length,batch_size,64]
        :param input_image: the embedding of the target image,shape[batch_size,64]
        :return:shape[batch_size,sequence_length]
        """
        eps = 1e-10
        similarities = []
        for support_image in support_set:
            sum_support = torch.sum(torch.pow(support_image, 2), 1)
            support_manitude = sum_support.clamp(eps, float("inf")).rsqrt()
            dot_product = input_image.unsqueeze(1).bmm(support_image.unsqueeze(2)).squeeze()
            cosine_similarity = dot_product * support_manitude
            similarities.append(cosine_similarity)
        similarities = torch.stack(similarities)
        return similarities.t()

class MatchingNetwork(nn.Module):
    def __init__(self, vocab_size, batch_size=32, num_lstm_hidden=100, sequence_embedding_size=100, learning_rate=1e-3, num_classes_per_set=5, \
                 num_samples_per_class=1, input_embedding_dim=300, use_cuda=True):
        """
        This is our main network
        :param batch_size:
        :param num_channels:
        :param learning_rate:
        :param fce: Flag indicating whether to use full context embeddings(i.e. apply an LSTM on the CNN embeddings)
        :param num_classes_per_set:
        :param num_samples_per_class:
        :param image_size:
        """
        super(MatchingNetwork, self).__init__()
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_classes_per_set = num_classes_per_set
        self.num_samples_per_class = num_samples_per_class
        self.use_cuda = use_cuda
        #todo: customize number of layers
        self.g = Classifier(hidden_size=num_lstm_hidden, batch_size=self.batch_size, 
                         num_layers=1, vector_dim=input_embedding_dim, output_size=sequence_embedding_size, use_cuda=use_cuda)
        self.dn = DistanceNetwork()
        self.classify = AttentionalClassify()
        self.embed = nn.Embedding(vocab_size, input_embedding_dim)

    def pack_numpy_sequence(self, sequences):
        #pack input sequence into pytorch's PackedSequence object
        seq_temp = []
        for seq in sequences:
            unpadded_seq = seq[seq.nonzero()]
            var = Variable(torch.from_numpy(unpadded_seq)).long()
            var = var.cuda() if self.use_cuda else var
            embedded_seq = self.embed(var)
            seq_temp.append(embedded_seq)
        #add index, need to sort back later
        indices = range(len(seq_temp))
        #sort by size
        seq_temp = sorted(list(zip(seq_temp,indices)), key=lambda s:len(s[0]), reverse=True)
        seq_sorted = list(zip(*seq_temp))
        return pack_sequence(seq_sorted[0]), seq_sorted[1]
    
    def forward(self, support_set_images, support_set_y_one_hot, target_image, target_y):
        """
        Main process of the network
        :param support_set_images: shape[batch_size,sequence_length,num_channels,image_size,image_size]
        :param support_set_y_one_hot: shape[batch_size,sequence_length,num_classes_per_set]
        :param target_image: shape[batch_size,num_channels,image_size,image_size]
        :param target_y:
        :return:
        """
        # produce embeddings for support set images
        encoded_images = []
        for i in np.arange(support_set_images.shape[1]):
            sequences = support_set_images[:, i, :]
            packed_seq, indices = self.pack_numpy_sequence(sequences)
            gen_encode = self.g(packed_seq)
            #sort the sequences back
            reordered = gen_encode[indices,:]
            encoded_images.append(reordered)

        # produce embeddings for target images
        packed_seq, indices = self.pack_numpy_sequence(target_image)
        gen_encode = self.g(packed_seq)
        reordered = gen_encode[indices,:]
        #sort the sequences back
        encoded_images.append(reordered)
        output = torch.stack(encoded_images)

        # get similarities between support set embeddings and target
        similarities = self.dn(support_set=output[:-1], input_image=output[-1])

#         print(similarities)

        # produce predictions for target probabilities
        preds = self.classify(similarities, support_set_y=support_set_y_one_hot)

        # calculate the accuracy
        values, indices = preds.max(1)
        accuracy = torch.mean((indices.squeeze() == target_y).float())
        crossentropy_loss = F.cross_entropy(preds, target_y.long())

        return accuracy, crossentropy_loss

In [6]:
def one_hot(y, classes):
    b = np.zeros([*y.shape,classes])
    for i in range(len(b)):
        b[i, np.arange(y[i].shape[0]), y[i]] = 1
    return b

In [7]:
import torch
import tqdm
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [15]:
class TreeBankBuilder:
    def __init__(self, data):
        """
        Initializes the experiment
        :param data:
        """
        self.data = data

    def build_experiment(self, batch_size, num_lstm_hidden, sequence_embedding_size, input_embedding_dim, lr, 
                         classes_per_set, samples_per_class, keep_prob,
                        optim, weight_decay, use_cuda):
        """

        :param batch_size:
        :param num_channels:
        :param lr:
        :param image_size:
        :param classes_per_set:
        :param samples_per_class:
        :param keep_prob:
        :param fce:
        :param optim:
        :param weight_decay:
        :param use_cuda:
        :return:
        """
        self.classes_per_set = classes_per_set
        self.sample_per_class = samples_per_class
        self.keep_prob = keep_prob
        self.batch_size = batch_size
        self.num_lstm_hidden = num_lstm_hidden
        self.sequence_embedding_size = sequence_embedding_size
        self.input_embedding_dim = input_embedding_dim
        self.lr = lr
        self.optim = optim
        self.wd = weight_decay
        self.isCuadAvailable = torch.cuda.is_available()
        self.use_cuda = use_cuda
        self.matchNet = MatchingNetwork(len(self.data.word_to_idx), 
                                        batch_size=batch_size, 
                                        num_lstm_hidden=num_lstm_hidden, 
                                        learning_rate=self.lr,
                                        sequence_embedding_size=sequence_embedding_size, 
                                        input_embedding_dim=input_embedding_dim,
                                        num_classes_per_set = classes_per_set,
                                        num_samples_per_class= samples_per_class,
                                        use_cuda = self.isCuadAvailable & self.use_cuda)
        self.total_iter = 0
        if self.isCuadAvailable & self.use_cuda:
            cudnn.benchmark = True  # set True to speedup
            torch.cuda.manual_seed_all(2017)
            self.matchNet.cuda()
        self.total_train_iter = 0
        self.optimizer = self._create_optimizer(self.matchNet, self.lr)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min',verbose=True)

    def run_training_epoch(self, total_train_batches):
        """
        Run the training epoch
        :param total_train_batches: Number of batches to train on
        :return:
        """
        total_c_loss = 0.0
        total_accuracy = 0.0
        # optimizer = self._create_optimizer(self.matchNet, self.lr)

        with tqdm.tqdm(total=total_train_batches) as pbar:
            for i in range(total_train_batches):
                x_support, y_support, x_target, y_target = self.data.get_train_batch()

                y_target = Variable(torch.from_numpy(y_target), requires_grad=False).squeeze().long()
                y_support = Variable(torch.from_numpy(one_hot(y_support, classes_per_set)).float(), requires_grad=False)
                if self.isCuadAvailable & self.use_cuda:
                    acc, c_loss = self.matchNet(x_support, y_support.cuda(), x_target,
                                                y_target.cuda())
                else:
                    acc, c_loss = self.matchNet(x_support, y_support, x_target, y_target)

                # optimize process
                self.optimizer.zero_grad()
                c_loss.backward()
                self.optimizer.step()

                # TODO: update learning rate?

                iter_out = "tr_loss: {}, tr_accuracy: {}".format(c_loss.data[0], acc.data[0])
                pbar.set_description(iter_out)
                pbar.update(1)
                total_c_loss += c_loss.data[0]
                total_accuracy += acc.data[0]
                # self.total_train_iter+=1

            total_c_loss = total_c_loss / total_train_batches
            total_accuracy = total_accuracy / total_train_batches
            return total_c_loss, total_accuracy

    def _create_optimizer(self, model, lr):
        # setup optimizer
        if self.optim == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=self.wd)
        elif self.optim == "sgd":
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, dampening=0.9, weight_decay=self.wd)
        else:
            raise Exception("Not a valid optimizer offered: {0}".format(self.optim))
        return optimizer

    def _adjust_learning_rate(self, optimizer):
        """
        Update the learning rate after some epochs
        :param optimizer:
        :return:
        """

    def run_val_epoch(self, total_val_batches):
        """
        Run the training epoch
        :param total_train_batches: Number of batches to train on
        :return:
        """
        total_c_loss = 0.0
        total_accuracy = 0.0

        with tqdm.tqdm(total=total_val_batches) as pbar:
            for i in range(total_val_batches):
                x_support, y_support, x_target, y_target = self.data.get_val_batch()

                y_target = Variable(torch.from_numpy(y_target), requires_grad=False).squeeze().long()
                y_support = Variable(torch.from_numpy(one_hot(y_support, classes_per_set)).float(), requires_grad=False)
                if self.isCuadAvailable & self.use_cuda:
                    acc, c_loss = self.matchNet(x_support, y_support.cuda(), x_target,
                                                y_target.cuda())
                else:
                    acc, c_loss = self.matchNet(x_support, y_support, x_target, y_target)

                iter_out = "val_loss: {}, val_accuracy: {}".format(c_loss.data[0], acc.data[0])
                pbar.set_description(iter_out)
                pbar.update(1)
                total_c_loss += c_loss.data[0]
                total_accuracy += acc.data[0]

            total_c_loss = total_c_loss / total_val_batches
            total_accuracy = total_accuracy / total_val_batches
            self.scheduler.step(total_c_loss)
            return total_c_loss, total_accuracy

    def run_test_epoch(self, total_test_batches):
        """
        Run the training epoch
        :param total_train_batches: Number of batches to train on
        :return:
        """
        total_c_loss = 0.0
        total_accuracy = 0.0

        with tqdm.tqdm(total=total_test_batches) as pbar:
            for i in range(total_test_batches):
                x_support, y_support, x_target, y_target = self.data.get_test_batch()

                y_target = Variable(torch.from_numpy(y_target), requires_grad=False).squeeze().long()
                y_support = Variable(torch.from_numpy(one_hot(y_support, classes_per_set)).float(), requires_grad=False)
                if self.isCuadAvailable & self.use_cuda:
                    acc, c_loss = self.matchNet(x_support, y_support.cuda(), x_target,
                                                y_target.cuda())
                else:
                    acc, c_loss = self.matchNet(x_support, y_support, x_target, y_target)

                # TODO: update learning rate?

                iter_out = "val_loss: {}, val_accuracy: {}".format(c_loss.data[0], acc.data[0])
                pbar.set_description(iter_out)
                pbar.update(1)
                total_c_loss += c_loss.data[0]
                total_accuracy += acc.data[0]
                # self.total_train_iter+=1

            total_c_loss = total_c_loss / total_test_batches
            total_accuracy = total_accuracy / total_test_batches
            return total_c_loss, total_accuracy

In [9]:
# Experiment setup
batch_size = 8
# fce = True
classes_per_set = 5
samples_per_class = 1
channels = 1
# Training setup
total_epochs = 100
total_train_batches = 1000
total_val_batches = 250
total_test_batches = 500
best_val_acc = 0.0
use_cuda = True

In [11]:
data = TreeBankDataset(batch_size=batch_size, classes_per_set=classes_per_set,
                            samples_per_class=samples_per_class, seed=2017, shuffle=True, use_cache=True)

Processing Vocabs
Vocab processed, 9339 words in total
max sequence length = 122


In [16]:
obj_oneShotBuilder = TreeBankBuilder(data) 

In [20]:
obj_oneShotBuilder.build_experiment(batch_size=batch_size, num_lstm_hidden=100, sequence_embedding_size=100, input_embedding_dim=300, 
                                    lr=1e-3,  classes_per_set=classes_per_set,
                                    samples_per_class=samples_per_class, keep_prob=1, optim="adam", weight_decay=0,
                                    use_cuda=True)

In [None]:
for i in range(100):
    total_c_loss, total_accuracy = obj_oneShotBuilder.run_training_epoch(1000)
    print(total_c_loss, total_accuracy)

tr_loss: 1.625262975692749, tr_accuracy: 0.125: 100%|██████████| 1000/1000 [02:32<00:00,  6.45it/s]
tr_loss: 1.578507661819458, tr_accuracy: 0.375:   0%|          | 1/1000 [00:00<02:27,  6.77it/s]

1.6146310504674912 0.196


tr_loss: 1.5939922332763672, tr_accuracy: 0.125: 100%|██████████| 1000/1000 [02:25<00:00,  7.52it/s]
tr_loss: 1.6421555280685425, tr_accuracy: 0.0:   0%|          | 1/1000 [00:00<02:18,  7.21it/s]

1.6130730344057083 0.196875


tr_loss: 1.6031792163848877, tr_accuracy: 0.25:  80%|████████  | 802/1000 [02:02<00:30,  6.40it/s] 

In [None]:
# net = MatchingNetwork(vocab_size = len(data.word_to_idx), batch_size=batch_size, use_cuda=use_cuda, input_embedding_dim=300)

# support_x, support_y, target_x, target_y =  data.get_train_batch()

# target_y = Variable(torch.from_numpy(target_y), requires_grad=False).squeeze().long()
# support_y = Variable(torch.from_numpy(one_hot(support_y, classes_per_set)).float(), requires_grad=False)

# if use_cuda:
#     net.cuda()
#     target_y = target_y.cuda()
#     support_y = support_y.cuda()

# net(support_x, support_y, target_x, target_y)