## Quantum LSTM for classification tasks

#### Let us start by training a Quantum Long Short-Term Memory model for the binary classification of IMDB movie reviews.

#### We will work with a preprocessed dataset, in which every movie review have been shorten to its most relevant 5 words.

In [1]:
import sys; sys.path.append('../src')
import torch
import pickle
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
cmap = get_cmap('tab20c')
colors = cmap.colors

  cmap = get_cmap('tab20c')


In [2]:
data = '../Datasets/imdb_top5_vocab20.txt'
data = '../../../../Downloads/tcga_top5.txt'

with open(data, 'r') as file: reviews = file.readlines()

classes = list(set(list(int(review[:2]) for review in reviews)))
class_dist = {c: 0 for c in classes}
for review in reviews: class_dist[int(review[:2])] += 1
train_classes = [0,1,2]

#### The first character of each datapoint is a label. Good reviews are assigned label $1$, while bad ones are assigned label $0$.
#### For this tutorial, we will generate a balanced training set. 

In [3]:
from qlstm_aux import subset_idxs_dist

train_size = 60
balanced = True
randomized = False

train_idxs, train_dist = subset_idxs_dist(reviews, train_classes, train_size, balanced, randomized)

In [4]:
print(train_idxs)
print(train_dist)

[6, 7, 10, 15, 18, 19, 20, 23, 25, 27, 33, 36, 41, 43, 44, 45, 47, 48, 49, 51, 54, 55, 56, 62, 65, 73, 74, 80, 89, 93, 94, 99, 100, 105, 106, 108, 110, 111, 114, 117, 122, 123, 124, 129, 132, 133, 139, 162, 165, 168, 175, 178, 186, 191, 196, 203, 218, 224, 234, 249]
{0: 20, 1: 20, 2: 20}


In [5]:
from qlstm_aux import words_to_idxs
words_to_idxs = words_to_idxs(reviews, train_idxs)

In [6]:
print(len(words_to_idxs))
print(words_to_idxs)

82
{'examination': 0, 'performed': 1, 'invasive': 2, 'cell': 3, 'area': 4, 'stage': 5, 'total': 6, 'positive': 7, 'involved': 8, 'site': 9, 'uuid': 10, 'description': 11, 'patient': 12, 'case': 13, 'pathology': 14, 'vascular': 15, 'date': 16, 'posterior': 17, 'additional': 18, 'entirely': 19, 'negative': 20, 'grossly': 21, 'attached': 22, 'measures': 23, 'representative': 24, 'metastatic': 25, 'excision': 26, 'parenchyma': 27, 'metastasis': 28, 'frozen': 29, 'shows': 30, 'adipose': 31, 'pathologic': 32, 'red': 33, 'unremarkable': 34, 'consists': 35, 'iii': 36, 'small': 37, 'diameter': 38, 'free': 39, 'pr': 40, 'biopsy': 41, 'white': 42, 'page': 43, 'number': 44, 'ii': 45, 'extension': 46, 'uninvolved': 47, 'end': 48, 'labeled': 49, 'portion': 50, 'tan': 51, 'measuring': 52, 'adenocarcinoma': 53, 'mass': 54, 'mm': 55, 'areas': 56, 'surgical': 57, 'resection': 58, 'inked': 59, 'blue': 60, 'cells': 61, 'yellow': 62, 'pink': 63, 'anterior': 64, 'microscopic': 65, 'soft': 66, 'present': 67,

#### Next, we define the model we want to train.

In [17]:
embedding_dim = 8
hidden_dim = 6
num_words = len(words_to_idxs)
num_labels = len(train_classes)
num_qubits = 3
num_qlayers = 1
backend = 'default.qubit'

In [18]:
from qlstm_models import QLSTMClassifier

model = QLSTMClassifier(embedding_dim,
                        hidden_dim,
                        vocab_size=num_words,
                        tagset_size=num_labels,
                        n_qubits=num_qubits,
                        n_qlayers=num_qlayers,
                        ising=True,
                        probs=False,
                        backend=backend)

{'weights': (1, 4)}


In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

weights = torch.tensor(list(pow(train_size/len(train_classes)/train_dist[key],3) for key in train_dist.keys()), dtype=torch.float)
learning_rate = 0.001

loss_function = nn.NLLLoss(weight=weights)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
num_epochs = 50
training = {'loss': [], 'acc': []}

In [None]:
from numpy import mean, around
from numpy.random import shuffle
from qlstm_aux import prepare_sequence

for epoch in range(num_epochs):

    shuffle(train_idxs)

    rank_losses = []
    rank_preds = []
    rank_targets = []
    accuracy = torch.tensor([0], dtype=torch.float32)
    
    if epoch == num_epochs-1: rank_confusion = {c: [] for c in train_classes}

    for k, i in enumerate(train_idxs):
        
        model.zero_grad(set_to_none=False)
        
        if i != -1:
            sentence = reviews[i][2:-1].split()
            sentence_in = prepare_sequence(sentence, words_to_idxs)
            label = list(0 for j in range(len(train_classes)))
            for j, key in enumerate(train_dist.keys()):
                if int(reviews[i][:2]) == key: label[j] += 1
            label = torch.tensor(label, dtype=torch.long)
            rank_targets.append(label)

            scores = model(sentence_in)
            loss = loss_function(scores, torch.tensor([label.argmax(dim=-1)]))
            loss.backward()
            rank_losses.append(float(loss))
            pred = list(0 for j in range(len(train_classes)))
            pred[scores.argmax().item()] += 1
            pred = torch.tensor(pred, dtype=torch.long)
            rank_preds.append(pred)

            if epoch == num_epochs-1: rank_confusion[int(reviews[i][:2])].append(scores.argmax().item())

        else: loss = torch.tensor([-100.], dtype=torch.float32)

        optimizer.step()

    avg_loss = mean(rank_losses)
    training['loss'].append(avg_loss)

    preds = torch.cat(rank_preds)
    targets = torch.cat(rank_targets)
    corrects = (preds == targets)
    accuracy = corrects.sum().float() / float(targets.size(0))
    training['acc'].append(accuracy)

    if epoch%10==0: print('Loss: {}\tAccuracy: {}'.format(around(avg_loss,4), around(accuracy,4)))

Loss: -0.3355	Accuracy: 0.5555999875068665
Loss: -0.3822	Accuracy: 0.6888999938964844
Loss: -0.5418	Accuracy: 0.8222000002861023
Loss: -0.6943	Accuracy: 0.8555999994277954
Loss: -0.7974	Accuracy: 0.9333000183105469


In [None]:
fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(3.375*3/2, 3.375*4/3), dpi=200)
for ax in axs.ravel():
    ax.tick_params(labelsize='xx-small')
axs[0].set_ylabel('Loss', fontsize='x-small')
axs[1].set_ylabel('Accuracy', fontsize='x-small')
axs[1].set_xlabel('Epoch', fontsize='x-small')
axs[0].plot(training['loss'], lw=1, color=colors[1], label='LSTM');
axs[1].plot(training['acc'], lw=1, color=colors[1], label='LSTM');

In [None]:
from numpy import array, arange

print(rank_confusion)
confusion = []
for i, c in enumerate(rank_confusion.keys()):
    temp = list(0 for c in train_classes)
    for pc in rank_confusion[c]: temp[pc] += 1/train_dist[c]
    confusion.append(temp)
confusion = array(confusion)

print(confusion)

In [None]:
from aux import annotate_heatmap

labels = list(c for c in train_dist.keys())

fig, ax = plt.subplots(figsize=(3.375, 3.375), dpi=150)
ax.tick_params(labelsize='small')
ax.set_yticks(arange(len(labels)), labels)
ax.set_xticks(arange(len(labels)), labels)
im = ax.matshow(100*confusion, vmin=0, vmax=100, cmap='gist_heat')
texts = annotate_heatmap(im, valfmt='{x:.2f}%', fontsize='small')
fig.show()