# Imports

In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# global imports (1 sec - 30 secs)
import random
import numpy as np
import torch
from accelerate import Accelerator
from sentence_transformers import SentenceTransformer
accelerator = Accelerator()
model = accelerator.prepare(SentenceTransformer("all-mpnet-base-v2", device="cuda")) #all-MiniLM-L6-v2
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
# Get the max token size
max_token_size = model.max_seq_length
print(f"Max token size: {max_token_size}")

Max token size: 384


In [6]:
# local imports (1 sec - 2 secs)
import loader
import distances
import algos
import data_processing as dp
import classifier

In [7]:
torch.cuda.is_available()

True

In [8]:
#clear all memory of the GPU
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.reset_peak_memory_stats(device)

# Database

The miscellaneous database consists of 20 copyright-free book from children's literature obtained from [The Project Gutenberg](https://www.gutenberg.org/ebooks/bookshelf/20).  
The Tom Swift database consists of 27 books of the series Tom Swift by Victor Appleton obtained from [The Project Gutenberg](https://www.gutenberg.org/ebooks/search/?query=victor+appleton&submit_search=Go%21).

In [9]:
#Choosing the database ("Miscellaneous", "Tom Swift") and the number of books to load (1 min)
nb_books = 25
mode = "chunks"
all_sentences = loader.load_books("Tom Swift", mode, max_token_size, tokenizer=model.tokenizer)
#Randomly reorder the books
random.shuffle(all_sentences)
sentences = all_sentences[:nb_books]

In [10]:
#choose a random book and print 1 chunk
r = random.randrange(0, len(sentences)) #not included
r2 = random.randrange(0, len(sentences[r]))
for i in range(r2, r2+1):
    print(sentences[r][i])

the real object of my visit was to say this to you." The man approached still closer to Tom, and, in a lower voice, and one that could scarcely be heard, he fairly hissed: "Don't go with Barcoe Jenks to seek the diamond makers!" Then, before Tom could put out a hand to detain him, had the lad so wished, the man turned suddenly, and fairly ran from the shed. CHAPTER VI--MR. DAMON IS ON HAND The young inventor stood almost spellbound for a few moments. Then recovering himself he made a dash for the door through which the mysterious man had disappeared. Tom saw him sprinting down the road, and was half-minded to take after him, but a cooler thought warned him that he had better not. "He may be one of those men who are on Mr. Jenks' trail," reasoned Tom, in which case it might not be altogether safe to attempt to stop him, and make him explain. Or he may be a lunatic, and in that case it wouldn't be altogether healthy to interfere with him. "I'll just let him go, and tell Mr. Jenks about h

# Embeddings
Here we use Sentence-BERT to embed the sentences in the database.

In [11]:
#embeddings of the sentences (1 min - 2 min)
sentence_embedding = [model.encode(sentences[r]) for r in range(len(sentences))] #cannot stack because different number of pages

# Minimizing the distances
Here we want to find an oprtimal order by maximizing the semantic proximity between neighboring sentences. We have $n!$ possible orderings, so we can't use brute force. Our problem is similar to the traveling salesman problem, which is NP-hard, so we can't solve it optimally, therefore we design some algorithms to try to find a local minimum instead.

In [12]:
#run the local minimum algorithms on a subset of the sentences of the book and compare the permutation distances (1 sec - 4 secs)
pairwise_dist = distances.pairwise_dist(sentence_embedding)
distances2 = pairwise_dist[0][:100,:100]

default_order, random_order = list(range(len(distances2))), np.random.permutation(len(distances2))

for algo in [(lambda x, y: y), algos.insertion_sort, algos.greedy_sort]:
    for order, order_name in [(default_order, "default order"), (random_order, "random_order")]:
        for dist, dist_name in [(lambda o : distances.avg_consecutive_dist(o, distances2), "avg_consecutive_dist"), (lambda o : distances.kendall_tau(o, default_order), "kendall_tau"), (lambda o : distances.avg_R_dist(o, default_order), "avg_R_dist")]:
            ordered = algo(distances2, order)
            d = dist(ordered)
            print(f"{order_name:25}{algo.__name__:20}{dist_name:25}{d}")
        print()

default order            <lambda>            avg_consecutive_dist     0.3129834234714508
default order            <lambda>            kendall_tau              1.0
default order            <lambda>            avg_R_dist               0.0

random_order             <lambda>            avg_consecutive_dist     0.4896382987499237
random_order             <lambda>            kendall_tau              -0.07919191919191926
random_order             <lambda>            avg_R_dist               1.0

default order            insertion_sort      avg_consecutive_dist     0.380018413066864
default order            insertion_sort      kendall_tau              0.6177777777777778
default order            insertion_sort      avg_R_dist               0.88

random_order             insertion_sort      avg_consecutive_dist     0.45770129561424255
random_order             insertion_sort      kendall_tau              0.2707070707070707
random_order             insertion_sort      avg_R_dist               0.98


The insertion sort improves significantly the kendall tau compared to a random permutation.

# Classifier: 
### Heuristics
The two metrics discussed were the following: distance betweeen pages or probability that a page is before another.

For the distance between pages, we don't have the ground truth so to train a model it's easier to start with a classifier that takes in two pages and outputs a probability that the first page is before the second.

![Classifier Architecture](../img/Classifier_architecture.png)

### Training and Testing Datasets

In [13]:
#split the sentences into discard, training, validation and testing sets keeping the order
print(sentence_embedding[0].shape)
sentences_train, sentences_val, sentences_test = dp.split_sentences(sentence_embedding, 1)
print(sentences_train[0].shape, sentences_val[0].shape, sentences_test[0].shape)

(155, 768)
torch.Size([124, 768]) torch.Size([16, 768]) torch.Size([15, 768])


In [14]:
#create the database of the pairs of a subset of sentences (30 sec - 1 min for 30 books and 100%)
X_train, y_train = dp.create_database(sentences_train)
X_val, y_val = dp.create_database(sentences_val)
X_test, y_test = dp.create_database(sentences_test)
print(X_train.shape, X_val.shape, X_test.shape)     #size n x (n-1)

torch.Size([367369, 1536]) torch.Size([6123, 1536]) torch.Size([5724, 1536])


### PyTorch Classifier

#### Hyperparameters and validation

In [15]:
#hyperparameters
input_dim = X_train[0].shape[0]
output_dim = 1
hidden_dim = 128
learning_rate_list = [0.1, 0.01, 0.001, 0.0001]
epochs_list = [10, 100, 1000]
L2_alphas = [0, 0.01, 0.001, 0.0001]

In [16]:
#Create the classifier
network = classifier.Classifier(input_dim, hidden_dim, output_dim, accelerator)

The best values for the hyperparameters were found to be:
- Number of epochs: 10 000
- Learning Rate: 0.001
- L2 Regularization: 0

#### Training

In [17]:
#set the best hyperparameters
learning_rate = 0.001
epochs = 1000
L2_alpha = 0

In [18]:
#Create the classifier
network = classifier.Classifier(input_dim, hidden_dim, output_dim, accelerator)

In [19]:
#train the network with the best hyperparameters (1 mins - 1.5 min)
loss = network.train(X_train, y_train, X_val, y_val, epochs, learning_rate, L2_alpha, True)

Epoch 0: train loss 0.692900
Epoch 100: train loss 0.496652
Epoch 200: train loss 0.457075
Epoch 300: train loss 0.427931
Epoch 400: train loss 0.387018
Epoch 500: train loss 0.342939
Epoch 600: train loss 0.303770
Epoch 700: train loss 0.271214
Epoch 800: train loss 0.244176
Epoch 900: train loss 0.221592
Validation loss 0.457605


#### Test of the pairwise order predictions on the test set

In [20]:
#test on the GPU
y_pred = network(X_test.float().cuda())
#BCE loss
loss = network.loss_fn()(y_pred, y_test.float().cuda())
#remove elements are diagonal elements
indices = [i for i,v in enumerate(y_test) if v==0.5]
y_pred_hole = torch.tensor([value for index, value in enumerate(y_pred) if not index in indices])
y_test_hole = torch.tensor([value for index, value in enumerate(y_test) if not index in indices])
#convert to numpy arrays and round predictions
y_test_array, y_pred_array = y_test_hole.cpu().detach().numpy(), y_pred_hole.cpu().detach().numpy().round()
#accuracy
accuracy = accuracy_score(y_test_array, y_pred_array)
#F1 score
F1 = f1_score(y_test_array, y_pred_array)
#AUC score
AUC = roc_auc_score(y_test_array, y_pred_array)

print("Number of values %d" % y_test_array.shape[0])
print("Test BCE loss %f" % loss.item())
print("Test accuracy %f" % accuracy)
print("Test F1 %f" % F1.item())
print("Test AUC %f" % AUC.item())

Number of values 5346
Test BCE loss 0.482733
Test accuracy 0.796483
Test F1 0.799114
Test AUC 0.796483


### Exploiting the classifier

In [21]:
#embed a new book (2.5 sec)
new_embedding = [model.encode(all_sentences[nb_books])]

In [22]:
#predict the pairwise page order in the new book using the network (2 secs)
reduced_embedding = dp.split_sentences(new_embedding, 1, 1)[0]
X2, y2 = dp.create_database(reduced_embedding)
y2_pred = network(X2.float().cuda())

#### Test of the pairwise ordering predictions for a new book

In [23]:
#BCE loss
loss = network.loss_fn()(y2_pred, y2.float().cuda())
#remove elements are diagonal elements
indices = [i for i,v in enumerate(y2) if v==0.5]
y2_pred_hole = torch.tensor([value for index, value in enumerate(y2_pred) if not index in indices])
y2_hole = torch.tensor([value for index, value in enumerate(y2) if not index in indices])
#convert to numpy arrays and round predictions
y2_array, y2_pred_array = y2_hole.cpu().detach().numpy(), y2_pred_hole.cpu().detach().numpy().round()
#accuracy
accuracy = accuracy_score(y2_array, y2_pred_array)
#F1 score
F1 = f1_score(y2_array, y2_pred_array)
#AUC score
AUC = roc_auc_score(y2_array, y2_pred_array)

print("Number of values %d" % y2_array.shape[0])
print("Test BCE loss %f" % loss.item())
print("Test accuracy %f" % accuracy)
print("Test F1 %f" % F1.item())
print("Test AUC %f" % AUC.item())

Number of values 24492
Test BCE loss 0.576227
Test accuracy 0.749347
Test F1 0.751246
Test AUC 0.749347


In [24]:
#convert the predictions to a pairwise probability matrix
pairwise_probabilities = dp.flattened_to_matrix(y2_pred)
#convert to a shifted antisymmetric matrix by averaging the predictions of the upper and lower triangular
averaged_probabilities = dp.average_matrix(pairwise_probabilities)
#test if the shifted matrix is asymmetric
print(np.sum(averaged_probabilities-0.5 + np.transpose(averaged_probabilities-0.5))==0)

True


In [42]:
#compute the min weight transitivity closure of the pairwise order graph using Floyd-Warshall
min_closure = algos.order_from_pairwise(pairwise_probabilities)

#### Test of the transitive predictions for the new book

In [48]:
#convert directed edges to a list 0 and 1
transitive_pred = dp.edges_to_pred(min_closure)
#accuracy
accuracy = accuracy_score(y2_array, transitive_pred)
#F1 score
F1 = f1_score(y2_array, transitive_pred)
#AUC score
AUC = roc_auc_score(y2_array, transitive_pred)

print("Number of values %d" % transitive_pred.shape[0])
print("Test accuracy %f" % accuracy)
print("Test F1 %f" % F1.item())
print("Test AUC %f" % AUC.item())

Number of values 24492
Test accuracy 0.759758
Test F1 0.756920
Test AUC 0.759758


In [49]:
#compute obtimal order the topological sort algorithm
pred_order = algos.topological_sort(min_closure)

In [50]:
#compare with the actual order of masked pages
ground_truth_order = list(range(len(new_embedding[0])))
print("Kendall tau: %f" % distances.kendall_tau(pred_order, ground_truth_order))
print("R distance: %f" % distances.avg_R_dist(pred_order, ground_truth_order))

Kendall tau: 0.520496
R distance: 0.993631


# Set Transformer:
### Heuristics 
Full transformer network that takes in pages and outputs an order, the architecture for this set-to-sequence model comes from [Set Transformer](https://arxiv.org/abs/1810.00825).

DRAWING