# Imports

In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# global imports (1 sec - 30 secs)
import random
import numpy as np
import torch
from accelerate import Accelerator
from sentence_transformers import SentenceTransformer
accelerator = Accelerator()
model = accelerator.prepare(SentenceTransformer("all-mpnet-base-v2", device="cuda")) #all-MiniLM-L6-v2
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
# Get the max token size
max_token_size = model.max_seq_length
print(f"Max token size: {max_token_size}")

Max token size: 384


In [27]:
# local imports (1 sec - 2 secs)
import loader
import distances
import algos
import data_processing as dp
import classifier

In [28]:
torch.cuda.is_available()

True

In [29]:
#clear all memory of the GPU
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.reset_peak_memory_stats(device)

# Database

The miscellaneous database consists of 20 copyright-free book from children's literature obtained from [The Project Gutenberg](https://www.gutenberg.org/ebooks/bookshelf/20).  
The Tom Swift database consists of 27 books of the series Tom Swift by Victor Appleton obtained from [The Project Gutenberg](https://www.gutenberg.org/ebooks/search/?query=victor+appleton&submit_search=Go%21).

In [30]:
#Choosing the database ("Miscellaneous", "Tom Swift") and the number of books to load (1 min)
nb_books = 25
mode = "chunks"
all_sentences = loader.load_books("Tom Swift", mode, max_token_size, tokenizer=model.tokenizer)
#Randomly reorder the books
random.shuffle(all_sentences)
sentences = all_sentences[:nb_books]

In [31]:
#choose a random book and print 1 chunk
r = random.randrange(0, len(sentences)) #not included
r2 = random.randrange(0, len(sentences[r]))
for i in range(r2, r2+1):
    print(sentences[r][i])

district where our men were working, and now the privilege, or concession, has been withdrawn. I'm going down to see if I can't get it back. And I want you to go with me." "And I came here for very nearly the same thing," went on Mr. Titus. "That is where the coincidence comes in. It is strange that we should both appeal to Mr. Swift at the same time." "Well, Tom's a valuable helper!" exclaimed Mr. Damon. "I know him of old, for I've been on many a trip with him." "This is the first time I have had the pleasure of meeting him," resumed the tunnel contractor, "but I have heard of him. I did not ask him to go to South America for us. I only wanted to get some superior explosive for my brother, who is in charge of driving the railroad tunnel through a spur of the Andes. I look after matters up North here, but I may have to go to Peru myself. "As I told Mr. Swift, I had read of his invention of the giant cannon and the special powder he used in it to send a projectile such a distance. The 

# Embeddings
Here we use Sentence-BERT to embed the sentences in the database.

In [32]:
#embeddings of the sentences (1 min - 2 min)
sentence_embedding = [model.encode(sentences[r]) for r in range(len(sentences))] #cannot stack because different number of pages

# Minimizing the distances
Here we want to find an oprtimal order by maximizing the semantic proximity between neighboring sentences. We have $n!$ possible orderings, so we can't use brute force. Our problem is similar to the traveling salesman problem, which is NP-hard, so we can't solve it optimally, therefore we design some algorithms to try to find a local minimum instead.

In [None]:
#run the local minimum algorithms on a subset of the sentences of the book and compare the permutation distances (1 sec - 4 secs)
pairwise_dist = distances.pairwise_dist(sentence_embedding)
distances2 = pairwise_dist[0][:100,:100]

default_order, random_order = list(range(len(distances2))), np.random.permutation(len(distances2))

for algo in [(lambda x, y: y), algos.insertion_sort, algos.greedy_sort]:
    for order in [default_order, random_order]:
        for dist, dist_name in [(lambda o : distances.avg_consecutive_dist(o, distances2), "avg_consecutive_dist"), (lambda o : distances.avg_swap_dist(o, default_order), "avg_swap_dist"), (lambda o : distances.avg_R_dist(o, default_order), "avg_R_dist")]:
            ordered = algo(distances2, order)
            d = dist(order)
            print(f"{algo.__name__:15}{dist_name:25}{d}")
        print()

<lambda>       avg_consecutive_dist     0.31866154074668884
<lambda>       avg_swap_dist            0.0
<lambda>       avg_R_dist               0.0

<lambda>       avg_consecutive_dist     0.43418964743614197
<lambda>       avg_swap_dist            0.249
<lambda>       avg_R_dist               0.99

insertion_sort avg_consecutive_dist     0.31866154074668884
insertion_sort avg_swap_dist            0.0
insertion_sort avg_R_dist               0.0

insertion_sort avg_consecutive_dist     0.43418964743614197
insertion_sort avg_swap_dist            0.249
insertion_sort avg_R_dist               0.99

greedy_sort    avg_consecutive_dist     0.31866154074668884
greedy_sort    avg_swap_dist            0.0
greedy_sort    avg_R_dist               0.0

greedy_sort    avg_consecutive_dist     0.43418964743614197
greedy_sort    avg_swap_dist            0.249
greedy_sort    avg_R_dist               0.99



The first algorithm improves significantly the swap distance compared to a random permutation.

# Classifier: 
### Heuristics
The two metrics discussed were the following: distance betweeen pages or probability that a page is before another.

For the distance between pages, we don't have the ground truth so to train a model it's easier to start with a classifier that takes in two pages and outputs a probability that the first page is before the second.

![Classifier Architecture](../img/Classifier_architecture.png)

### Training and Testing Datasets

In [None]:
#split the sentences into discard, training, validation and testing sets keeping the order
print(sentence_embedding[0].shape)
sentences_train, sentences_val, sentences_test = dp.split_sentences(sentence_embedding, 1)
print(sentences_train[0].shape, sentences_val[0].shape, sentences_test[0].shape)

(279, 768)
torch.Size([223, 768]) torch.Size([28, 768]) torch.Size([28, 768])


In [None]:
#create the database of the pairs of a subset of sentences (30 sec - 1 min for 25 books and 100%)
X_train, y_train = dp.create_database(sentences_train)
X_val, y_val = dp.create_database(sentences_val)
X_test, y_test = dp.create_database(sentences_test)
print(X_train.shape, X_val.shape, X_test.shape)     #size n x (n-1)

torch.Size([864198, 1536]) torch.Size([13386, 1536]) torch.Size([13204, 1536])


### PyTorch Classifier

In [None]:
#hyperparameters
input_dim = X_train[0].shape[0]
output_dim = 1
hidden_dim = 128
learning_rate_list = [0.1, 0.01, 0.001, 0.0001]
epochs_list = [10, 100, 1000]
L2_alphas = [0, 0.01, 0.001, 0.0001]

In [None]:
#Create the classifier
network = classifier.Classifier(input_dim, hidden_dim, output_dim, accelerator)

The best values for the hyperparameters were found to be:
- Number of epochs: 10 000
- Learning Rate: 0.001
- L2 Regularization: 0

In [None]:
#set the best hyperparameters
learning_rate = 0.001
epochs = 1000
L2_alpha = 0

In [None]:
#train the network with the best hyperparameters (1 mins - 1.5 min)
loss = network.train(X_train, y_train, X_val, y_val, epochs, learning_rate, L2_alpha, True)

Epoch 0: train loss 0.693072
Epoch 100: train loss 0.535311
Epoch 200: train loss 0.494067
Epoch 300: train loss 0.448021
Epoch 400: train loss 0.401572
Epoch 500: train loss 0.361061
Epoch 600: train loss 0.325222
Epoch 700: train loss 0.293390
Epoch 800: train loss 0.267515
Epoch 900: train loss 0.242591
Validation loss 0.560866


In [None]:
#test on the GPU
y_pred = network(X_test.float().cuda())
#BCE loss
loss = network.loss_fn()(y_pred, y_test.float().cuda())
#convert to numpy arrays and round predictions
y_test_array, y_pred_array = y_test.cpu().detach().numpy(), y_pred.cpu().detach().numpy().round()
#accuracy
accuracy = accuracy_score(y_test_array, y_pred_array)
#F1 score
F1 = f1_score(y_test_array, y_pred_array)
#AUC score
AUC = roc_auc_score(y_test_array, y_pred_array)

print("Number of values %d" % y_test_array.shape[0])
print("Test BCE loss %f" % loss.item())
print("Test accuracy %f" % accuracy)
print("Test F1 %f" % F1.item())
print("Test AUC %f" % AUC.item())

Number of values 13204
Test BCE loss 0.480835
Test accuracy 0.787034
Test F1 0.787998
Test AUC 0.787034


### Exploiting the classifier

In [None]:
#embed a new book (2.5 sec)
new_embedding = [model.encode(all_sentences[nb_books])]

In [None]:
#predict the pairwise page order in the new book using the network (2 secs)
reduced_embedding = dp.split_sentences(new_embedding, 1, 1)[0]
X2, y2 = dp.create_database(reduced_embedding)
y_pred2 = network(X2.float().cuda())

In [None]:
#BCE loss
loss = network.loss_fn()(y_pred2, y2.float().cuda())
#convert to numpy arrays and round predictions
y2_array, y_pred2_array = y2.cpu().detach().numpy(), y_pred2.cpu().detach().numpy().round()
#accuracy
accuracy = accuracy_score(y2_array, y_pred2_array)
#F1 score
F1 = f1_score(y2_array, y_pred2_array)
#AUC score
AUC = roc_auc_score(y2_array, y_pred2_array)

print("Number of values %d" % y2_array.shape[0])
print("Test BCE loss %f" % loss.item())
print("Test accuracy %f" % accuracy)
print("Test F1 %f" % F1.item())
print("Test AUC %f" % AUC.item())

Number of values 16512
Test BCE loss 0.876174
Test accuracy 0.576672
Test F1 0.564323
Test AUC 0.576672


In [None]:
#convert the predictions to a pairwise probability matrix
pairwise_probabilities = dp.pred_to_pairwise(y_pred2)
#compute the min weight transitivity closure of the pairwise order graph using Floyd-Warshall
#compute obtimal order the topological sort algorithm
pred_order = dp.weighted_transitivity_closure(pairwise_probabilities)

In [None]:
#compare with the actual order of masked pages
ground_truth_order = list(range(len(new_embedding[0])))
print("Swap distance: %f" % distances.avg_swap_dist(pred_order, ground_truth_order))
print("R distance: %f" % distances.avg_R_dist(pred_order, ground_truth_order))

Swap distance: 0.031210
R distance: 0.895652


# Set Transformer:
### Heuristics 
Full transformer network that takes in pages and outputs an order, the architecture for this set-to-sequence model comes from [Set Transformer](https://arxiv.org/abs/1810.00825).

DRAWING