# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# global imports (1 min)
import random
import numpy as np
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# local imports 
import loader
import distances
import algos
import data_processing as dp
import classifier

# Database

The miscellaneous database consists of 20 copyright-free book from children's literature obtained from [The Project Gutenberg](https://www.gutenberg.org/ebooks/bookshelf/20).  
The Tom Swift database consists of 27 books of the series Tom Swift by Victor Appleton obtained from [The Project Gutenberg](https://www.gutenberg.org/ebooks/search/?query=victor+appleton&submit_search=Go%21).

In [4]:
#Choosing the database ("Miscellaneous", "Tom Swift") and the number of books to load
nb_books = 3
all_paragraphs, all_sentences = loader.load_books("Tom Swift")
paragraphs, sentences = all_paragraphs[:nb_books], all_sentences[:nb_books]

In [5]:
#choose a random book and print 10 of its sentences
r = random.randrange(0, len(sentences)) #not included
for i in range(200, 210):
    print(sentences[r][i])

way.
"Burn the house, boys!" cried their officer; and this would be flashed
on the screen later as a lead.
The dwelling, which had been purchased with the right to burn it, was
set afire, and then began a scene that satisfied even the exacting
producer. Great clouds of smoke rolled out, most of it coming from
specially prepared bombs, and amid them and the red fire, which
simulated flames, could be seen the Union leader carrying out his
sweetheart, Birdie Lee.
Blake and Joe ground away at their cameras, faithfully recording the


# Embeddings
Here we use Sentence-BERT to embed the sentences in the database.

In [6]:
#embeddings of the sentences (1 min)
sentence_embedding = [model.encode(sentences[r]) for r in range(len(sentences))] #cannot stack because different number of pages

# Minimizing the distances
Here we want to find an oprtimal order by maximizing the semantic proximity between neighboring sentences. We have $n!$ possible orderings, so we can't use brute force. Our problem is similar to the traveling salesman problem, which is NP-hard, so we can't solve it optimally, therefore we design some algorithms to try to find a local minimum instead.

In [7]:
#run the local minimum algorithms on a subset of the sentences of the book and compare the permutation distances
pairwise_dist = distances.pairwise_dist(sentence_embedding)
distances2 = pairwise_dist[0][:100,:100]

default_order, random_order = list(range(len(distances2))), np.random.permutation(len(distances2))

for algo in [(lambda x, y: y), algos.greedy_add, algos.greedy_sort]:
    for order in [default_order, random_order]:
        for dist, dist_name in [(lambda o : distances.avg_consecutive_dist(o, distances2), "avg_consecutive_dist"), (lambda o : distances.avg_swap_dist(o, default_order), "avg_swap_dist"), (lambda o : distances.avg_R_dist(o, default_order), "avg_R_dist")]:
            ordered = algo(distances2, order)
            d = dist(order)
            print(f"{algo.__name__:15}{dist_name:25}{d}")
        print()

<lambda>       avg_consecutive_dist     0.758708119392395
<lambda>       avg_swap_dist            0.0
<lambda>       avg_R_dist               0.0

<lambda>       avg_consecutive_dist     0.8512262105941772
<lambda>       avg_swap_dist            0.26899999999999996
<lambda>       avg_R_dist               0.97

greedy_add     avg_consecutive_dist     0.758708119392395
greedy_add     avg_swap_dist            0.0
greedy_add     avg_R_dist               0.0

greedy_add     avg_consecutive_dist     0.8512262105941772
greedy_add     avg_swap_dist            0.26899999999999996
greedy_add     avg_R_dist               0.97

greedy_sort    avg_consecutive_dist     0.758708119392395
greedy_sort    avg_swap_dist            0.0
greedy_sort    avg_R_dist               0.0

greedy_sort    avg_consecutive_dist     0.8512262105941772
greedy_sort    avg_swap_dist            0.26899999999999996
greedy_sort    avg_R_dist               0.97



The first algorithm improves significantly the swap distance compared to a random permutation.

# Classifier: 
### Heuristics
The two metrics discussed were the following: distance betweeen pages or probability that a page is before another
For the distance between pages, we don't have the ground truth so to train a model it's easier to start with a classifier that takes in two pages and outputs a probability that the first page is before the second.

![Classifier Architecture](../img/Classifier_architecture.png)

### Training and Testing Datasets

In [8]:
#split the sentences into discard, training, validation and testing sets keeping the order
print(sentence_embedding[0].shape)
sentences_train, sentences_val, sentences_test = dp.split_sentences(sentence_embedding)
print(sentences_train[0].shape, sentences_val[0].shape, sentences_test[0].shape)

(4036, 384)
torch.Size([322, 384]) torch.Size([41, 384]) torch.Size([40, 384])


In [9]:
#create the database of the pairs of a subset of sentences (45 secs for 3 books)
X_train, y_train = dp.create_database(sentences_train)
X_val, y_val = dp.create_database(sentences_val)
X_test, y_test = dp.create_database(sentences_test)
print(X_train.shape, X_val.shape, X_test.shape)     #size n x (n-1)

  sentence_embeddings = torch.tensor(sentence_embeddings).to("cuda")


torch.Size([321150, 768]) torch.Size([5092, 768]) torch.Size([4848, 768])


### PyTorch Classifier

In [10]:
#hyperparameters
input_dim = X_train[0].shape[0]
output_dim = 1
hidden_dim = 128
learning_rate_list = [0.1, 0.01, 0.001, 0.0001]
epochs_list = [10, 100, 1000]
L2_alphas = [0, 0.01, 0.001, 0.0001]

In [11]:
#Create the classifier
network = classifier.Classifier(input_dim, hidden_dim, output_dim)

The best values for the hyperparameters were found to be:
- Number of epochs: 10 000
- Learning Rate: 0.001
- L2 Regularization: 0

In [12]:
#set the best hyperparameters
learning_rate = 0.001
epochs = 1000
L2_alpha = 0

In [13]:
#train the network with the best hyperparameters (4 mins)
loss = network.train(X_train, y_train, X_val, y_val, epochs, learning_rate, L2_alpha, True)

Epoch 0: train loss 0.693549
Epoch 100: train loss 0.541392
Epoch 200: train loss 0.481243
Epoch 300: train loss 0.369160
Epoch 400: train loss 0.255206
Epoch 500: train loss 0.183871
Epoch 600: train loss 0.140923
Epoch 700: train loss 0.113037
Epoch 800: train loss 0.093587
Epoch 900: train loss 0.079201
Validation loss 1.942119


In [14]:
#test on the GPU
y_pred = network(X_test.float().cuda())
#BCE loss
loss = network.loss_fn()(y_pred, y_test.float().cuda())
#convert to numpy arrays and round predictions
y_test_array, y_pred_array = y_test.cpu().detach().numpy(), y_pred.cpu().detach().numpy().round()
#accuracy
accuracy = accuracy_score(y_test_array, y_pred_array)
#F1 score
F1 = f1_score(y_test_array, y_pred_array)
#AUC score
AUC = roc_auc_score(y_test_array, y_pred_array)

print("Test BCE loss %f" % loss.item())
print("Test accuracy %f" % accuracy)
print("Test F1 %f" % F1.item())
print("Test AUC %f" % AUC.item())

Test BCE loss 1.632459
Test accuracy 0.626238
Test F1 0.624534
Test AUC 0.626238


### Exploiting the classifier

In [15]:
#load a new book
new_embedding = [model.encode(all_sentences[nb_books])]

In [16]:
#compute the pairwise page order in the new book using the network (20 secs)
reduced_embedding = dp.split_sentences(new_embedding, 0.1, 1)[0]
X2, y2 = dp.create_database(reduced_embedding)
y_pred2 = network(X2.float().cuda())

  sentence_embeddings = torch.tensor(sentence_embeddings).to("cuda")


In [17]:
#BCE loss
loss = network.loss_fn()(y_pred2, y2.float().cuda())
#convert to numpy arrays and round predictions
y2_array, y_pred2_array = y2.cpu().detach().numpy(), y_pred2.cpu().detach().numpy().round()
#accuracy
accuracy = accuracy_score(y2_array, y_pred2_array)
#F1 score
F1 = f1_score(y2_array, y_pred2_array)
#AUC score
AUC = roc_auc_score(y2_array, y_pred2_array)
print("Test BCE loss %f" % loss.item())
print("Test accuracy %f" % accuracy)
print("Test F1 %f" % F1.item())
print("Test AUC %f" % AUC.item())

Test BCE loss 2.800079
Test accuracy 0.497549
Test F1 0.499699
Test AUC 0.497549


In [18]:
#compute the permutation distances to the real order
#compute the real order of the masked sentences

# Transformer:
### Heuristics 
Full transformer network that takes in pages as tokens and outputs an order, and the loss function would be a distance between two permutations.

In [19]:
#end-to-end transformer model using swap distance or R-distance as a loss function

In [20]:
#ChatGPT approach:
"""
Ordering the pages of a book using a transformer model can be challenging, especially when dealing with limited token size. Here's how you can create a transformer model for this problem and address the token size limitation:

1. Data Preparation:

    Start by collecting a dataset of books with pages not in order. Each page should be a separate input example, and the pages should be represented as text.

2. Text Tokenization:

    Tokenize the text from each page into smaller units, such as words or subwords, using a tokenizer. Popular tokenization libraries like Hugging Face Transformers provide tokenizers that can handle large texts and split them into tokens without worrying about the token size limitation.

3. Sliding Window Approach:

    As you mentioned, most transformer models have a token size limitation. To overcome this limitation, you can use a sliding window approach. Split each page into overlapping segments or windows of tokens. This will allow you to work with manageable token sizes for your model.

4. Model Architecture:

    You can use a standard transformer architecture for this task. However, you might need to modify it slightly to account for the specific requirements of ordering pages. Your model should be capable of learning the relationships between pages and their optimal order.

5. Training:

    Train your transformer model on the dataset of disordered pages. You can use a contrastive loss function to ensure that the model learns to distinguish between correct and incorrect page orderings. This involves providing pairs of pages where one pair is in the correct order, and another pair is not.

6. Inference:

    When you want to order a book with disordered pages, you can feed the pages into your trained model. The model should provide a probability score or ranking for each possible page order. You can then select the order with the highest score as the predicted correct order.

7. Evaluation:

    To evaluate the model's performance, you can use metrics like mean squared error (MSE) or Kendall's Tau rank correlation to compare the predicted order with the ground truth order.

Keep in mind that this is a challenging NLP task, and it may require a significant amount of data and computational resources. Additionally, your sliding window approach should be carefully designed to minimize information loss while breaking down the text into manageable token-sized chunks.
"""

"\nOrdering the pages of a book using a transformer model can be challenging, especially when dealing with limited token size. Here's how you can create a transformer model for this problem and address the token size limitation:\n\n1. Data Preparation:\n\n    Start by collecting a dataset of books with pages not in order. Each page should be a separate input example, and the pages should be represented as text.\n\n2. Text Tokenization:\n\n    Tokenize the text from each page into smaller units, such as words or subwords, using a tokenizer. Popular tokenization libraries like Hugging Face Transformers provide tokenizers that can handle large texts and split them into tokens without worrying about the token size limitation.\n\n3. Sliding Window Approach:\n\n    As you mentioned, most transformer models have a token size limitation. To overcome this limitation, you can use a sliding window approach. Split each page into overlapping segments or windows of tokens. This will allow you to wor