# BookSum Dataset Creation
The produced Dataset will have the following structure:
* Each document (text+abstract) entry is saved in a different file (e.g. data_{i}.txt)
* Each document has the following structure: [CLS] text [SEP] abstract [SEP]
* The labels of the documents are saved in a single file (e.g. labels.txt), where each line corresponds to the document with id equal to the line number. 

In [1]:
import json
import os
import random

In [2]:
# Chapter-level alignment files, one JSON object per line.
# The train/test/val files are pooled into a single list; this notebook
# derives its own train/test split further down.
paths = [
    'alignments/chapter-level-summary-alignments/chapter_summary_aligned_train_split.jsonl',
    'alignments/chapter-level-summary-alignments/chapter_summary_aligned_test_split.jsonl',
    'alignments/chapter-level-summary-alignments/chapter_summary_aligned_val_split.jsonl',
]

alignments = []
for json_file_path in paths:
    # Explicit UTF-8 (as in the other readers of this notebook) so decoding
    # does not depend on the platform's default encoding.
    with open(json_file_path, 'r', encoding="utf8") as file:
        for line in file:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on them.
            if line.strip():
                alignments.append(json.loads(line))
len(alignments)

12630

## Dataset parameters

In [3]:
# Settings for the dataset build.
# num_docs is not fixed here: it is computed later from the alignments whose
# summary file is actually present, not from len(alignments) (12630).

# Number of (positive, negative) document pairs written for each chapter.
num_neg_ex = 2

# Label strings written to labels.txt.
pos_label, neg_label = "1", "0"

# Maximum number of words kept from the chapter text and from the summary
# (applied after stopword removal).
doc_len = 100_000_000   # earlier setting: 2000
abst_len = 1_000_000    # earlier setting: 500

# Fraction of the usable alignments reserved for the test split.
test_docs_per = 0.1

In [4]:
# Tag that names this dataset variant; it becomes part of the output folder.
dataset_characteristics = "whole_text"

# Output folders for the raw train and test documents (trailing slash
# included, because file names are appended by plain string concatenation).
train_path, test_path = (
    f"data/booksum/data_{dataset_characteristics}/raw/{split}/"
    for split in ("train", "test")
)

## Utility functions

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus that remove_stopwords() reads from.
nltk.download('stopwords')

def _english_stop_words():
    """Return the English stopword set, loading it from NLTK only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def remove_stopwords(word_list):
    """Return the words of ``word_list`` that are not English stopwords.

    The comparison is case-insensitive (each word is lower-cased before the
    lookup); the surviving words keep their original casing and order.

    Args:
        word_list: iterable of word strings.

    Returns:
        A new list with the stopwords filtered out.
    """
    # The stopword set is cached because this function is called once per
    # file read; rebuilding the set from the corpus every time is wasted work.
    stop_words = _english_stop_words()
    return [word for word in word_list if word.lower() not in stop_words]

In [6]:
def read_file(path, seq_length):
    """Load a UTF-8 text file and return its first ``seq_length`` content words.

    The text is split on single spaces only (newlines stay inside the
    resulting tokens), English stopwords are dropped, and the remaining
    tokens are cut to at most ``seq_length`` before being re-joined with
    spaces.

    Args:
        path: location of the text file.
        seq_length: maximum number of tokens to keep.

    Returns:
        The filtered, truncated text as a single string.
    """
    with open(path, encoding="utf8") as handle:
        raw_text = handle.read()

    tokens = remove_stopwords(raw_text.split(" "))
    # Slicing already returns the whole list when it is shorter than the limit.
    return ' '.join(tokens[:seq_length])

def read_json_file(path, seq_length):
    """Load a JSON file and return the first ``seq_length`` content words of its summary.

    The file must contain a JSON object with a ``"summary"`` string. That
    string is split on single spaces, English stopwords are dropped, and the
    remaining tokens are cut to at most ``seq_length`` before being re-joined
    with spaces.

    Args:
        path: location of the JSON file.
        seq_length: maximum number of tokens to keep.

    Returns:
        The filtered, truncated summary as a single string.
    """
    with open(path, encoding="utf8") as handle:
        record = json.load(handle)

    tokens = remove_stopwords(record["summary"].split(" "))
    # Slicing already returns the whole list when it is shorter than the limit.
    return ' '.join(tokens[:seq_length])
    
def write_file(path, data):
    """Write ``data`` to ``path`` as UTF-8 text, creating parent folders as needed.

    An existing file at ``path`` is overwritten.

    Args:
        path: destination file path.
        data: string to write.
    """
    parent_dir = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create folders when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, "w", encoding="utf8") as f:
        f.write(data)

## Dataset construction containing raw data
The BookSum Data is split into train and test sets.

Each Dataset is constructed as follows:
1. Each document is written to a separate file.
2. Stopwords are removed from the text and the abstract, which are then truncated to the desired lengths.
3. Each document is constructed by concatenating the text and abstract of a chapter using special separators ([CLS], [SEP]).
4. Negative examples are produced by picking random abstracts of other documents and creating a new negative entry as described above.
5. Each positive example is copied as many times as needed in order to balance the positive and negative examples.
6. The labels are written into a single document, where each line corresponds to the document with id equal to the line number. 

There are 12630 alignment pairs (chapter-summary), but we do not have summaries for all chapters due to limitations in scraping the provided links. We have gathered 4843 summary files, which cover 4145 of the alignment pairs.

### Get all summary paths for negative examples

In [7]:
# Root folder that holds the scraped summary files.
folder_a_path = 'scripts/finished_summaries'

# Every .txt file below the root, at any depth; this is the pool from which
# the negative examples are drawn.
all_file_names = []
for root, _dirs, files in os.walk(folder_a_path):
    all_file_names.extend(
        os.path.join(root, name) for name in files if name.endswith('.txt')
    )
len(all_file_names)

4843

In [8]:
# Keep only the alignments whose summary file is present on disk.
clear_alignments = []
summaries_not_available = 0
for alignment in alignments:
    # ':' is replaced by '_' to match the file names used on disk.
    summary_file = ("scripts/" + alignment["summary_path"]).replace(':', '_')
    # Only existence matters here, so check for the file instead of opening
    # and reading every summary in full.
    if os.path.isfile(summary_file):
        clear_alignments.append(alignment)
    else:
        summaries_not_available += 1

print(len(alignments))
print(len(clear_alignments))    # some summaries refer to the same chapter
print(summaries_not_available)

12630
4145
8485


In [9]:
# Number of usable alignments (those whose summary file was found); the
# train/test boundary is computed from this value.
num_docs = len(clear_alignments)

### Train Dataset

In [10]:
# Build the training split: for every usable alignment write num_neg_ex
# (positive, negative) document pairs and collect their labels.
doc_count = 1       # 1-based id of the next document file; also its line in labels.txt
doc_labels = []
# Index of the first alignment that goes to the test split; everything
# before it is training data.
test_start = int(num_docs * (1 - test_docs_per)) + 1

for alignment in clear_alignments[:test_start]:
    # ':' is an invalid character for filename, so the files on disk use '_'
    chapter_file = alignment["chapter_path"].replace(':', '_')
    # Forward slash, as in the other cells: a hard-coded backslash prefix
    # only resolves on Windows.
    summary_file = ("scripts/" + alignment["summary_path"]).replace(':', '_')

    text = read_file(chapter_file, doc_len)
    try:
        abst = read_json_file(summary_file, abst_len)
    except FileNotFoundError:
        continue

    doc_data = "[CLS] " + text + " [SEP] " + abst + " [SEP]"

    # The candidate paths come from os.walk/os.path.join and may use a
    # different separator than summary_file; compare normalised paths so the
    # chapter's own summary is reliably excluded from the negatives.
    own_summary = os.path.normpath(summary_file)

    for _ in range(num_neg_ex):
        # Write positive example (repeated so positives and negatives stay balanced)
        write_file(train_path + f"data_{doc_count}.txt", doc_data)
        doc_labels.append(pos_label)
        doc_count += 1

        # Write negative example: same chapter text, summary drawn at random
        random_element = random.choice(all_file_names)
        while os.path.normpath(random_element) == own_summary:
            random_element = random.choice(all_file_names)

        neg_ex_abst = read_json_file(random_element, abst_len)
        neg_ex_data = "[CLS] " + text + " [SEP] " + neg_ex_abst + " [SEP]"
        write_file(train_path + f"data_{doc_count}.txt", neg_ex_data)
        doc_labels.append(neg_label)
        doc_count += 1

# One label per line; line i belongs to data_{i}.txt
write_file(train_path + "labels.txt", "\n".join(doc_labels))

### Test Dataset

In [11]:
# Build the test split: for every remaining usable alignment write
# num_neg_ex (positive, negative) document pairs and collect their labels.
doc_count = 1       # 1-based id of the next document file; also its line in labels.txt
doc_labels = []

for alignment in clear_alignments[test_start:]:
    # ':' is an invalid character for filename, so the files on disk use '_'
    chapter_file = alignment["chapter_path"].replace(':', '_')
    summary_file = ("scripts/" + alignment["summary_path"]).replace(':', '_')

    text = read_file(chapter_file, doc_len)
    try:
        abst = read_json_file(summary_file, abst_len)
    except FileNotFoundError:
        continue

    doc_data = "[CLS] " + text + " [SEP] " + abst + " [SEP]"

    # The candidate paths come from os.walk/os.path.join and may use a
    # different separator than summary_file; compare normalised paths so the
    # chapter's own summary is reliably excluded from the negatives.
    own_summary = os.path.normpath(summary_file)

    for _ in range(num_neg_ex):
        # Write positive example (repeated so positives and negatives stay balanced)
        write_file(test_path + f"data_{doc_count}.txt", doc_data)
        doc_labels.append(pos_label)
        doc_count += 1

        # Write negative example: same chapter text, summary drawn at random
        random_element = random.choice(all_file_names)
        while os.path.normpath(random_element) == own_summary:
            random_element = random.choice(all_file_names)

        neg_ex_abst = read_json_file(random_element, abst_len)
        neg_ex_data = "[CLS] " + text + " [SEP] " + neg_ex_abst + " [SEP]"
        write_file(test_path + f"data_{doc_count}.txt", neg_ex_data)
        doc_labels.append(neg_label)
        doc_count += 1

# One label per line; line i belongs to data_{i}.txt
write_file(test_path + "labels.txt", "\n".join(doc_labels))