# SumPubMed Dataset Creation
The produced Dataset will have the following structure:
* Each document (text+abstract) entry is saved in a different file (e.g. data_{i}.txt)
* Each document has the following structure: [CLS] text [SEP] abstract [SEP]
* The labels of the documents are saved in a single file (e.g. labels.txt), where each line corresponds to the document with id equal to the line number.

### Used only for Google Colab
Run before everything

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')
# 
# import zipfile
# zip = zipfile.ZipFile('/content/drive/MyDrive/Colab Datasets/SumPubMed/sumpubmed.zip')
# zip.extractall('/tmp')
# zip.close()

### Used only for Google Colab
Run after everything

In [8]:
# %cd /tmp
# !zip -r data.zip data
# !cp /tmp/data.zip '/content/drive/MyDrive/Colab Datasets/SumPubMed/'
# %cd /content

In [9]:
import os
import random
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

## Dataset parameters

In [10]:
# --- Corpus size and sampling ---
num_docs = 32_689       # documents are numbered 1..num_docs in the input file names
num_neg_ex = 2          # negative examples generated per document
test_docs_per = 0.1     # fraction of the documents held out for the test split

# --- Class labels written to labels.txt ---
neg_label = "0"
pos_label = "1"

# --- Truncation limits, counted in space-separated words ---
abst_len = 64
doc_len = 512

# --- Input / output locations ---
data_path = "processed_data"
sumpubmed_path = "../sumpubmed"

## Utility functions

In [11]:
def read_file(path, seq_length):
    """Return the first ``seq_length`` space-separated words of a UTF-8 file.

    Args:
        path: Path of the text file to read.
        seq_length: Maximum number of words to keep.

    Returns:
        The kept words joined by single spaces. When the file holds at most
        ``seq_length`` words its content is returned unchanged.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(path, encoding="utf8") as f:
        data = f.read()
    # Split on single spaces only: newlines and other whitespace stay inside
    # the tokens, so the untruncated text round-trips exactly.
    words = data.split(" ")
    # Slicing already copes with lists shorter than seq_length.
    return " ".join(words[:seq_length])

def write_file(path, data):
    """Write ``data`` to ``path`` as UTF-8 text, creating parent directories.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: Text to write.
    """
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The context manager closes the file; no explicit close() is needed.
    with open(path, "w", encoding="utf8") as f:
        f.write(data)

## Dataset construction containing raw data
The SumPubMed data is split into train and test sets.

Each Dataset is constructed as follows:
1. Each document is written to a separate file.
2. The text and abstract are truncated in order to match the desired lengths.
3. Each document is constructed by concatenating the text and abstract of a paper using special separators ([CLS], [SEP]).
4. Negative examples are produced by picking random abstracts of other documents and creating a new negative entry as described above.
5. Each positive example is copied as many times as needed in order to balance the positive and negative examples.
6. The labels are written into a single document, where each line corresponds to the document with id equal to the line number.

### Train Dataset

In [12]:
# Build the raw training split from documents 1 .. test_start - 1.
doc_count = 1      # 1-based id of the next output file (data_{doc_count}.txt)
doc_labels = []    # entry k holds the label of data_{k + 1}.txt
# First document id of the test split; every id below it is training data.
test_start = int(num_docs * (1 - test_docs_per)) + 1
for i in tqdm(range(1, test_start)):
    text = read_file(f"{sumpubmed_path}/line_text/text_{i}.txt", doc_len)
    abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{i}.txt", abst_len)
    doc_data = f"[CLS] {text} [SEP] {abst} [SEP]"
    for _ in range(num_neg_ex):
        # Write positive example (re-written once per negative so both
        # classes end up with the same number of files)
        write_file(f"{data_path}/raw/train/data_{doc_count}.txt", doc_data)
        doc_labels.append(pos_label)
        doc_count += 1

        # Write negative example: same text paired with another document's
        # abstract. Draw uniformly from 1..num_docs without i in O(1) by
        # sampling from a range one element shorter and shifting every value
        # at or above i up by one (no per-draw candidate list needed).
        # NOTE(review): candidates span all documents, so training negatives
        # can use abstracts of test documents -- confirm this is intended.
        neg_ex_index = random.randrange(1, num_docs)
        if neg_ex_index >= i:
            neg_ex_index += 1
        neg_ex_abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{neg_ex_index}.txt", abst_len)
        neg_ex_data = f"[CLS] {text} [SEP] {neg_ex_abst} [SEP]"
        write_file(f"{data_path}/raw/train/data_{doc_count}.txt", neg_ex_data)
        doc_labels.append(neg_label)
        doc_count += 1

# One label per line; line n belongs to data_{n}.txt.
write_file(f"{data_path}/raw/train/labels.txt", "\n".join(doc_labels))

  0%|          | 0/29420 [00:00<?, ?it/s]

### Test Dataset

In [13]:
# Build the raw test split from documents test_start .. num_docs.
doc_count = 1      # 1-based id of the next output file (data_{doc_count}.txt)
doc_labels = []    # entry k holds the label of data_{k + 1}.txt
for i in tqdm(range(test_start, num_docs+1)):
    text = read_file(f"{sumpubmed_path}/line_text/text_{i}.txt", doc_len)
    abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{i}.txt", abst_len)
    doc_data = f"[CLS] {text} [SEP] {abst} [SEP]"
    for _ in range(num_neg_ex):
        # Write positive example (re-written once per negative so both
        # classes end up with the same number of files)
        write_file(f"{data_path}/raw/test/data_{doc_count}.txt", doc_data)
        doc_labels.append(pos_label)
        doc_count += 1

        # Write negative example: same text paired with another document's
        # abstract. Draw uniformly from 1..num_docs without i in O(1) by
        # sampling from a range one element shorter and shifting every value
        # at or above i up by one (no per-draw candidate list needed).
        # NOTE(review): candidates span all documents, so test negatives can
        # use abstracts of training documents -- confirm this is intended.
        neg_ex_index = random.randrange(1, num_docs)
        if neg_ex_index >= i:
            neg_ex_index += 1
        neg_ex_abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{neg_ex_index}.txt", abst_len)
        neg_ex_data = f"[CLS] {text} [SEP] {neg_ex_abst} [SEP]"
        write_file(f"{data_path}/raw/test/data_{doc_count}.txt", neg_ex_data)
        doc_labels.append(neg_label)
        doc_count += 1

# One label per line; line n belongs to data_{n}.txt.
write_file(f"{data_path}/raw/test/labels.txt", "\n".join(doc_labels))

  0%|          | 0/3269 [00:00<?, ?it/s]

## Preprocessed Dataset construction
Remove the Background section from each paper and keep only the Results and Conclusions sections. The Background section usually provides little insight into what the actual contribution of a paper is. Since the text length is limited, we have decided to remove this section.

In order to ensure that as much information as possible is contained in each data input, we have decided to strip the stop words from the texts.

The rest of the Dataset construction process is similar to the above.

In [14]:
def read_file(path, seq_length, sep):
    """Read a UTF-8 file, drop everything before ``sep`` and truncate it.

    The text is cut at the first occurrence of ``sep``, the literal section
    headers ``RESULTS`` and ``CONCLUSIONS`` are removed, and the first
    ``seq_length`` space-separated words of what remains are returned.

    Args:
        path: Path of the text file to read.
        seq_length: Maximum number of space-separated words to keep.
        sep: Marker from which the text is kept. When it does not occur in
            the file, the whole text is kept.

    Returns:
        The kept words joined by single spaces.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(path, encoding="utf8") as f:
        data = f.read()
    start = data.find(sep)
    # str.find returns -1 when sep is absent; slicing from -1 would keep only
    # the last character, so fall back to the beginning of the text.
    if start == -1:
        start = 0
    doc = data[start:].replace('RESULTS', '').replace('CONCLUSIONS', '')
    # Split on single spaces only; newlines stay inside the tokens.
    words = doc.split(" ")
    # Slicing already copes with lists shorter than seq_length.
    return ' '.join(words[:seq_length])

# Section marker passed to read_file, which keeps the text from the marker's
# first occurrence onward; its lower-cased form also names the output
# sub-directory under {data_path}/preprocessed/.
sep = "RESULTS"

### Train Dataset

In [15]:
# Build the preprocessed training split from documents 1 .. test_start - 1.
doc_count = 1      # 1-based id of the next output file (data_{doc_count}.txt)
doc_labels = []    # entry k holds the label of data_{k + 1}.txt
# First document id of the test split; every id below it is training data.
test_start = int(num_docs * (1 - test_docs_per)) + 1
for i in tqdm(range(1, test_start)):
    text = read_file(f"{sumpubmed_path}/line_text/text_{i}.txt", doc_len, sep)
    abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{i}.txt", abst_len, sep)
    doc_data = f"[CLS] {text} [SEP] {abst} [SEP]"
    for _ in range(num_neg_ex):
        # Write positive example (re-written once per negative so both
        # classes end up with the same number of files)
        write_file(f"{data_path}/preprocessed/{sep.lower()}/train/data_{doc_count}.txt", doc_data)
        doc_labels.append(pos_label)
        doc_count += 1

        # Write negative example: same text paired with another document's
        # abstract. Draw uniformly from 1..num_docs without i in O(1) by
        # sampling from a range one element shorter and shifting every value
        # at or above i up by one (no per-draw candidate list needed).
        # NOTE(review): candidates span all documents, so training negatives
        # can use abstracts of test documents -- confirm this is intended.
        neg_ex_index = random.randrange(1, num_docs)
        if neg_ex_index >= i:
            neg_ex_index += 1
        neg_ex_abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{neg_ex_index}.txt", abst_len, sep)
        neg_ex_data = f"[CLS] {text} [SEP] {neg_ex_abst} [SEP]"
        write_file(f"{data_path}/preprocessed/{sep.lower()}/train/data_{doc_count}.txt", neg_ex_data)
        doc_labels.append(neg_label)
        doc_count += 1

# One label per line; line n belongs to data_{n}.txt.
write_file(f"{data_path}/preprocessed/{sep.lower()}/train/labels.txt", "\n".join(doc_labels))

  0%|          | 0/29420 [00:00<?, ?it/s]

### Test Dataset

In [16]:
# Build the preprocessed test split from documents test_start .. num_docs.
doc_count = 1      # 1-based id of the next output file (data_{doc_count}.txt)
doc_labels = []    # entry k holds the label of data_{k + 1}.txt
for i in tqdm(range(test_start, num_docs+1)):
    text = read_file(f"{sumpubmed_path}/line_text/text_{i}.txt", doc_len, sep)
    abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{i}.txt", abst_len, sep)
    doc_data = f"[CLS] {text} [SEP] {abst} [SEP]"
    for _ in range(num_neg_ex):
        # Write positive example (re-written once per negative so both
        # classes end up with the same number of files)
        write_file(f"{data_path}/preprocessed/{sep.lower()}/test/data_{doc_count}.txt", doc_data)
        doc_labels.append(pos_label)
        doc_count += 1

        # Write negative example: same text paired with another document's
        # abstract. Draw uniformly from 1..num_docs without i in O(1) by
        # sampling from a range one element shorter and shifting every value
        # at or above i up by one (no per-draw candidate list needed).
        # NOTE(review): candidates span all documents, so test negatives can
        # use abstracts of training documents -- confirm this is intended.
        neg_ex_index = random.randrange(1, num_docs)
        if neg_ex_index >= i:
            neg_ex_index += 1
        neg_ex_abst = read_file(f"{sumpubmed_path}/shorter_abstract/abst_{neg_ex_index}.txt", abst_len, sep)
        neg_ex_data = f"[CLS] {text} [SEP] {neg_ex_abst} [SEP]"
        write_file(f"{data_path}/preprocessed/{sep.lower()}/test/data_{doc_count}.txt", neg_ex_data)
        doc_labels.append(neg_label)
        doc_count += 1

# One label per line; line n belongs to data_{n}.txt.
write_file(f"{data_path}/preprocessed/{sep.lower()}/test/labels.txt", "\n".join(doc_labels))

  0%|          | 0/3269 [00:00<?, ?it/s]