#### Running Imports

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer
from packaging.version import Version
from sklearn.model_selection import train_test_split
import io, sys, os, datetime
import numpy as np
import re

#### Data Cleaning Functions and Configs

In [None]:

# CLEANING CONFIG
# Titles of the sections that remove_sections() drops from every document.
# The list is normalised (spaces removed, lower-cased) by _heading_clean right
# after that helper is defined, so spelling here may use natural capitalisation.
DELETE_SECTIONS = [
    "References",
    "External links",
    "See also",
    "Further reading",
    "Notes",
    "Bibliography",
    "Sources",
]
# Selects the branch taken by clean(); the valid values are listed in the
# string literal below (which is a bare expression, not a docstring).
CLEANING_LEVEL = 1
"""
    CLEANING LEVELS:
        0: No cleaning
        1: Clean All Headings
        2: Delete All Headings
        3: Delete Sections only
        4: Delete Selected Sections and Clean Headings
        5: Delete Selected Sections and Delete All Headings
    RECOMMENDED: 
        0: No Cleaning
        1: Light Cleaning
        4: Heavy Cleaning
"""
# END CONFIG

## UTILITY FUNCTIONS TO MAKE CONFIG USABLE


def _heading_clean(x):
    """
    internal function to clean headings and make them lowercase so that comparisons can be performed
    """
    try:
        return x.replace(" ", "").lower()
    except Exception as e:
        print("ERROR at clean_heading function:", e)
        return x


# Normalise the configured titles once so remove_sections() can compare them
# directly against normalised heading text.
DELETE_SECTIONS = [_heading_clean(title) for title in DELETE_SECTIONS]

## END UTILITY FUNCTIONS

## CLEANER FUNCTIONS
# Matches wiki-style heading markup on a single line: an opening "==", the
# shortest possible run of characters, then a closing run of two or more "=".
RE_HEADINGS = re.compile(r"==.*?==+", flags=re.MULTILINE)


# Matches one complete heading line at any depth, e.g. "== Title ==" or
# "====== Title ======". Group 1 is the opening "=" run and is reused as a
# back-reference so the closing run must have the same length; group 2 is the
# title with surrounding blanks excluded.
_RE_HEADING_LINE = re.compile(
    r"^[ \t]*(={2,})[ \t]*(.*?)[ \t]*\1[ \t]*$", re.MULTILINE
)


def clean_headings(x, DEL_HEADINGS=False):
    """
    Strip wiki-style heading markup from the text.

    Parameters:
        x: the document text.
        DEL_HEADINGS: when True, every heading (markup and title) matched by
            RE_HEADINGS is removed; the line break after it is kept.
            When False (default), only the "=" markers and the blanks around
            the title are removed, so "=== History ===" becomes "History".

    Returns:
        The cleaned text.

    Notes:
        - Headings of any depth are handled. A fixed chain of replacements for
          2-4 "=" characters would leave a stray "=" on deeper headings.
        - In the default mode only whole heading lines are rewritten; "=="
          occurring inside ordinary text is left alone.
        - With DEL_HEADINGS=True the section titles are gone afterwards, so
          run remove_sections() first if unwanted sections must be dropped.
    """
    if DEL_HEADINGS:
        return RE_HEADINGS.sub("", x)
    # Keep just the title (group 2) of every heading line.
    return _RE_HEADING_LINE.sub(r"\2", x)


def remove_sections(x):
    """
    Remove unwanted sections from the text.

    A section is removed when its heading title, with "=" and spaces stripped
    and lower-cased, appears in DELETE_SECTIONS (which holds titles normalised
    the same way).

    A removed section spans from its heading up to the next heading of the
    same or a shallower depth (fewer or equal leading "="), or to the end of
    the text. Subsections nested inside a removed section are therefore
    removed together with it.

    The result is assembled from the kept slices of `x`, so identical text
    occurring elsewhere in the document is never affected and the document is
    traversed only once.

    Parameters:
        x: the document text.

    Returns:
        The text without the configured sections.
    """
    kept = []
    cursor = 0  # start of the text that has not been kept or dropped yet
    skip_level = None  # depth of the section currently being dropped, if any

    for match in RE_HEADINGS.finditer(x):
        heading = match.group(0)
        start = match.start(0)
        level = len(heading) - len(heading.lstrip("="))

        if skip_level is not None:
            if level > skip_level:
                # Deeper heading: a subsection of the section being dropped.
                continue
            # Same or shallower depth: the dropped section ends here.
            cursor = start
            skip_level = None

        secname = heading.replace("=", "").replace(" ", "").lower()
        if secname in DELETE_SECTIONS:
            kept.append(x[cursor:start])
            skip_level = level

    if skip_level is None:
        kept.append(x[cursor:])
    return "".join(kept)


def clean(x):
    """
    Clean one document according to the module-level CLEANING_LEVEL.

    CLEANING LEVELS:
        0: No cleaning
        1: Clean All Headings
        2: Delete All Headings
        3: Delete Sections only
        4: Delete Selected Sections and Clean Headings
        5: Delete Selected Sections and Delete All Headings
    RECOMMENDED:
        0: No Cleaning
        1: Light Cleaning
        4: Heavy Cleaning

    Raises an Exception when CLEANING_LEVEL is none of the values above.
    """
    level = CLEANING_LEVEL
    # Reject unknown levels before any cleaning work is done.
    if level not in (0, 1, 2, 3, 4, 5):
        raise Exception("Invalid CLEANING_LEVEL configured. Please check the config.")

    # Levels 3-5 drop the configured sections first.
    text = remove_sections(x) if level in (3, 4, 5) else x

    if level in (1, 4):
        return clean_headings(text)
    if level in (2, 5):
        return clean_headings(text, DEL_HEADINGS=True)
    # Levels 0 and 3 apply no heading treatment.
    return text

In [None]:
# Use CUDA device 0 when CUDA is available, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
# The bracketed GPU name is only shown when a GPU is actually used.
devicename = ("[" + torch.cuda.get_device_name(0) + "]") if cuda_available else ""

print("Using PyTorch version:", torch.__version__, "Device:", device, devicename)
assert Version(torch.__version__) >= Version("1.0.0"), "Please install PyTorch version >= 1.0.0"

In [None]:

# The data root comes from the DATADIR environment variable when it is set,
# otherwise from the current working directory.
DATADIR = os.environ["DATADIR"] if "DATADIR" in os.environ else os.getcwd()

TEXT_DATA_DIR = os.path.join(DATADIR, "TEST")

print("Processing text dataset")

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Keyword arguments for open(); identical for every file, so built once.
# NOTE(review): files are decoded as latin-1 - confirm this matches how the
# corpus was written (UTF-8 files would be mis-decoded without an error).
args = {} if sys.version_info < (3,) else {"encoding": "latin-1"}

# Each sub-directory of TEXT_DATA_DIR is one class; ids follow sorted order.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    label_id = len(labels_index)
    labels_index[name] = label_id
    print(labels_index)
    for fname in sorted(os.listdir(path)):
        fpath = os.path.join(path, fname)
        with open(fpath, **args) as f:
            t = clean(f.read())
            texts.append(t)
        labels.append(label_id)

print("Found %s texts." % len(texts))

In [None]:
# Sanity check: number of collected labels, then the label-name -> id mapping.
print(len(labels), labels_index, sep="\n")

In [None]:

# Absolute number of samples held out for testing (10% of the data).
TEST_SET = int(0.10 * len(texts))

sentences_train, sentences_test, labels_train, labels_test = train_test_split(
    texts,
    labels,
    test_size=TEST_SET,
    shuffle=True,
    random_state=42,
)

# Report the size of each split.
for caption, split in (
    ('Length of training texts:', sentences_train),
    ('Length of training labels:', labels_train),
    ('Length of test texts:', sentences_test),
    ('Length of test labels:', labels_test),
):
    print(caption, len(split))


In [None]:

# Prepend the "[CLS] " marker to every document of both splits.
sentences_train = ["[CLS] " + sentence for sentence in sentences_train]
sentences_test = ["[CLS] " + sentence for sentence in sentences_test]

print("The first training sentence:")
print(sentences_train[0], 'LABEL:', labels_train[0])


In [None]:
print('Initializing BertTokenizer')

# Name of the pretrained model and the directory used to cache its files.
BERTMODEL = 'bert-base-uncased'
CACHE_DIR = os.path.join(DATADIR, 'transformers-cache')

tokenizer = BertTokenizer.from_pretrained(
    BERTMODEL,
    do_lower_case=True,
    cache_dir=CACHE_DIR,
)

In [None]:
# Split every document into tokens with the tokenizer loaded above.
tokenized_train = list(map(tokenizer.tokenize, sentences_train))
tokenized_test = list(map(tokenizer.tokenize, sentences_test))

print("The full tokenized first training sentence:")
print(tokenized_train[0])


In [None]:
# Maximum sequence length (in tokens, including the end marker) per split.
MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512

# Truncate every sequence to MAX_LEN - 1 tokens and terminate it with BERT's
# separator token. The token must be spelled "[SEP]" (with brackets): a bare
# "SEP" is not a special token and would not be mapped to the separator id
# when the tokens are converted to vocabulary indices.
tokenized_train = [t[:(MAX_LEN_TRAIN-1)]+['[SEP]'] for t in tokenized_train]
tokenized_test  = [t[:(MAX_LEN_TEST-1)]+['[SEP]'] for t in tokenized_test]

print ("The truncated tokenized first training sentence:")
print (tokenized_train[0])

In [None]:
# Map every token to its integer index in the BERT vocabulary, then
# right-pad each sequence with zeros up to `MAX_LEN_TRAIN` /
# `MAX_LEN_TEST` so that each split forms one rectangular array.

ids_train = map(tokenizer.convert_tokens_to_ids, tokenized_train)
ids_train = np.array([
    np.pad(seq, (0, MAX_LEN_TRAIN - len(seq)), mode='constant')
    for seq in ids_train
])

ids_test = map(tokenizer.convert_tokens_to_ids, tokenized_test)
ids_test = np.array([
    np.pad(seq, (0, MAX_LEN_TEST - len(seq)), mode='constant')
    for seq in ids_test
])

print("The indices of the first training sentence:")
print(ids_train[0])

In [None]:
# Attention masks for BERT: 1.0 where the id is a real token (index > 0)
# and 0.0 where it is padding. Computed for the whole array at once and
# converted back to nested lists of Python floats.

amasks_train = (ids_train > 0).astype(float).tolist()
amasks_test = (ids_test > 0).astype(float).tolist()

In [None]:
# Carve 10% of the training data off as a validation set. Inputs, masks and
# labels go through one train_test_split call so all three are split with the
# same shuffled indices; everything is then converted to torch tensors.

(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    ids_train, amasks_train, labels_train, random_state=42, test_size=0.1)

# Training split
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

# Validation split
validation_inputs = torch.tensor(validation_inputs)
validation_masks = torch.tensor(validation_masks)
validation_labels = torch.tensor(validation_labels)

# Test split (held out earlier)
test_inputs = torch.tensor(ids_test)
test_masks = torch.tensor(amasks_test)
test_labels = torch.tensor(labels_test)

In [None]:
# Wrap each split in a TensorDataset and a DataLoader.
#
# For fine-tuning BERT on a specific task, the authors recommend a
# batch size of 16 or 32.

BATCH_SIZE = 32

# Training data is drawn in random order.
print('Train: ', end="")
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data, batch_size=BATCH_SIZE, sampler=train_sampler
)
print(len(train_data), 'messages')

# Validation data is read sequentially.
print('Validation: ', end="")
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data, batch_size=BATCH_SIZE, sampler=validation_sampler
)
print(len(validation_data), 'messages')

# Test data is read sequentially.
print('Test: ', end="")
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data, batch_size=BATCH_SIZE, sampler=test_sampler
)
print(len(test_data), 'messages')

References:

https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

https://blog.floydhub.com/tokenization-nlp/

https://pytorch.org/hub/huggingface_pytorch-transformers/


