#### Running Imports

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizerFast
from packaging.version import Version
from sklearn.model_selection import train_test_split
import io, sys, os, datetime
import numpy as np
import re

#### Data Cleaning Functions and Configs

In [2]:

# CLEANING CONFIG
DELETE_SECTIONS = [
    "References",
    "External links",
    "See also",
    "Further reading",
    "Notes",
    "Bibliography",
    "Sources",
]
CLEANING_LEVEL = 1
"""
    CLEANING LEVELS:
        0: No cleaning
        1: Clean All Headings
        2: Delete All Headings
        3: Delete Sections only
        4: Delete Selected Sections and Clean Headings
        5: Delete Selected Sections and Delete All Headings
    RECOMMENDED: 
        0: No Cleaning
        1: Light Cleaning
        4: Heavy Cleaning
"""
# END CONFIG

## UTILITY FUNCTIONS TO MAKE CONFIG USABLE


def _heading_clean(x):
    """
    internal function to clean headings and make them lowercase so that comparisons can be performed
    """
    try:
        return x.replace(" ", "").lower()
    except Exception as e:
        print("ERROR at clean_heading function:", e)
        return x


DELETE_SECTIONS = list(map(_heading_clean, DELETE_SECTIONS))

## END UTILITY FUNCTIONS

## CLEANER FUNCTIONS
RE_HEADINGS = re.compile(r"==.*?==+", re.MULTILINE)


def clean_headings(x, DEL_HEADINGS=False):
    """
    Function to remove unwanted characters from headings
    Configurable to remove headings or not via DELETE_HEADINGS.
    Configurable to clean headings or not via CLEAN_HEADINGS.
    Warning: This function will remove all headings from the text. Please run only after deleting unwanted sections.
    """
    if DEL_HEADINGS:
        return RE_HEADINGS.sub("", x)
    else:
        return (
            x.replace("==== ", "")
            .replace("=== ", "")
            .replace("== ", "")
            .replace(" ====", "")
            .replace(" ===", "")
            .replace(" ==", "")
        )


def remove_sections(x):
    """
    Function to remove unwanted sections from the text
    Configurable via DELETE_SECTIONS
    """
    r = RE_HEADINGS.finditer(x)
    sections = [(m.start(0), m.end(0)) for m in r]
    s = []
    for i, sec in enumerate(sections):
        secname = x[sec[0] : sec[1]].replace("=", "").replace(" ", "").lower()
        if secname in DELETE_SECTIONS:
            sb = sec[0]
            try:
                se = sections[i + 1][0]
            except IndexError:
                se = len(x)
            s.append(x[sb:se])
    for sec in s:
        x = x.replace(sec, "")
    return x


def clean(x):
    """
    Function to clean the text
    CLEANING LEVELS:
        0: No cleaning
        1: Clean All Headings
        2: Delete All Headings
        3: Delete Sections only
        4: Delete Selected Sections and Clean Headings
        5: Delete Selected Sections and Delete All Headings
    RECOMMENDED:
        0: No Cleaning
        1: Light Cleaning
        4: Heavy Cleaning
    """
    if CLEANING_LEVEL == 0:
        return x
    elif CLEANING_LEVEL == 1:
        return clean_headings(x)
    elif CLEANING_LEVEL == 2:
        return clean_headings(x, DEL_HEADINGS=True)
    elif CLEANING_LEVEL == 3:
        return remove_sections(x)
    elif CLEANING_LEVEL == 4:
        return clean_headings(remove_sections(x))
    elif CLEANING_LEVEL == 5:
        return clean_headings(remove_sections(x), DEL_HEADINGS=True)
    else:
        raise Exception("Invalid CLEANING_LEVEL configured. Please check the config.")

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    devicename = "[" + torch.cuda.get_device_name(0) + "]"
else:
    device = torch.device("cpu")
    devicename = ""

print("Using PyTorch version:", torch.__version__, "Device:", device, devicename)
assert Version(torch.__version__) >= Version("1.0.0"), "Please install PyTorch version >= 1.0.0"

Using PyTorch version: 1.10.1+cu113 Device: cuda [NVIDIA GeForce GTX 1650]


In [4]:

if "DATADIR" in os.environ:
    DATADIR = os.environ["DATADIR"]
else:
    DATADIR = os.getcwd()

TEXT_DATA_DIR = os.path.join(DATADIR, "TEST")

print("Processing text dataset")

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        print(labels_index)
        for fname in sorted(os.listdir(path)):
                fpath = os.path.join(path, fname)
                args = {} if sys.version_info < (3,) else {"encoding": "latin-1"}
                with open(fpath, **args) as f:
                    t = f.read()
                    t = clean(t)
                    texts.append(t)
                labels.append(label_id)

print("Found %s texts." % len(texts))

Processing text dataset
{'B': 0}
{'B': 0, 'C': 1}
{'B': 0, 'C': 1, 'FA': 2}
{'B': 0, 'C': 1, 'FA': 2, 'GA': 3}
{'B': 0, 'C': 1, 'FA': 2, 'GA': 3, 'Start': 4}
{'B': 0, 'C': 1, 'FA': 2, 'GA': 3, 'Start': 4, 'Stub': 5}
Found 3244 texts.


In [5]:
print(len(labels))
print(labels_index)

3244
{'B': 0, 'C': 1, 'FA': 2, 'GA': 3, 'Start': 4, 'Stub': 5}


In [6]:

TEST_SET = int(len(texts)*0.10)   # 10% of the data for testing

(sentences_train, sentences_test,
 labels_train, labels_test) = train_test_split(texts, labels, test_size=TEST_SET, shuffle=True, random_state=42)

print('Length of training texts:', len(sentences_train))
print('Length of training labels:', len(labels_train))
print('Length of test texts:', len(sentences_test))
print('Length of test labels:', len(labels_test))


Length of training texts: 2920
Length of training labels: 2920
Length of test texts: 324
Length of test labels: 324


In [7]:

sentences_train = ["[CLS] " + s for s in sentences_train]
sentences_test = ["[CLS] " + s for s in sentences_test]

print ("The first training sentence:")
print(sentences_train[0], 'LABEL:', labels_train[0])


The first training sentence:
[CLS] Patrizia Spuri (born 18 February 1973 in Fara in Sabina) is an Italian former sprinter (400 m) and middle distance runner (800 m).
In her career she won 9 times the national championships. She's the wife of the triple jumper Fabrizio Donato.


National records
4x400 metres relay: 3'26"69 ( Paris, 20 June 1999) - with Virna De Angeli, Francesca Carbone, Danielle Perpoli
4x400 metres relay indoor: 3'35"01 ( Ghent, 27 February 2000) - with Virna De Angeli, Francesca Carbone, Carla Barbarino


Achievements


National titles
4 wins in 400 metres at the Italian Athletics Championships (1994, 1996, 1997, 1998)
1 win in 800 metres at the Italian Athletics Championships (1999)
2 wins in 400 metres at the Italian Athletics Indoor Championships (1994, 1998)
1 win in 800 metres at the Italian Athletics Indoor Championships (2000)


See also
Italian all-time top lists - 400 metres
Italian all-time top lists - 800 metres


References


External links
Patrizia Spuri

In [8]:
print('Initializing RoBERTaTokenizer')

ROBERTA_MODEL = 'roberta-base'
CACHE_DIR=os.path.join(DATADIR, 'transformers-cache')

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", cache_dir=CACHE_DIR, do_lower_case=True,max_length=512)


Initializing BertTokenizer


Downloading: 100%|██████████| 878k/878k [00:01<00:00, 778kB/s] 
Downloading: 100%|██████████| 446k/446k [00:01<00:00, 400kB/s]  
Downloading: 100%|██████████| 1.29M/1.29M [00:01<00:00, 815kB/s] 
Downloading: 100%|██████████| 481/481 [00:00<00:00, 240kB/s]


In [9]:
tokenized_train = [tokenizer.tokenize(s) for s in sentences_train]
tokenized_test  = [tokenizer.tokenize(s) for s in sentences_test]

print ("The full tokenized first training sentence:")
print (tokenized_train[0])


Token indices sequence length is longer than the specified maximum sequence length for this model (6894 > 512). Running this sequence through the model will result in indexing errors


The full tokenized first training sentence:
['[', 'CL', 'S', ']', 'ĠPatri', 'z', 'ia', 'ĠSp', 'uri', 'Ġ(', 'born', 'Ġ18', 'ĠFebruary', 'Ġ1973', 'Ġin', 'ĠF', 'ara', 'Ġin', 'ĠSab', 'ina', ')', 'Ġis', 'Ġan', 'ĠItalian', 'Ġformer', 'Ġspr', 'inter', 'Ġ(', '400', 'Ġm', ')', 'Ġand', 'Ġmiddle', 'Ġdistance', 'Ġrunner', 'Ġ(', '800', 'Ġm', ').', 'Ċ', 'In', 'Ġher', 'Ġcareer', 'Ġshe', 'Ġwon', 'Ġ9', 'Ġtimes', 'Ġthe', 'Ġnational', 'Ġchampionships', '.', 'ĠShe', "'s", 'Ġthe', 'Ġwife', 'Ġof', 'Ġthe', 'Ġtriple', 'Ġjumper', 'ĠFab', 'riz', 'io', 'ĠDon', 'ato', '.', 'ĊĊ', 'Ċ', 'National', 'Ġrecords', 'Ċ', '4', 'x', '400', 'Ġmetres', 'Ġrelay', ':', 'Ġ3', "'", '26', '"', '69', 'Ġ(', 'ĠParis', ',', 'Ġ20', 'ĠJune', 'Ġ1999', ')', 'Ġ-', 'Ġwith', 'ĠVir', 'na', 'ĠDe', 'ĠAngel', 'i', ',', 'ĠFrances', 'ca', 'ĠCar', 'bone', ',', 'ĠDanielle', 'ĠPer', 'p', 'oli', 'Ċ', '4', 'x', '400', 'Ġmetres', 'Ġrelay', 'Ġindoor', ':', 'Ġ3', "'", '35', '"', '01', 'Ġ(', 'ĠG', 'hent', ',', 'Ġ27', 'ĠFebruary', 'Ġ2000', ')', 'Ġ-', 'Ġwith

In [10]:
MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512

tokenized_train = [t[:(MAX_LEN_TRAIN-1)]+['SEP'] for t in tokenized_train]
tokenized_test  = [t[:(MAX_LEN_TEST-1)]+['SEP'] for t in tokenized_test]

print ("The truncated tokenized first training sentence:")
print (tokenized_train[0])

The truncated tokenized first training sentence:
['[', 'CL', 'S', ']', 'ĠPatri', 'z', 'ia', 'ĠSp', 'uri', 'Ġ(', 'born', 'Ġ18', 'ĠFebruary', 'Ġ1973', 'Ġin', 'ĠF', 'ara', 'Ġin', 'ĠSab', 'ina', ')', 'Ġis', 'Ġan', 'ĠItalian', 'Ġformer', 'Ġspr', 'inter', 'Ġ(', '400', 'Ġm', ')', 'Ġand', 'Ġmiddle', 'Ġdistance', 'Ġrunner', 'Ġ(', '800', 'Ġm', ').', 'Ċ', 'In', 'Ġher', 'Ġcareer', 'Ġshe', 'Ġwon', 'Ġ9', 'Ġtimes', 'Ġthe', 'Ġnational', 'Ġchampionships', '.', 'ĠShe', "'s", 'Ġthe', 'Ġwife', 'Ġof', 'Ġthe', 'Ġtriple', 'Ġjumper', 'ĠFab', 'riz', 'io', 'ĠDon', 'ato', '.', 'ĊĊ', 'Ċ', 'National', 'Ġrecords', 'Ċ', '4', 'x', '400', 'Ġmetres', 'Ġrelay', ':', 'Ġ3', "'", '26', '"', '69', 'Ġ(', 'ĠParis', ',', 'Ġ20', 'ĠJune', 'Ġ1999', ')', 'Ġ-', 'Ġwith', 'ĠVir', 'na', 'ĠDe', 'ĠAngel', 'i', ',', 'ĠFrances', 'ca', 'ĠCar', 'bone', ',', 'ĠDanielle', 'ĠPer', 'p', 'oli', 'Ċ', '4', 'x', '400', 'Ġmetres', 'Ġrelay', 'Ġindoor', ':', 'Ġ3', "'", '35', '"', '01', 'Ġ(', 'ĠG', 'hent', ',', 'Ġ27', 'ĠFebruary', 'Ġ2000', ')', 'Ġ-', '

In [11]:
# Next we use the roBERTa tokenizer to convert each token into an integer
# index in the roBERTa vocabulary. We also pad any shorter sequences to
# `MAX_LEN_TRAIN` or `MAX_LEN_TEST` indices with trailing zeros.

ids_train = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_train]
ids_train = np.array([np.pad(i, (0, MAX_LEN_TRAIN-len(i)), mode='constant') for i in ids_train])

ids_test = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_test]
ids_test = np.array([np.pad(i, (0, MAX_LEN_TEST-len(i)), mode='constant') for i in ids_test])

print ("The indices of the first training sentence:")
print (ids_train[0])

The indices of the first training sentence:
[10975  7454   104   742 24835   329   493  2064  6151    36  5400   504
   902 14757    11   274  1742    11  6371  1243    43    16    41  3108
   320 11085  8007    36  4017   475    43     8  1692  4472  7449    36
  3913   475   322 50118  1121    69   756    79   351   361   498     5
   632  8226     4   264    18     5  1141     9     5  6436 16338  8659
 21645  1020  1599  3938     4 50140 50118 18285  2189 50118   306  1178
  4017  7472 12937    35   155   108  2481   113  4563    36  2201     6
   291   502  6193    43   111    19  9541  2133   926  6896   118     6
 11442  3245  1653 18026     6 15156  2595   642  6483 50118   306  1178
  4017  7472 12937 11894    35   155   108  2022   113  2663    36   272
 37754     6   974   902  3788    43   111     3]


In [12]:
# BERT also requires *attention masks*, with 1 for each real token in
# the sequences and 0 for the padding:

amasks_train, amasks_test = [], []

for seq in ids_train:
  seq_mask = [float(i>0) for i in seq]
  amasks_train.append(seq_mask)

for seq in ids_test:
  seq_mask = [float(i>0) for i in seq]
  amasks_test.append(seq_mask)

In [13]:
# We use again scikit-learn's train_test_split to use 10% of our
# training data as a validation set, and then convert all data into
# torch.tensors.

(train_inputs, validation_inputs,train_labels, validation_labels) = train_test_split(ids_train, labels_train, random_state=42, test_size=0.1)
(train_masks, validation_masks, _, _) = train_test_split(amasks_train, ids_train, random_state=42, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks  = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks  = torch.tensor(validation_masks)
test_inputs = torch.tensor(ids_test)
test_labels = torch.tensor(labels_test)
test_masks  = torch.tensor(amasks_test)

In [14]:
# Next we create PyTorch DataLoaders for all data sets.
#
# For fine-tuning roBERTa on a specific task, the authors recommend a
# batch size of 16 or 32.

BATCH_SIZE = 32

print('Train: ', end="")
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
print(len(train_data), 'messages')

print('Validation: ', end="")
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)
print(len(validation_data), 'messages')

print('Test: ', end="")
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler,
                             batch_size=BATCH_SIZE)
print(len(test_data), 'messages')

Train: 2628 messages
Validation: 292 messages
Test: 324 messages


#### References:

https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

https://blog.floydhub.com/tokenization-nlp/

https://jesusleal.io/2020/10/20/RoBERTA-Text-Classification/