In [1]:
import copy
import datetime
import random
import time

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import LongformerTokenizer, LongformerForSequenceClassification

#from reddit_preprocessing import MAX_SEQ_LENGTH, AHOLE_CLASSES, load_dataset

In [2]:
import torch

# Choose the compute device once; the training cell moves every batch to `device`.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Use the GPU and report what was found.
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [3]:
# Tokenizer matching the pretrained checkpoint that is fine-tuned further down.
# NOTE(review): `do_lower_case` is an option known from BERT-style tokenizers;
# confirm whether it has any effect on this tokenizer class.
tokenizer = LongformerTokenizer.from_pretrained(
    "allenai/longformer-base-4096",
    do_lower_case=True,
)

In [4]:
import string
import re
import pandas as pd

# Matches http:// and https:// URLs; used by remove_urls().
URL_REGEX = '(http|https)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]'
# Matches any single character that is not an ASCII letter, a digit, a hyphen
# or a space; used by remove_non_alphanumeric().
NON_ALPHA_NUMERIC_REGEX = '[^a-zA-Z0-9- ]'
# Matches "@" followed by any run of non-whitespace characters; used by remove_tags().
TAG_REGEX = '@[^\\s]*'

# Number of target classes.
# NOTE(review): not referenced in the visible cells (the model uses
# len(AHOLE_CLASSES)); keep the two in sync or drop this one.
NUM_CLASSES = 2
# Class names; load_dataset() uses a name's position in this list as its integer label.
AHOLE_CLASSES = ['yta', 'nta']
# Length (in tokens) that every sequence is truncated/padded to in the encoding cell.
MAX_SEQ_LENGTH = 1000


def load_dataset(filepath: str, classes: list, tokenizer, rm_punct: bool = False):
    """Read one CSV per class and return the tokenized posts with integer labels.

    Args:
        filepath: Path template. ``filepath.format(class_label)`` is read for
            every class, so it is expected to contain a ``{}`` placeholder.
        classes: Class names; a name's position in this list is its label.
        tokenizer: Passed through to ``process_text``.
        rm_punct: Passed through to ``process_text``.

    Returns:
        ``(texts, labels)``: two parallel lists holding the token list of each
        kept post and the integer label of the class it was read for.
    """
    import warnings

    paths = [filepath.format(class_label) for class_label in classes]
    # Without a placeholder every class resolves to the same file, so each row
    # would be loaded once per class under a different label.
    if len(set(paths)) < len(set(classes)):
        warnings.warn(
            "load_dataset: filepath template %r resolves to the same file for "
            "several classes; every row will be loaded once per class with a "
            "different label. Add a '{}' placeholder for the class name." % filepath
        )

    label2index = {name: index for index, name in enumerate(classes)}

    texts, labels = [], []
    for class_label, path in zip(classes, paths):
        df = pd.read_csv(path, usecols=['body'])
        for text in df['body']:
            try:
                tokens = process_text(text, tokenizer, rm_punct)
            except TypeError:
                # Non-string cells (process_text rejects them) are skipped.
                continue
            # process_text always wraps the content in two marker tokens, so a
            # post only has real content when more than those two are present.
            if len(tokens) > 2:
                texts.append(tokens)
                labels.append(label2index[class_label])

    return texts, labels


def process_text(text, tokenizer, rm_punct: bool = False):
    """Clean one post and turn it into a token list wrapped in marker tokens.

    Steps: strip URLs and @-tags, drop non-ASCII characters, optionally strip
    punctuation, remove the verdict acronyms (YTA / NTA), tokenize, then add
    the start and end markers.

    Args:
        text: Raw post body.
        tokenizer: Object with a ``tokenize(str) -> list`` method. If it also
            has ``cls_token`` / ``sep_token`` attributes, those are used as
            the start and end markers.
        rm_punct: When true, every character in ``string.punctuation`` is
            removed before tokenizing.

    Returns:
        List of tokens: start marker, content tokens, end marker.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError('Text is not of type string')

    # Remove URLs and @-tags first so their fragments never reach the tokenizer.
    text = remove_urls(text)
    text = remove_tags(text)

    # Drop every non-ASCII character (this is also what removes emojis).
    text = text.encode('ascii', 'ignore').decode('ascii')
    if rm_punct:
        text = text.translate(str.maketrans('', '', string.punctuation))

    # Strip the verdict acronyms from the raw text, in any letter case.
    # Comparing whole tokens after tokenization is not enough: a sub-word
    # tokenizer may split the acronym or attach a prefix marker to it, and the
    # label would then leak into the model input.
    text = re.sub(r'\b(?:YTA|NTA)\b', ' ', text, flags=re.IGNORECASE)

    tokens = tokenizer.tokenize(text)
    # Kept for tokenizers that emit the acronym as a single, unmodified token.
    tokens = delete_label_word(tokens, 'YTA')
    tokens = delete_label_word(tokens, 'NTA')

    # Use the tokenizer's own markers so they exist in its vocabulary; fall
    # back to the BERT-style literals when the tokenizer does not define any.
    cls_token = getattr(tokenizer, 'cls_token', None) or '[CLS]'
    sep_token = getattr(tokenizer, 'sep_token', None) or '[SEP]'
    return [cls_token] + tokens + [sep_token]


def remove_non_alphanumeric(text: str) -> str:
    """Replace every character matched by NON_ALPHA_NUMERIC_REGEX with a space."""
    pattern = re.compile(NON_ALPHA_NUMERIC_REGEX)
    return pattern.sub(' ', text)


def remove_urls(text: str) -> str:
    """Replace every URL_REGEX match in ``text`` with a single space."""
    pattern = re.compile(URL_REGEX)
    return pattern.sub(' ', text)


def remove_tags(text: str) -> str:
    """Replace every TAG_REGEX match in ``text`` with a single space."""
    pattern = re.compile(TAG_REGEX)
    return pattern.sub(' ', text)


def delete_label_word(words: list, label: str) -> list:
    """Remove every occurrence of ``label`` from ``words`` in place.

    Args:
        words: List to filter; it is modified in place.
        label: Value whose occurrences are removed (exact equality).

    Returns:
        The same list object that was passed in, without ``label``.
    """
    # Slice assignment keeps the list's identity, so callers holding a
    # reference still see the change, while filtering in a single pass
    # instead of rescanning the list for every removal.
    words[:] = [word for word in words if word != label]
    return words


In [5]:
# Tokenize every post and collect its integer class label.
# NOTE(review): load_dataset() calls filepath.format(class_label) for each
# class, but this path has no '{}' placeholder, so the same CSV is read for
# every class and each row ends up under both labels. Confirm the intended
# per-class file layout.
texts, labels = load_dataset(
    '/kaggle/input/reddit/posts.csv',
    AHOLE_CLASSES,
    tokenizer,
    rm_punct=True,
)

In [6]:
# Convert each token list into fixed-length id and attention-mask tensors.
input_ids, attention_masks = [], []
for text in texts:
    # `text` is a token list whose last element is the end marker added by
    # process_text. The tokenizer's default truncation cuts from the right and
    # would drop that marker for long posts, so shorten the list here while
    # keeping its final token.
    if len(text) > MAX_SEQ_LENGTH:
        text = text[:MAX_SEQ_LENGTH - 1] + text[-1:]

    # The markers are already part of `text`, hence add_special_tokens=False.
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    encoded_dict = tokenizer.encode_plus(text,
                                         add_special_tokens=False,
                                         truncation=True,
                                         max_length=MAX_SEQ_LENGTH,
                                         padding='max_length',
                                         return_attention_mask=True,
                                         return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

In [7]:
# Join the per-post tensors along the first dimension (torch.cat's default
# dim is 0) and turn the label list into a tensor.
# NOTE(review): this rebinds the names from lists to tensors, so the cell
# cannot be re-run without re-running the encoding cell first.
input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_masks)
labels = torch.tensor(labels)

In [8]:
# Bundle ids, masks and labels so that one index yields one complete sample.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 75 % of the samples for training, the remainder for validation.
train_size = int(0.75 * len(dataset))
val_size = len(dataset) - train_size

# Give the split its own seeded generator: the global seeds are only set in
# the training cell, which runs after this one, so without it the
# train/validation membership would differ on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset,
                                          [train_size, val_size],
                                          generator=split_generator)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

batch_size = 8

train_dataloader = DataLoader(train_dataset,  # The training samples.
                              sampler=RandomSampler(train_dataset),  # Select batches randomly
                              batch_size=batch_size)

validation_dataloader = DataLoader(val_dataset,  # The validation samples.
                                   sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
                                   batch_size=batch_size)

In [9]:
# Pretrained Longformer with a classification head sized to the class list;
# attention maps and hidden states are not requested as extra outputs.
model = LongformerForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096",
    num_labels=len(AHOLE_CLASSES),
    output_attentions=False,
    output_hidden_states=False,
)

# Optimizer over all model parameters: learning rate 2e-5, epsilon 1e-8.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [10]:
# Number of full passes over the training set.
epochs = 3

# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Either a 1-D array of class indices (what the dataloaders in
            this notebook yield) or a 2-D one-hot array with one row per
            sample.

    Returns:
        Accuracy as a float between 0 and 1.
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    pred_flat = np.argmax(preds, axis=1).flatten()
    if labels.ndim > 1 and labels.shape[-1] > 1:
        # One-hot rows: the position of the maximum is the class index.
        labels_flat = np.argmax(labels, axis=1).flatten()
    else:
        # Plain class indices; indexing a second axis here would raise.
        labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Convert a duration in seconds into an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting.
    '''
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [12]:
# TRAINING
# Move the model to the device selected at the top of the notebook. The batches
# are sent to that same `device`, so this also works on CPU-only machines,
# where an unconditional model.cuda() would fail.
model.to(device)

seed_val = 42

# Seed every random number generator used from here on. Anything that already
# ran in earlier cells is not affected by these seeds.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict of statistics per epoch, appended by the training loop.
training_stats = []
# Wall-clock start of the whole run.
total_t0 = time.time()

# Best validation results so far: accuracy starts at the minimum, loss at the
# largest float32 value, so the first epoch can improve on both.
best_val_acc, best_val_loss = 0, torch.finfo(torch.float32).max

# Each epoch: one training pass over train_dataloader, then one evaluation pass
# over validation_dataloader; the per-epoch statistics go into training_stats.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the accumulated loss and accuracy for this epoch.
    total_train_loss = 0
    total_train_accuracy = 0

    # Put the model in training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 10 batches (skipping the very first one).
        if step % 10 == 0 and not step == 0:
            # Format the time elapsed since the start of this epoch.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to `device` using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        # Forward pass. Because labels are passed and return_dict=False, the
        # result is unpacked as a (loss, logits) pair.
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels,
                             return_dict=False)

        total_train_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of train sentences, and accumulate it over all batches.
        total_train_accuracy += flat_accuracy(logits, label_ids)

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Average the per-batch accuracy over all training batches.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Training accuracy: {0:.4f}".format(avg_train_accuracy))
    print("  Average training loss: {0:.4f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode -- the dropout layers behave differently during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    # NOTE(review): nb_eval_steps is never updated or read in the visible code.
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Unpack this validation batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to `device` using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate the loss and the logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks; it is
            # passed as None here.
            # NOTE(review): the link below documents the BERT classifier,
            # while `model` in this notebook is a Longformer classifier.
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # The "logits" are the output values prior to applying an
            # activation function like the softmax.
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels,
                                   return_dict=False)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of validation sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Average the per-batch accuracy over all validation batches.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    # Keep a copy of the model only when this epoch improves on BOTH the best
    # accuracy and the best loss seen so far (so we do not save an overfitted model).
    # NOTE(review): best_model / best_epoch stay undefined if this condition is
    # never met, and the deepcopy holds a second full copy of the model.
    if avg_val_accuracy > best_val_acc and avg_val_loss < best_val_loss:
        best_val_acc = avg_val_accuracy
        best_val_loss = avg_val_loss
        best_model = copy.deepcopy(model)
        best_epoch = epoch_i

    print("  Validation Acc.: {0:.4f}".format(avg_val_accuracy))
    print("  Validation Loss: {0:.4f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch (accuracies are stored as percentages).
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Acc.': avg_train_accuracy * 100,
            'Valid. Loss': avg_val_loss,
            'Valid. Acc.': avg_val_accuracy * 100,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

# Total wall-clock time since total_t0, covering training and validation of all epochs.
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

In [None]:
# Write the best checkpoint kept by the training loop to the output directory.
best_model.save_pretrained('/kaggle/working/')

# best_epoch is zero-based; report it one-based like the training log does.
saved_epoch = best_epoch + 1
print('Best fit model saved at epoch %d' % saved_epoch)

In [None]:
import pandas as pd

# Display floats with four decimal places. The fully qualified option name is
# used because newer pandas releases reject the bare 'precision' pattern as
# ambiguous.
pd.set_option('display.precision', 4)

# Create a DataFrame from our training statistics and use the 'epoch' column
# as the row index.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table.
df_stats

In [None]:
import matplotlib.pyplot as plt

import seaborn as sns

# Seaborn styling: dark grid with fonts scaled up by 1.5.
sns.set(style='darkgrid', font_scale=1.5)
# Default figure size for the plots below.
plt.rcParams["figure.figsize"] = (12,6)

# Learning curves: one line per recorded loss column, indexed by epoch.
for column, line_format, legend_label in (
        ('Training Loss', 'b-o', "Training Loss"),
        ('Valid. Loss', 'g-o', "Validation Loss"),
):
    plt.plot(df_stats[column], line_format, label=legend_label)

# Titles, axis labels, legend and one tick per epoch.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks(list(range(1, epochs + 1)))

plt.show()