In [2]:
# Load Dataset
import os
import sys
sys.path.append(os.path.abspath('../'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json


In [3]:
# Load Dataset
with open('/content/NoveltyDetectionResearch/.data/dlnd/TAP-DLND-1.0_LREC2018_modified/dlnd.jsonl','r') as f:
    data = f.readlines()
    dataset = [json.loads(line) for line in data]

In [4]:
from sklearn.model_selection import train_test_split

texts=[(i["source"],i["target_text"]) for i in dataset]
labels=[1 if i["DLA"]=='Novel' else 0 for i in dataset]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=.2)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


In [5]:
from transformers import DistilBertTokenizerFast,BertTokenizerFast,LongformerTokenizerFast, RobertaTokenizerFast
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [6]:
train_encodings = tokenizer(train_texts, truncation=False, padding=True)
val_encodings = tokenizer(val_texts, truncation=False, padding=True)
test_encodings = tokenizer(test_texts, truncation=False, padding=True)

In [15]:

from spacy.lang.en import English # updated
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated

def split_sentences(text):
    doc = nlp(text.strip().replace("\n",' . '))
    sentences = list(filter(lambda x: x!=".",[sent.string.strip() for sent in doc.sents]))
    return sentences

In [16]:
def process_text(texts):
    texts_new = []
    for i in train_texts:
        new_i = (tokenizer(split_sentences(i[0]),truncation=True, padding=True),tokenizer(split_sentences(i[1]),truncation=True, padding=True))
        texts_new.append(new_i)
    return texts_new


In [17]:

train_encodings = process_text(train_texts)
val_encodings = process_text(val_texts)
test_encodings = process_text(test_texts)

In [21]:
import torch

class DLNDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {}
        item["source"] = {key: torch.tensor(val[idx]) for key, val in self.encodings[idx][0].items()}
        item["target"] = {key: torch.tensor(val[idx]) for key, val in self.encodings[idx][1].items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [22]:
train_dataset = DLNDDataset(train_encodings, train_labels)
val_dataset = DLNDDataset(val_encodings, val_labels)
test_dataset = DLNDDataset(test_encodings, test_labels)

In [8]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertForSequenceClassification
from transformers import LongformerForSequenceClassification
from transformers import RobertaForSequenceClassification

import numpy as np

from sklearn.metrics import precision_recall_fscore_support,accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1_novel': f1[0],
        'P_novel': precision[0],
        'R_novel': recall[0],
        'f1_non_novel': f1[1],
        'P_non_novel': precision[1],
        'R_non_novel': recall[1],
    }

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=15,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # number of steps between logging
    evaluation_strategy="steps",     # evaluation strategy
    eval_steps=200,                  # number of steps between evaluations
    gradient_accumulation_steps=2,   # number of steps for gradient accumulation
)

# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
# model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")
model = RobertaForSequenceClassification.from_pretrained("roberta-base")


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # function to compute metrics
)

trainer.train()

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Step,Training Loss,Validation Loss


In [9]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 870
  Batch size = 16


{'eval_loss': 0.4461168348789215,
 'eval_accuracy': 0.9057471264367816,
 'eval_f1_novel': 0.9072398190045249,
 'eval_P_novel': 0.8950892857142857,
 'eval_R_novel': 0.9197247706422018,
 'eval_f1_non_novel': 0.9042056074766356,
 'eval_P_non_novel': 0.9170616113744076,
 'eval_R_non_novel': 0.8917050691244239,
 'eval_runtime': 15.8479,
 'eval_samples_per_second': 54.897,
 'eval_steps_per_second': 3.47,
 'epoch': 6.0}

In [10]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 1087
  Batch size = 16


{'eval_loss': 0.594714879989624,
 'eval_accuracy': 0.8647654093836247,
 'eval_f1_novel': 0.8771929824561403,
 'eval_P_novel': 0.8578431372549019,
 'eval_R_novel': 0.8974358974358975,
 'eval_f1_non_novel': 0.8495394063459569,
 'eval_P_non_novel': 0.8736842105263158,
 'eval_R_non_novel': 0.8266932270916335,
 'eval_runtime': 19.7652,
 'eval_samples_per_second': 54.996,
 'eval_steps_per_second': 3.44,
 'epoch': 6.0}