In [1]:
# Load Dataset
import os
import sys
sys.path.append(os.path.abspath('../'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json


In [2]:
# Load Dataset
with open('/content/NoveltyDetectionResearch/.data/dlnd/TAP-DLND-1.0_LREC2018_modified/dlnd.jsonl','r') as f:
    data = f.readlines()
    dataset = [json.loads(line) for line in data]

In [3]:
from sklearn.model_selection import train_test_split


texts=[(i["source"],i["target_text"]) for i in dataset]
labels=[1 if i["DLA"]=='Novel' else 0 for i in dataset]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=.2)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)


In [4]:
from transformers import DistilBertTokenizerFast,BertTokenizerFast
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [5]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [6]:
import torch

class DLNDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
train_dataset = DLNDDataset(train_encodings, train_labels)
val_dataset = DLNDDataset(val_encodings, val_labels)
test_dataset = DLNDDataset(test_encodings, test_labels)

In [20]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertForSequenceClassification

import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")


from sklearn.metrics import precision_recall_fscore_support,accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=6,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # number of steps between logging
    evaluation_strategy="steps",     # evaluation strategy
    eval_steps=100,                  # number of steps between evaluations


)

# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # function to compute metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.7025,0.635101,0.695402,0.691502,0.662946,0.722628
200,0.584,0.467809,0.786207,0.789116,0.738854,0.846715
300,0.5917,0.512995,0.728736,0.616883,0.926829,0.462287
400,0.4716,0.440034,0.789655,0.795987,0.734568,0.868613
500,0.4587,0.402007,0.833333,0.823815,0.822816,0.824818
600,0.4612,0.4052,0.801149,0.756681,0.896667,0.654501
700,0.3983,0.439639,0.825287,0.829978,0.768116,0.902676
800,0.3932,0.618965,0.752874,0.659271,0.945455,0.506083
900,0.4324,0.587597,0.797701,0.8159,0.715596,0.948905
1000,0.3044,0.554726,0.855172,0.849282,0.835294,0.863747


***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
***** Running Evaluation *****
  Num examples = 870
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model we

TrainOutput(global_step=2610, training_loss=0.27964266988081266, metrics={'train_runtime': 1651.2596, 'train_samples_per_second': 12.638, 'train_steps_per_second': 1.581, 'total_flos': 5490601503252480.0, 'train_loss': 0.27964266988081266, 'epoch': 6.0})

In [21]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 870
  Batch size = 16


{'eval_loss': 0.7250908017158508,
 'eval_accuracy': 0.8827586206896552,
 'eval_f1': 0.8718592964824121,
 'eval_precision': 0.9012987012987013,
 'eval_recall': 0.8442822384428224,
 'eval_runtime': 15.7752,
 'eval_samples_per_second': 55.15,
 'eval_steps_per_second': 3.486,
 'epoch': 6.0}

In [22]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 1087
  Batch size = 16


{'eval_loss': 0.7534046173095703,
 'eval_accuracy': 0.8767249310027599,
 'eval_f1': 0.867063492063492,
 'eval_precision': 0.9104166666666667,
 'eval_recall': 0.8276515151515151,
 'eval_runtime': 19.6975,
 'eval_samples_per_second': 55.185,
 'eval_steps_per_second': 3.452,
 'epoch': 6.0}