In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import wandb
from datasets import load_metric

import numpy as np
import pandas as pd

import json
from tqdm.notebook import tqdm, trange
from pprint import pprint
import random
from collections import Counter, OrderedDict, defaultdict
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mricardmask[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
%env WANDB_PROJECT=transformers_devops

env: WANDB_PROJECT=transformers_devops


In [4]:
TARGET_IND2LABEL = {
    0: 'Computer Science',
    1: 'Economics',
    2: 'Electrical Engineering and Systems Science',
    3: 'Mathematics',
    4: 'Physics',
    5: 'Quantitative Biology',
    6: 'Quantitative Finance',
    7: 'Statistics',
}

In [5]:
data = pd.read_csv('data_base.csv')

In [6]:
X_train, X_val, y_train, y_val = train_test_split(data['text'].tolist(),
                                                  data.target.values,
                                                  test_size=0.17, stratify=data.target.values)

In [7]:
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertTokenizerFast
from transformers import Trainer, TrainingArguments

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [9]:
train_texts = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
val_texts = tokenizer(X_val, padding=True, truncation=True, return_tensors='pt')

In [10]:
class ArxivDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ArxivDataset(train_texts, y_train)
val_dataset = ArxivDataset(val_texts, y_val)

In [11]:
device = 'cuda:2'

In [12]:
class DistilBERTClassifier(nn.Module):
    def __init__(self, num_classes=8):
        super().__init__()
        self.encoder = DistilBertModel.from_pretrained("distilbert-base-cased")
        self.pre_classifier = nn.Linear(768, 768)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask, labels):
        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output[0]
        pooler = hidden_state[:, 0]
        pooler = self.dropout(self.gelu(self.pre_classifier(pooler)))
        preds = self.classifier(pooler)
        return preds

In [13]:
model = DistilBERTClassifier()
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    
    logits, labels = eval_pred
    length = min(len(labels), len(logits))
    logits = logits[:length]
    labels = labels[:length]
    preds = logits.argmax(axis=-1)
    accuracy = load_accuracy.compute(predictions=preds, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=preds, references=labels, average='macro')["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [15]:
train_args = TrainingArguments(
    output_dir = "./results_dbb",
    report_to = "wandb",
    run_name = 'base_distilbert_run_2',
    evaluation_strategy = "steps",
    eval_steps = 130,
    logging_dir = './logs_dbb',
    logging_steps = 130,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 16,
    weight_decay = 0.01,
    max_steps = 6500,
    warmup_steps = 500,
    do_train = True,
    do_eval = True,
)

In [16]:
class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(outputs, labels)
        return (loss, outputs) if return_outputs else loss

In [17]:
trainer = MyTrainer(
    model = model,
    args = train_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
)

In [18]:
trainer.train()
wandb.finish()

Step,Training Loss,Validation Loss,Accuracy,F1
130,1.3989,0.826184,0.57625,0.128489
260,0.786,0.709333,0.546331,0.138303
390,0.7206,0.673016,0.52163,0.140675
520,0.672,0.670369,0.49039,0.141845
650,0.6806,0.671164,0.48088,0.139944
780,0.6617,0.634803,0.503533,0.141313
910,0.6559,0.616959,0.523281,0.136932
1040,0.6426,0.616798,0.520111,0.140248
1170,0.6175,0.6341,0.485965,0.138582
1300,0.6232,0.632411,0.534971,0.135427


VBox(children=(Label(value='0.009 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.678194…

0,1
eval/accuracy,█▆▅▃▄▅▅▃▅▃▂▃▄▃▃▄▅▄▃▃▅▄▅▃▄▄▃▂▄▃▁▃▃▃▄▃▂▂▃▃
eval/f1,▁▆▇██▅▇▆▅▇▆▇▇▇▇▆▆██▇▄▇▅▆▅▅▆▇▆▇█▇▇▆▇▇█▇▇▇
eval/loss,█▅▄▄▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▁▂▁▁▁▂▂▂▂▂▂▂▂▂▂▂
eval/runtime,▃▃▄▃▃▄▃▄▄▄▄▂▃▃▃▄▄▄█▂▂▅▂▂▂▂▃▂▁▁▂▃▃▃▃▂▁▂▂▆
eval/samples_per_second,▆▆▅▆▆▅▆▅▅▅▅▇▆▅▆▅▅▅▁▇▇▄▇▇▆▇▆▇██▆▆▆▆▆▆█▆▇▃
eval/steps_per_second,▆▆▅▆▆▅▆▅▅▅▅▇▆▅▆▅▅▅▁▇▇▄▇▇▆▇▆▇██▆▆▆▆▆▇█▆▇▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,▃▅▆███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▄▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.49878
eval/f1,0.14072
eval/loss,0.60443
eval/runtime,106.7336
eval/samples_per_second,144.116
eval/steps_per_second,2.258
train/epoch,2.77
train/global_step,6500.0
train/learning_rate,0.0
train/loss,0.4115
