In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("data/title_conference.csv")
df.head()

Unnamed: 0,Title,Conference
0,Innovation in Database Management: Computer Sc...,VLDB
1,High performance prime field multiplication fo...,ISCAS
2,enchanted scissors: a scissor interface for su...,SIGGRAPH
3,Detection of channel degradation attack by Int...,INFOCOM
4,Pinning a Complex Network through the Betweenn...,ISCAS


Заметим, что классы несбалансированы:

In [3]:
df.Conference.value_counts()

ISCAS       864
INFOCOM     515
VLDB        423
WWW         379
SIGGRAPH    326
Name: Conference, dtype: int64

## Encoding labels

In [4]:
possible_labels = df.Conference.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'VLDB': 0, 'ISCAS': 1, 'SIGGRAPH': 2, 'INFOCOM': 3, 'WWW': 4}

In [5]:
df["label"] = df.Conference.replace(label_dict)

## Train and Vallidation Split

In [6]:
from sklearn.model_selection import train_test_split  # noqa: 402

X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=42,
    stratify=df.label.values,
)

df.data_type = ["not_set"] * df.shape[0]

df.loc[X_train, "data_type"] = "train"
df.loc[X_val, "data_type"] = "val"

df.groupby(["Conference", "label", "data_type"]).count()

  df.data_type = ["not_set"] * df.shape[0]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Title
Conference,label,data_type,Unnamed: 3_level_1
INFOCOM,3,train,438
INFOCOM,3,val,77
ISCAS,1,train,734
ISCAS,1,val,130
SIGGRAPH,2,train,277
SIGGRAPH,2,val,49
VLDB,0,train,359
VLDB,0,val,64
WWW,4,train,322
WWW,4,val,57


## BertTokenizer

In [10]:
BERT_MODEL_TYPE = "bert-base-uncased"


def batch_encode_plus(data):
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_TYPE, do_lower_case=True)

    return tokenizer.batch_encode_plus(
        data.Title.values,
        # Sequences will be encoded with th especial tokens relative to their model.
        add_special_tokens=True,
        # Return attention mask according to specific tokenizer.
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,  # Limit just in case.
        return_tensors="pt",  # Return pytorch compatible tensors.
    )


train_data = df[df["data_type"] == "train"]
encoded_data_train = batch_encode_plus(train_data)

val_data = df[df["data_type"] == "val"]
encoded_data_val = batch_encode_plus(val_data)

input_ids_train = encoded_data_train["input_ids"]
attention_masks_train = encoded_data_train["attention_mask"]
labels_train = torch.tensor(train_data.label.values)

input_ids_val = encoded_data_val["input_ids"]
attention_masks_val = encoded_data_val["attention_mask"]
labels_val = torch.tensor(val_data.label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## BERT Pre-trained model

In [12]:
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL_TYPE,
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)


Downloading (…)"pytorch_model.bin";:   0%|                                                                                                                         | 0.00/440M [00:00<?, ?B/s][A
Downloading (…)"pytorch_model.bin";:   2%|██▋                                                                                                             | 10.5M/440M [00:04<03:02, 2.36MB/s][A
Downloading (…)"pytorch_model.bin";:   5%|█████▎                                                                                                          | 21.0M/440M [00:08<02:59, 2.34MB/s][A
Downloading (…)"pytorch_model.bin";:   7%|███████▉                                                                                                        | 31.5M/440M [00:13<02:56, 2.32MB/s][A
Downloading (…)"pytorch_model.bin";:  10%|██████████▋                                                                                                     | 41.9M/440M [00:18<02:56, 2.26MB/s][A
Downloading (…)"pytorch_model

Downloading (…)"pytorch_model.bin";: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 440M/440M [03:14<00:00, 2.27MB/s][A
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identica

## Data Loaders
Let's combine a dataset and a sampler to data loader that provides an iterable
over the given dataset.

In [14]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler  # noqa: 402

BATCH_SIZE = 3

dataloader_train = DataLoader(
    dataset_train, sampler=RandomSampler(dataset_train), batch_size=BATCH_SIZE
)

dataloader_val = DataLoader(
    dataset_val, sampler=RandomSampler(dataset_val), batch_size=BATCH_SIZE
)

## Optimizer & Scheduler
1. We should define parameters to optimize in iterable.
2. Then specify optmizer-specific options such as epochs, learning_rate...
3. Create a schedule with a learning rate that first inreases linearly from
0 to the initial learning rate set in the optimizer (a.k.a. warm up period) and then
decreases linearly from the initial learning rate to 0.

In [15]:
from transformers import AdamW, get_linear_schedule_with_warmup  # noqa: 402

LEARNING_RATE = 1e-5
EPSILON = 1e-8
EPOCHS = 5  # Depends on dataset.

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * EPOCHS
)



## Performance mentrics
We will use f1 and accuracy per class.

In [16]:
import numpy as np  # noqa: 402
from sklearn.metrics import f1_score  # noqa: 402


def get_f1_score(predictions, labels):
    predictions_flattened = np.argmax(predictions, axis=1).flatten()
    labels_flattened = labels.flatten()

    return f1_score(labels_flattened, predictions_flattened, average="weighted")


def accuracy_per_class(predictions, labels):
    # Inverse the dictionary.
    labels_lookup_table = {v: k for k, v in labels.items()}

    predictions_flattened = np.argmax(predictions, axis=1).flatten()
    labels_flattened = labels.flatten()

    for label in np.unique(labels_flattened):
        y_predicted = predictions[labels_flattened == label]
        y_true = labels_flattened[labels_flattened == label]

        print(f"Class: {labels_lookup_table[label]}")
        print(f"Accuracy: {len(y_predicted[y_predicted==label])}/{len(y_true)}")

### Training loop

In [17]:
import random  # noqa: 402

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cpu


In [18]:
def get_inputs_from_batch(batch):
    return {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "labels": batch[2],
    }


def evaluate(dataloader_val):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)

        inputs = get_inputs_from_batch(batch)

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs["labels"].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


for epoch in tqdm(range(1, EPOCHS + 1)):
    model.train()

    loss_train_total = 0

    progress_bar = tqdm(
        dataloader_train, desc="Epoch {:1d}".format(epoch), leave=False, disable=False
    )
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = get_inputs_from_batch(batch)

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix(
            {"training_loss": "{:.3f}".format(loss.item() / len(batch))}
        )

    torch.save(model.state_dict(), f"data_volume/finetuned_BERT_epoch_{epoch}.model")

    tqdm.write(f"\nEpoch {epoch}")

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f"Training loss: {loss_train_avg}")

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = get_f1_score(predictions, true_vals)
    tqdm.write(f"Validation loss: {val_loss}")
    tqdm.write(f"F1 Score (weighted): {val_f1}")

NameError: name 'EPOCH' is not defined