In [None]:
!pip install -U -q torchmetrics transformers wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.7/251.7 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import torch
from torch.utils.data import (TensorDataset,
                              Dataset,
                              DataLoader,
                              RandomSampler,
                              SequentialSampler)
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup, set_seed
import torchmetrics
from sklearn.metrics import classification_report
from torch.optim import AdamW
import matplotlib.pyplot as plt
import wandb

In [None]:
# Load the dataset from CSV into a DataFrame.
# NOTE(review): no drive-mount cell is visible above; this absolute path presumably
# requires Google Drive to be mounted at /content/drive first — confirm.
df = pd.read_csv('/content/drive/MyDrive/000_DOSSIER/loop/data/cleaned.csv')

In [None]:
# Distinct values of the 'Группа тем' column; these are the target classes.
LABELS = df['Группа тем'].unique().tolist()
# Bare last expression: displays the number of classes as the cell output.
len(LABELS)

26

In [None]:
# Lookup tables between class names and integer class ids.
# The id of a label is its position in LABELS.
IDX2LABEL = dict(enumerate(LABELS))
LABEL2IDX = {label: idx for idx, label in IDX2LABEL.items()}

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device.type)
if device.type == 'cuda':
    # Report which GPU (index 0) the runtime exposes.
    print(torch.cuda.get_device_name(0))

cuda
Tesla V100-SXM2-16GB


In [None]:
# Checkpoint name passed to from_pretrained for both the tokenizer and the classifier.
MODEL_NAME = 'ai-forever/ruBert-large'
# Passed to set_seed below and reused as random_state for the train/test split.
SEED = 42
# Number of passes over the training DataLoader.
EPOCHS = 5
# Training batch size; the test DataLoader uses twice this value.
BATCH_SIZE = 6
# Learning rate handed to the AdamW optimizer.
LEARNING_RATE = 5e-5
# Every text is padded/truncated to this many tokens by the tokenizer call.
MAX_LEN = 390
# NOTE(review): DROPOUT is not referenced in any of the visible cells — confirm
# whether it was meant to be passed to the model configuration.
DROPOUT = .4
# Despite the name this is a fraction, not a step count: the scheduler cell
# multiplies it by len(train_dataloader) * EPOCHS to get the warm-up length.
WARMUP_STEPS = 0.1

# Seed the random number generators for reproducibility.
set_seed(seed=SEED)

In [None]:
# Load the fast tokenizer belonging to the MODEL_NAME checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

In [None]:
def get_tensors(df):
    """Turn a DataFrame of incidents into model-ready tensors.

    Reads the texts from the 'Текст инцидента' column and the class names from
    the 'Группа тем' column. Texts are tokenized with the module-level
    ``tokenizer`` and padded/truncated to ``MAX_LEN`` tokens; class names are
    mapped to integer ids through ``LABEL2IDX``.

    Args:
        df: DataFrame containing the two columns named above.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` of tensors with dtypes
        long, float and long respectively, all with the same first dimension.

    Raises:
        KeyError: if a class name is missing from ``LABEL2IDX``.
    """
    encoded = tokenizer(
        df['Текст инцидента'].to_list(),
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
    )
    class_ids = [LABEL2IDX[name] for name in df['Группа тем'].to_list()]

    input_ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
    attention_mask = torch.tensor(encoded['attention_mask'], dtype=torch.float)
    labels = torch.tensor(class_ids, dtype=torch.long)

    # Sanity check: one row of ids, one mask and one label per input text.
    assert len(input_ids) == len(attention_mask) == len(labels)

    return input_ids, attention_mask, labels

In [None]:
# Hold out 10% of the rows for testing, keeping the class proportions of
# 'Группа тем' equal in both parts (stratified split) and seeding the shuffle.
df_train, df_test = train_test_split(
    df,
    test_size=.1,
    stratify=df['Группа тем'],
    random_state=SEED,
)

In [None]:
# Training set: batches are drawn in random order.
train_data = TensorDataset(*get_tensors(df_train))
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(data_source=train_data),
)

# Test set: fixed order, and twice the training batch size.
test_data = TensorDataset(*get_tensors(df_test))
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=BATCH_SIZE * 2,
    sampler=SequentialSampler(data_source=test_data),
)

In [None]:
# Load the pretrained encoder with a fresh classification head, one output per
# class. The label maps are stored in the model config so that checkpoints
# written by save_pretrained carry the id <-> label mapping with them.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label=IDX2LABEL,
    label2id=LABEL2IDX,
)
model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
if WARMUP_STEPS > 0:
    # One optimizer step per training batch, for every epoch.
    total_training_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        # WARMUP_STEPS is a fraction of the whole run; the scheduler expects an
        # integer number of steps, so convert explicitly.
        num_warmup_steps=int(total_training_steps * WARMUP_STEPS),
        num_training_steps=total_training_steps,
    )

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Authenticate this runtime with Weights & Biases (prompts for an API key if needed).
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Start a W&B run and record the hyper-parameters with it, so every logged
# curve can be traced back to the settings that produced it.
wandb.init(
    project='loon-bit-loop-text-classifier',
    config={
        'model_name': MODEL_NAME,
        'seed': SEED,
        'epochs': EPOCHS,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'max_len': MAX_LEN,
        # WARMUP_STEPS is used as a fraction of the total number of steps.
        'warmup_ratio': WARMUP_STEPS,
        'num_labels': len(LABELS),
    },
)

[34m[1mwandb[0m: Currently logged in as: [33mblanchefort[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def train():
    """Run one training epoch over ``train_dataloader`` and log metrics to W&B.

    Uses the module-level ``model``, ``optimizer`` and, when ``WARMUP_STEPS > 0``,
    ``scheduler``. For every batch it logs ``step_f1`` and ``step_loss``; at the
    end of the epoch it logs ``epoch_train_loss`` (mean batch loss) and
    ``epoch_train_f1`` (weighted F1 accumulated over all batches).

    Returns:
        dict with the keys ``'epoch_train_loss'`` and ``'epoch_train_f1'``.
    """
    model.train()
    total_loss = 0.0
    F1 = torchmetrics.classification.MulticlassF1Score(
        num_classes=len(LABELS),
        average='weighted'
    )
    for input_ids, attention_mask, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device))
        # Softmax is monotonic, so argmax over the raw logits gives the same classes.
        preds = torch.argmax(outputs.logits.detach().cpu(), dim=-1)
        # Read the scalar loss once; it is used for both logging and the epoch mean.
        step_loss = outputs.loss.item()
        # Calling F1(...) updates the running state and returns the batch value.
        # One log call keeps both step metrics on the same W&B step.
        wandb.log({
            'step_f1': F1(preds, labels).item(),
            'step_loss': step_loss,
        })
        total_loss += step_loss

        outputs.loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The scheduler only exists when warm-up is enabled.
        if WARMUP_STEPS > 0:
            scheduler.step()

    total_loss /= len(train_dataloader)
    epoch_metrics = {
        'epoch_train_loss': total_loss,
        'epoch_train_f1': F1.compute().item(),
    }
    wandb.log(epoch_metrics)
    return epoch_metrics

In [None]:
@torch.no_grad()
def evaluate():
    """Evaluate the module-level ``model`` on ``test_dataloader`` and log to W&B.

    Runs with gradients disabled and the model in eval mode. Logs
    ``epoch_test_loss`` (mean batch loss) and ``epoch_test_f1`` (weighted F1
    accumulated over all test batches) in a single W&B step.

    Returns:
        dict with the keys ``'epoch_test_loss'`` and ``'epoch_test_f1'``.
    """
    model.eval()
    total_loss = 0.0
    F1 = torchmetrics.classification.MulticlassF1Score(
        num_classes=len(LABELS),
        average='weighted'
    )
    for input_ids, attention_mask, labels in test_dataloader:
        # No optimizer call here: nothing is back-propagated during evaluation.
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device))
        # Softmax is monotonic, so argmax over the raw logits gives the same classes.
        preds = torch.argmax(outputs.logits.detach().cpu(), dim=-1)
        # Accumulate the batch into the running F1 state.
        F1(preds, labels)
        total_loss += outputs.loss.item()
    total_loss /= len(test_dataloader)
    epoch_metrics = {
        'epoch_test_loss': total_loss,
        'epoch_test_f1': F1.compute().item(),
    }
    wandb.log(epoch_metrics)
    return epoch_metrics

In [None]:
# Train and evaluate for EPOCHS epochs, writing a checkpoint after each one
# into a directory named after the 0-based epoch index ("0", "1", ...).
for epoch in range(EPOCHS):
    train()
    evaluate()
    # Save straight from the training device: moving the whole model to the CPU
    # and back on every epoch is not needed to write the checkpoint.
    model.save_pretrained(str(epoch))

In [None]:
!cp /content/4/* /content/drive/MyDrive/000_DOSSIER/loop/theme_groups

In [None]:
# List the Drive folder that holds the exported model files.
!ls /content/drive/MyDrive/000_DOSSIER/loop/theme_groups

config.json  model.safetensors
