In [126]:
# Python 
import os
import warnings
import logging
from typing import Mapping, List
from pprint import pprint
import plotly.express as px

# Numpy and Pandas 
import numpy as np
import pandas as pd

# PyTorch 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Transformers 
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoConfig

# Catalyst
from catalyst.dl import SupervisedRunner, Runner
from catalyst.callbacks import AccuracyCallback, AUCCallback, OptimizerCallback
from catalyst.callbacks import CheckpointCallback
from catalyst.utils import set_global_seed, prepare_cudnn, load_checkpoint, unpack_checkpoint

In [225]:
import transformers

In [226]:
transformers.__version__

'4.18.0'

In [127]:
pd.set_option('display.max_colwidth', 100)

In [128]:
# to reproduce, download the data and customize this path
PATH_TO_MODEL = '/home/timur/Desktop/jupyter_wf/bert-finetuning-catalyst/logdir/model.best.pth'

In [129]:
# to reproduce, download the data and customize this path
PATH_TO_DATA = '/home/timur/Desktop/jupyter_wf/contacts-in-item-TimuJ/model_notebook/'

In [130]:
train_df = pd.read_csv(PATH_TO_DATA + 'train.csv').fillna('')
valid_df = pd.read_csv(PATH_TO_DATA + 'valid.csv').fillna('')
test_df = pd.read_csv(PATH_TO_DATA + 'test.csv').fillna('')

In [131]:
test_df

Unnamed: 0.1,Unnamed: 0,title,description,subcategory,category,price,region,city,datetime_submitted,is_bad
0,11365,AirPods,"Airpods /\nAirPods 2 CАMAЯ ЛУЧШAЯ и ТOЧНАЯ КOПИЯ оригинала - 5:5, ПОCЛEДНЯЯ MОДЕЛЬ - октябрь 205...",Аудио и видео,Бытовая электроника,3489.0,Россия,Москва,2019-10-13 12:43:42.299084,0
1,11366,Механизм стеклоочистителя передний Nissan Serena 4,Механизм стеклоочистителя Nissan Serena 1 C62M GA13DE 1991 передний (б/у)/\n/\n Марка: Nissan/\n...,Запчасти и аксессуары,Транспорт,2500.0,Липецкая область,Липецк,2019-10-13 12:45:06.628388,0
2,11367,Нш-42в-4л,новое. тел 89024003360,Запчасти и аксессуары,Транспорт,3000.0,Челябинская область,Челябинск,2019-10-13 12:45:44.957825,1
3,11368,Шины,Два ската DUNLOP Япония 683 -53-63 состояние новых использовались один сезон 1 месяца цена за 2 ...,Запчасти и аксессуары,Транспорт,4000.0,Ростовская область,Пролетарск,2019-10-13 12:45:45.456624,1
4,11369,Живые раки,"В продаже рак дикий, речной./\nОптом от 50 кг, в розницу от 5 кг./\nЦена опт / розница:/\n/\nмел...",Продукты питания,Для дома и дачи,400.0,Россия,Москва,2019-10-13 12:48:10.767700,0
...,...,...,...,...,...,...,...,...,...,...
4867,16232,Офисное помещение,Сдаются офисное помещение по Ул Калинина 80. Помещения находятся в офисном центре. В стоимость а...,Коммерческая недвижимость,Недвижимость,9200.0,Чувашия,Чебоксары,2019-10-14 23:57:30.094904,0
4868,16233,iPhone 8 Plus Silver 25GB,"Оригинальный ростест айфон. Идеальное состояние, ни разу не ремонтировался и не разбирался. /\nК...",Телефоны,Бытовая электроника,20000.0,Татарстан,Казань,2019-10-14 23:57:50.610616,0
4869,16234,"6-к квартира, 54 м², 5/9 эт.",_________________________________________________________/\n /\nПРОСТОРНАЯ КВАРТИРА С ХОРОШЕЙ ПЛ...,Квартиры,Недвижимость,3300000.0,Ставропольский край,Пятигорск,2019-10-14 23:58:02.781579,0
4870,16235,"4-к квартира, 60 м², 9/9 эт.",/\n /\n● Работаем БЕЗ ПЕРЕРЫВОВ И ВЫХОДНЫХ с 9:00 до 12:00./\n /\n● Обменяем и продадим Вашу не...,Квартиры,Недвижимость,2300000.0,Ставропольский край,Пятигорск,2019-10-14 23:59:01.435691,0


In [132]:
MODEL_NAME = 'DeepPavlov/distilrubert-base-cased-conversational' # pretrained model from Transformers
LOG_DIR = "./logdir_avito_descr"    # for training logs and tensorboard visualizations
NUM_EPOCHS = 5                         # smth around 2-6 epochs is typically fine when finetuning transformers
BATCH_SIZE = 8                        # depends on your available GPU memory (in combination with max seq length)
MAX_SEQ_LENGTH = 256                   # depends on your available GPU memory (in combination with batch size)
LEARN_RATE = 1e-5                      # learning rate is typically ~1e-5 for transformers
ACCUM_STEPS = 4                        # one optimization step for that many backward passes
SEED = 31                              # random seed for reproducibility

In [133]:
class TextClassificationDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """
    def __init__(self,
                 texts: List[str],
                 labels: List[str] = None,
                 label_dict: Mapping[str, int] = None,
                 max_seq_length: int = 256,
                 model_name: str = 'DeepPavlov/distilrubert-base-cased-conversational'):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be stripped to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # no easily handle unknown target values
            self.label_dict = dict(zip(sorted(set(labels)),
                                       range(len(set(labels)))))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger(
            "transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]

    def __len__(self):
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """

        # encoding the text
        x = self.texts[index]
        x_encoded = self.tokenizer.encode(
            x,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            return_tensors="pt",
        ).squeeze(0)

        # padding short texts
        true_seq_length = x_encoded.size(0)
        pad_size = self.max_seq_length - true_seq_length
        pad_ids = torch.Tensor([self.pad_vid] * pad_size).long()
        x_tensor = torch.cat((x_encoded, pad_ids))

        # dealing with attention masks - there's a 1 for each input token and
        # if the sequence is shorter that `max_seq_length` then the rest is
        # padded with zeroes. Attention mask will be passed to the model in
        # order to compute attention scores only with input data
        # ignoring padding
        mask = torch.ones_like(x_encoded, dtype=torch.int8)
        mask_pad = torch.zeros_like(pad_ids, dtype=torch.int8)
        mask = torch.cat((mask, mask_pad))

        output_dict = {
            'features' : x_tensor,
            'attention_mask' : mask
        }

        # encoding target
        if self.labels is not None:
            y = self.labels[index]
            y_encoded = torch.Tensor(
                [self.label_dict.get(y, -1)]
            ).long().squeeze(0)
            output_dict["targets"] = y_encoded

        return output_dict

In [134]:
train_dataset = TextClassificationDataset(
    texts=train_df['description'].values.tolist(),
    labels=train_df['is_bad'].values.tolist(),
    label_dict=None,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME
)

valid_dataset = TextClassificationDataset(
    texts=valid_df['description'].values.tolist(),
    labels=valid_df['is_bad'].values.tolist(),
    label_dict=train_dataset.label_dict,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME
)

test_dataset = TextClassificationDataset(
    texts=test_df['description'].values.tolist(),
    labels=None,
    label_dict=None,
    max_seq_length=MAX_SEQ_LENGTH,
    model_name=MODEL_NAME
)



In [135]:
NUM_CLASSES = len(train_dataset.label_dict)

In [136]:
train_val_loaders = {
    "train": DataLoader(dataset=train_dataset,
                        batch_size=BATCH_SIZE, 
                        shuffle=True),
    "valid": DataLoader(dataset=valid_dataset,
                        batch_size=BATCH_SIZE, 
                        shuffle=False)    
}

In [137]:
test_loaders = {
    "test": DataLoader(
            dataset=test_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
        )
    }

In [138]:
class BertForSequenceClassification(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.3):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task
        """
        super().__init__()

        config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_classes)

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute class probabilities for the input sequence.

        Args:
            features (torch.Tensor): ids of each token,
                size ([bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with predicted class scores
        """
        assert attention_mask is not None, "attention mask is none"

        # taking BERTModel output
        # see https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
        bert_output = self.model(input_ids=features, attention_mask=attention_mask, head_mask=head_mask)
        # we only need the hidden state here and don't need
        # transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)
        # mean pooling, i.e. getting average representation of all tokens
        pooled_output = seq_output.mean(axis=1)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        scores = self.classifier(pooled_output)  # (bs, num_classes)

        return scores

## Loading pth model 

In [139]:
device = torch.device("cuda")
model = BertForSequenceClassification(pretrained_model_name=MODEL_NAME, num_classes=2)
model.load_state_dict(torch.load(PATH_TO_MODEL))
model.to(device)

Some weights of the model checkpoint at DeepPavlov/distilrubert-base-cased-conversational were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForSequenceClassification(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1

In [140]:
model.eval()

BertForSequenceClassification(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1

In [141]:
runner = SupervisedRunner(model=model, input_key=("features", "attention_mask"))

## Predictions

In [159]:
test_pred_scores = np.concatenate(
    [pred["logits"].detach().cpu().numpy() for pred in runner.predict_loader(loader=test_loaders["test"])]
)

AttributeError: 'numpy.ndarray' object has no attribute 'softmax'

In [None]:
import torch.nn.functional as F

In [174]:
test_score_probs = F.softmax(torch.from_numpy(test_pred_scores), dim=1)

In [184]:
test_score_probs[:, -1].numpy()

array([7.9508769e-05, 6.7123474e-05, 9.9641681e-01, ..., 2.0889622e-04,
       9.8866665e-05, 9.8717308e-01], dtype=float32)

In [216]:
df = pd.DataFrame(test_score_probs[:, -1].numpy(), columns=['prediction'])

In [218]:
df = df.reset_index()

In [227]:
df['perdiction'] = test_score_probs[:, -1].numpy()

In [228]:
df

Unnamed: 0,index,prediction,perdiction
0,0,0.000080,0.000080
1,1,0.000067,0.000067
2,2,0.996417,0.996417
3,3,0.998438,0.998438
4,4,0.001098,0.001098
...,...,...,...
4867,4867,0.000361,0.000361
4868,4868,0.000079,0.000079
4869,4869,0.000209,0.000209
4870,4870,0.000099,0.000099


## Evaluating metrics

In [148]:
from sklearn.metrics import roc_auc_score

In [149]:
LABELS = sorted(train_df['is_bad'].unique())

In [151]:
test_pred = [LABELS[i] for i in test_pred_scores.argmax(axis=1)]

In [155]:
roc_auc_score(test_df['is_bad'], test_pred)

0.9350010735567987

In [186]:
test_pred[-1]

1

In [143]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Aug 28 18:19:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| N/A   61C    P0    43W /  N/A |   4002MiB /  6144MiB |     94%      Default |
|                               |   

In [144]:
torch.cuda.empty_cache()