In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Loading & preparing data

In [None]:
!unzip /content/drive/MyDrive/Data_files/Porn_data/train.csv.zip

Archive:  /content/drive/MyDrive/Data_files/Porn_data/train.csv.zip
  inflating: train.csv               


In [None]:
!unzip /content/drive/MyDrive/Data_files/Porn_data/test.csv.zip

Archive:  /content/drive/MyDrive/Data_files/Porn_data/test.csv.zip
  inflating: test.csv                


In [None]:
import numpy as np
import pandas as pd

In [None]:
%pip install --upgrade transformers datasets accelerate deepspeed
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import datasets

In [None]:
# Load the unzipped train/test CSV files into DataFrames.
# The first CSV column is used as the row index (index_col=0).
train_df = pd.read_csv("/content/train.csv", index_col=0)
test_df = pd.read_csv("/content/test.csv", index_col=0)

In [None]:
train_df.sample(3)

Unnamed: 0_level_0,url,title,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11253,tophotels.ru,The Zign Hotel 4* (Таиланд/Восточный регион/Па...,0
91822,www.1001tur.ru,Туры из Москвы в отель Top Hotel 4* (Турция/Ин...,0
97995,fotostrana.ru,"Анатолий, Москва, 67 лет - фото и страница",0


In [None]:
# Report the shape of each frame

for frame_name, frame in (("train_df", train_df), ("test_df", test_df)):
    print(f"{frame_name}.shape={frame.shape!r}")

train_df.shape=(135309, 3)
test_df.shape=(165378, 2)


In [None]:
# Count the rows that contain at least one missing value

print("Number of rows with >=1 NaN values")
for frame_name, frame in (("train_df", train_df), ("test_df", test_df)):
    rows_with_nan = int(frame.isna().any(axis=1).sum())
    print(f"{frame_name}: {rows_with_nan}")

Number of rows with >=1 NaN values
train_df: 1
test_df: 0


In [None]:
# Drop every row that has a missing value, then show the resulting shapes

train_df, test_df = train_df.dropna(), test_df.dropna()

for frame_name, frame in (("train_df", train_df), ("test_df", test_df)):
    print(f"{frame_name}.shape={frame.shape!r}")

train_df.shape=(135308, 3)
test_df.shape=(165378, 2)


### Model sketch: BERT

In [None]:
from torch.utils.data import Dataset, DataLoader
from datasets.dataset_dict import DatasetDict

In [None]:
# Using pretrained RuBERT from DeepPavlov  -->  https://huggingface.co/DeepPavlov/rubert-base-cased-sentence

model_name = 'DeepPavlov/rubert-base-cased-sentence'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

# Overall plan for the rest of the notebook:
# freeze every layer of the model except the last transformer block and run it in that form
# through 5 training epochs on the train data, updating the weights with the "Adam" gradient-descent algorithm

In [None]:
# Try some decoding

tokenizer.decode(tokenizer("Мама мыла мылом раму")['input_ids'])

'[CLS] Мама мыла мылом раму [SEP]'

In [None]:
# Build one text field per row in both frames: "web adress: <url>. title: <title>"

url_prefix, title_prefix = 'web adress: ', 'title: '

for frame in (train_df, test_df):
    # Vectorised string concatenation over the two text columns.
    frame['url_plus_title'] = url_prefix + frame['url'] + '. ' + title_prefix + frame['title']

train_df.head(3)

Unnamed: 0_level_0,url,title,label,url_plus_title
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0,web adress: m.kp.md. title: Экс-министр эконом...
1,www.kp.by,Эта песня стала известна многим телезрителям б...,0,web adress: www.kp.by. title: Эта песня стала ...
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0,web adress: fanserials.tv. title: Банши 4 сезо...


In [None]:
# Hold out the trailing 20% of the training rows as the validation split

split_idx = round(0.8 * len(train_df))
train_part = train_df.iloc[:split_idx]
val_part = train_df.iloc[split_idx:]

# Wrap each pandas frame into a Hugging Face Dataset.
train_ds, val_ds, test_ds = (
    datasets.Dataset.from_pandas(frame) for frame in (train_part, val_part, test_df)
)

data_dict = DatasetDict({"train": train_ds, "validation": val_ds})

In [None]:
MAX_LENGTH = 128


def preprocess_function(inputs):
    """Tokenize a batch of 'url_plus_title' texts and carry the labels along.

    Every sequence is truncated/padded to exactly MAX_LENGTH tokens.

    Args:
        inputs: batch mapping that provides the 'url_plus_title' and 'label' columns.

    Returns:
        The tokenizer output with an extra 'label' entry copied from ``inputs``.
    """
    encoded = tokenizer(
        inputs['url_plus_title'],
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
    )
    encoded['label'] = inputs['label']
    return encoded


# Tokenize the train and validation splits in batched mode
data_prep = data_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/108246 [00:00<?, ? examples/s]

Map:   0%|          | 0/27062 [00:00<?, ? examples/s]

In [None]:
data_prep['train'][0]['input_ids'][:10]

[101, 13510, 11352, 31255, 156, 256, 132, 252, 263, 132]

In [None]:
# Loaders formation

val_set = data_prep['validation']
train_set = data_prep['train']

# Validation keeps the dataset order so that predictions can be written back
# positionally and compared against val_set['label'].
val_loader = torch.utils.data.DataLoader(
    val_set, batch_size=64, shuffle=False, collate_fn=transformers.default_data_collator, num_workers=2
)

# Training batches are reshuffled every epoch; with shuffle=False the model
# would see the rows in the same fixed CSV order on every pass.
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=64, shuffle=True, collate_fn=transformers.default_data_collator, num_workers=2
)

In [None]:
# Smoke test: push the first validation batch through the not-yet-tuned model

batch = next(iter(val_loader))

# Forward only the tensors the model accepts (the collated batch may carry other columns).
model_inputs = {
    key: batch[key]
    for key in ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
}

with torch.no_grad():
    output = model(**model_inputs)

output['loss'].item()

0.6015498638153076

In [None]:
# Take a look at the model's output

output

SequenceClassifierOutput(loss=tensor(0.6223), logits=tensor([[ 0.1175, -0.2458],
        [ 0.0650, -0.2045],
        [ 0.2057, -0.3833],
        [ 0.1869, -0.1434],
        [ 0.1308, -0.1175],
        [-0.0243, -0.4112],
        [ 0.1810, -0.2153],
        [ 0.1462, -0.3805],
        [ 0.1703, -0.3079],
        [ 0.1899, -0.2690],
        [ 0.0866, -0.3704],
        [-0.0623, -0.2519],
        [-0.0019, -0.2188],
        [ 0.1326, -0.2431],
        [ 0.2049, -0.0683],
        [ 0.1326, -0.0262],
        [ 0.0136, -0.0776],
        [ 0.0460, -0.2056],
        [-0.0617, -0.1799],
        [ 0.1648, -0.1759],
        [ 0.0576, -0.1541],
        [-0.0450, -0.2402],
        [-0.0203, -0.2267],
        [ 0.2383, -0.2211],
        [ 0.1190, -0.2546],
        [ 0.1325, -0.0529],
        [ 0.1907, -0.0623],
        [ 0.1672, -0.3981],
        [ 0.1060, -0.2859],
        [ 0.1518, -0.4055],
        [ 0.1346, -0.3599],
        [-0.0789, -0.1561]]), hidden_states=None, attentions=None)

### Model tuning

In [None]:
# Closer look at the model's architecture
# List some parameters: embedding, 1st transformer, pooler layers

params = list(model.named_parameters())

# The embedding block owns 5 parameter tensors (word / position / token-type
# embeddings plus LayerNorm weight and bias) and every encoder layer owns 16.
# The previous slices ([:3] and [3:19]) printed the embedding LayerNorm under
# the "First Transformer" header and cut off the last two tensors of layer 0.
n_embedding_params = 5
n_params_per_layer = 16
first_layer_end = n_embedding_params + n_params_per_layer

print('The RuBERT model has {:} different named parameters.\n'.format(len(params)))

sections = (
    ('==== Embedding Layer ====\n', params[:n_embedding_params]),
    ('\n==== First Transformer (all number of transformers - 12)====\n',
     params[n_embedding_params:first_layer_end]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for header, section_params in sections:
    print(header)
    for p in section_params:
        # p is a (name, tensor) pair; show the name and the tensor shape.
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The RuBERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (119547, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)

==== First Transformer (all number of transformers - 12)====

bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)
bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attent

In [None]:
# `transformers.AdamW` is deprecated and no longer importable from recent
# transformers releases (which the unpinned `%pip install --upgrade` pulls in),
# so the PyTorch implementation is used instead.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
                  weight_decay = 0.0 # torch defaults to 0.01; 0.0 keeps the former transformers.AdamW default
                )

n_epochs = 5
# One scheduler step is taken per training batch, so the schedule spans all batches of all epochs.
total_steps = len(train_loader) * n_epochs

# Linear decay of the learning rate to zero over `total_steps`, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0, # default value in run_glue.py
                                            num_training_steps=total_steps)



In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to calculate required metrics
def get_metrics(labels, preds):
    """Score predicted labels against the true labels.

    Args:
        labels: sequence of ground-truth labels.
        preds: sequence of predicted labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` produced by the
        scikit-learn scorers of the same names.
    """
    y_true = np.array(labels)
    y_pred = np.array(preds)

    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [None]:
# Sanity check of "get_metrics": all-ones predictions against half-positive labels

a = [1] * 10
b = [1] * 5 + [0] * 5

get_metrics(b, a)

(0.5, 0.5, 1.0, 0.6666666666666666)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Convert a duration given in seconds into an h:mm:ss string.

    The duration is rounded to the nearest whole second before formatting.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)

    return str(duration)

In [None]:
# Freezing layers: only the last transformer block and the classification head stay trainable.
#
# The previous filter (`"layer.11" not in name`) also froze `classifier.*`, so the
# classification head was never updated during fine-tuning.
# NOTE(review): AutoModelForSequenceClassification normally initialises that head from
# scratch when loading a base checkpoint - confirm against the load-time warning.

trainable_markers = ("layer.11", "classifier")

for name, param in model.named_parameters():
    # Assign explicitly in both directions so that re-running the cell is idempotent.
    param.requires_grad = any(marker in name for marker in trainable_markers)

In [None]:
# Show the names of the parameters that are still trainable (not frozen)

trainable_names = (
    name for name, param in model.named_parameters() if param.requires_grad
)
for name in trainable_names:
    print(name)

bert.encoder.layer.11.attention.self.query.weight
bert.encoder.layer.11.attention.self.query.bias
bert.encoder.layer.11.attention.self.key.weight
bert.encoder.layer.11.attention.self.key.bias
bert.encoder.layer.11.attention.self.value.weight
bert.encoder.layer.11.attention.self.value.bias
bert.encoder.layer.11.attention.output.dense.weight
bert.encoder.layer.11.attention.output.dense.bias
bert.encoder.layer.11.attention.output.LayerNorm.weight
bert.encoder.layer.11.attention.output.LayerNorm.bias
bert.encoder.layer.11.intermediate.dense.weight
bert.encoder.layer.11.intermediate.dense.bias
bert.encoder.layer.11.output.dense.weight
bert.encoder.layer.11.output.dense.bias
bert.encoder.layer.11.output.LayerNorm.weight
bert.encoder.layer.11.output.LayerNorm.bias


In [None]:
# Trainloop! Here we go!

from tqdm.notebook import tqdm
import random
import numpy as np

seed_val = 42

# Seed every RNG in play: Python, NumPy, PyTorch (CPU and all CUDA devices).
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# One dict of losses, metrics and timings is appended per epoch.
training_stats = []
total_t0 = time.time()

# The validation labels do not change between epochs, so read them once.
val_true = val_set['label']


for epoch_i in range(n_epochs):
    print()
    print(f"-------------------- Epoch {epoch_i + 1} / {n_epochs} --------------------")

    # -----------------------------------------------Training----------------------------------------------- #
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in tqdm(enumerate(train_loader), total=len(train_loader)):

        # Progress report every 200 batches (the very first batch is skipped).
        if step % 200 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:}  of  {:}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))


        # Only ids, attention mask and labels are moved to the device and fed to the model.
        batch_input_ids = batch['input_ids'].to(device)
        batch_input_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        # Reset gradients left over from the previous step.
        model.zero_grad()
        model_output = model(input_ids=batch_input_ids,
                             token_type_ids=None,
                             attention_mask=batch_input_mask,
                             labels=batch_labels)

        total_train_loss += model_output['loss'].item()
        model_output['loss'].backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    training_time = format_time(time.time() - t0)

    print()
    print(f"  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epcoh took: {training_time}")

    # -----------------------------------------------Validation----------------------------------------------- #

    print()
    print("Running Validation...")

    model.eval()
    t0 = time.time()
    total_eval_loss = 0
    val_preds = np.zeros(len(val_true))
    # Position in val_preds where the next batch of predictions goes. A running
    # offset keeps the write-back correct for any loader batch size.
    offset = 0

    # Evaluate data for one epoch
    for batch in tqdm(val_loader, total=len(val_loader)):

        batch_input_ids = batch['input_ids'].to(device)
        batch_input_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        with torch.no_grad():
            model_output = model(input_ids=batch_input_ids,
                                 token_type_ids=None,
                                 attention_mask=batch_input_mask,
                                 labels=batch_labels)

        total_eval_loss += model_output['loss'].item()
        logits = model_output['logits'].detach().cpu().numpy()
        # Predicted class = index of the largest logit in each row.
        preds_flat = np.argmax(logits, axis=1).flatten()

        val_preds[offset: offset + len(preds_flat)] = preds_flat
        offset += len(preds_flat)

    accuracy, precision, recall, f1 = get_metrics(val_true, val_preds)

    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1:        {f1:.4f}")

    avg_val_loss = total_eval_loss / len(val_loader)
    validation_time = format_time(time.time() - t0)

    print(f"  Validation loss: {avg_val_loss:.2f}")
    print(f"  Validation took: {validation_time}")

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accuracy': accuracy,
            'Valid. Precision': precision,
            'Valid. Recall': recall,
            'Valid. F1': f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print()
print("Training complete.")
print(f"Total training took {format_time(time.time()-total_t0)} (h:mm:ss)")


-------------------- Epoch 1 / 5 --------------------
Training...


  0%|          | 0/1692 [00:00<?, ?it/s]

  Batch 200  of  1692.    Elapsed: 0:01:47.
  Batch 400  of  1692.    Elapsed: 0:03:33.
  Batch 600  of  1692.    Elapsed: 0:05:20.
  Batch 800  of  1692.    Elapsed: 0:07:06.
  Batch 1000  of  1692.    Elapsed: 0:08:53.
  Batch 1200  of  1692.    Elapsed: 0:10:39.
  Batch 1400  of  1692.    Elapsed: 0:12:25.
  Batch 1600  of  1692.    Elapsed: 0:14:12.

  Average training loss: 0.07
  Training epcoh took: 0:15:00

Running Validation...


  0%|          | 0/423 [00:00<?, ?it/s]

  Accuracy:  0.9869
  Precision: 0.9570
  Recall:    0.9368
  F1:        0.9468
  Validation loss: 0.04
  Validation took: 0:03:18

-------------------- Epoch 2 / 5 --------------------
Training...


  0%|          | 0/1692 [00:00<?, ?it/s]

  Batch 200  of  1692.    Elapsed: 0:01:47.
  Batch 400  of  1692.    Elapsed: 0:03:33.
  Batch 600  of  1692.    Elapsed: 0:05:20.
  Batch 800  of  1692.    Elapsed: 0:07:06.
  Batch 1000  of  1692.    Elapsed: 0:08:53.
  Batch 1200  of  1692.    Elapsed: 0:10:39.
  Batch 1400  of  1692.    Elapsed: 0:12:26.
  Batch 1600  of  1692.    Elapsed: 0:14:13.

  Average training loss: 0.05
  Training epcoh took: 0:15:01

Running Validation...


  0%|          | 0/423 [00:00<?, ?it/s]

  Accuracy:  0.9893
  Precision: 0.9678
  Recall:    0.9457
  F1:        0.9566
  Validation loss: 0.04
  Validation took: 0:03:19

-------------------- Epoch 3 / 5 --------------------
Training...


  0%|          | 0/1692 [00:00<?, ?it/s]

  Batch 200  of  1692.    Elapsed: 0:01:47.
  Batch 400  of  1692.    Elapsed: 0:03:33.
  Batch 600  of  1692.    Elapsed: 0:05:19.
  Batch 800  of  1692.    Elapsed: 0:07:06.
  Batch 1000  of  1692.    Elapsed: 0:08:52.
  Batch 1200  of  1692.    Elapsed: 0:10:38.
  Batch 1400  of  1692.    Elapsed: 0:12:25.
  Batch 1600  of  1692.    Elapsed: 0:14:11.

  Average training loss: 0.04
  Training epcoh took: 0:14:59

Running Validation...


  0%|          | 0/423 [00:00<?, ?it/s]

  Accuracy:  0.9905
  Precision: 0.9753
  Recall:    0.9475
  F1:        0.9612
  Validation loss: 0.03
  Validation took: 0:03:19

-------------------- Epoch 4 / 5 --------------------
Training...


  0%|          | 0/1692 [00:00<?, ?it/s]

  Batch 200  of  1692.    Elapsed: 0:01:46.
  Batch 400  of  1692.    Elapsed: 0:03:33.
  Batch 600  of  1692.    Elapsed: 0:05:19.
  Batch 800  of  1692.    Elapsed: 0:07:05.
  Batch 1000  of  1692.    Elapsed: 0:08:52.
  Batch 1200  of  1692.    Elapsed: 0:10:38.
  Batch 1400  of  1692.    Elapsed: 0:12:25.
  Batch 1600  of  1692.    Elapsed: 0:14:12.

  Average training loss: 0.04
  Training epcoh took: 0:15:00

Running Validation...


  0%|          | 0/423 [00:00<?, ?it/s]

  Accuracy:  0.9906
  Precision: 0.9676
  Recall:    0.9567
  F1:        0.9621
  Validation loss: 0.03
  Validation took: 0:03:19

-------------------- Epoch 5 / 5 --------------------
Training...


  0%|          | 0/1692 [00:00<?, ?it/s]

  Batch 200  of  1692.    Elapsed: 0:01:46.
  Batch 400  of  1692.    Elapsed: 0:03:33.
  Batch 600  of  1692.    Elapsed: 0:05:19.
  Batch 800  of  1692.    Elapsed: 0:07:05.
  Batch 1000  of  1692.    Elapsed: 0:08:51.
  Batch 1200  of  1692.    Elapsed: 0:10:37.
  Batch 1400  of  1692.    Elapsed: 0:12:23.
  Batch 1600  of  1692.    Elapsed: 0:14:09.

  Average training loss: 0.04
  Training epcoh took: 0:14:57

Running Validation...


  0%|          | 0/423 [00:00<?, ?it/s]

  Accuracy:  0.9912
  Precision: 0.9757
  Recall:    0.9528
  F1:        0.9641
  Validation loss: 0.03
  Validation took: 0:03:17

Training complete.
Total training took 1:31:31 (h:mm:ss)


In [None]:
# Saving tuned model locally

# model.save_pretrained("/content/drive/MyDrive/AI_Notebooks/rubert-porn-detector")
#   --> Version 0: "title" only, seq_len=64, batch_size=32, n_epoch=4

model.save_pretrained("/content/drive/MyDrive/AI_Notebooks/PD/rubert-concat-sl128-bs64")
#   --> Version 1: naive concatenation of "url" and "title", seq_len=128, batch_size=64, n_epoch=5

[2024-10-23 12:44:53,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


### Making test predictions

In [None]:
MAX_LENGTH = 128


def test_preprocess_function(inputs):
    """Tokenize a batch of unlabeled 'url_plus_title' texts.

    Every sequence is truncated/padded to exactly MAX_LENGTH tokens; no
    'label' entry is added.

    Args:
        inputs: batch mapping that provides the 'url_plus_title' column.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        inputs['url_plus_title'],
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
    )


# Tokenize the test split in batched mode
test_prep = test_ds.map(test_preprocess_function, batched=True)

Map:   0%|          | 0/165378 [00:00<?, ? examples/s]

In [None]:
# Unshuffled loader, so that predictions keep the row order of the test set.
test_loader = torch.utils.data.DataLoader(
    test_prep,
    batch_size=64,
    shuffle=False,
    num_workers=2,
    collate_fn=transformers.default_data_collator,
)

In [None]:
# Loop for test predictions

# Put the model into inference mode explicitly instead of relying on the
# state left behind by whichever cell ran last.
model.eval()

test_preds = np.zeros(len(test_ds))
# Position in test_preds where the next batch of predictions goes. A running
# offset keeps the write-back correct for any loader batch size.
offset = 0

for batch in tqdm(test_loader, total=len(test_loader)):

    batch_input_ids = batch['input_ids'].to(device)
    batch_input_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        model_output = model(input_ids=batch_input_ids,
                             token_type_ids=None,
                             attention_mask=batch_input_mask,
                             labels=None)

    preds = model_output['logits'].detach().cpu().numpy()
    # Predicted class = index of the largest logit in each row.
    preds_flat = np.argmax(preds, axis=1).flatten()
    test_preds[offset: offset + len(preds_flat)] = preds_flat
    offset += len(preds_flat)

  0%|          | 0/2585 [00:00<?, ?it/s]

### Submission formation

In [None]:
# type/size checking

print(len(test_ds['ID']), len(test_preds))
print(type(test_ds['ID']), type(test_preds))

165378 165378
<class 'list'> <class 'numpy.ndarray'>


In [None]:
test_preds[:10]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [None]:
# Assemble the submission table: integer labels indexed by the test-row ID

subm = pd.DataFrame(
    {'ID': test_ds['ID'], 'label': test_preds.astype(int)}
).set_index('ID')

In [None]:
subm.head(5)

Unnamed: 0_level_0,label
ID,Unnamed: 1_level_1
135309,0
135310,0
135311,0
135312,1
135313,0


In [None]:
subm.to_csv("/content/drive/MyDrive/AI_Notebooks/PD/rubert-subm-2.csv")

### Analysing model's mistakes on validation
##### (Актуально для "Версии 0"... для "Версии 1" не анализировал)

In [None]:
# Loading tokenizer and locally saved model
# NOTE(review): the commented-out "Version 0" save call in the saving cell targets
# ".../AI_Notebooks/rubert-porn-detector" (no "PD/" folder) - confirm that the
# checkpoint really lives at the path loaded below.

tokenizer = transformers.AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
model = transformers.AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/AI_Notebooks/PD/rubert-porn-detector")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
# This cell rebinds MAX_LENGTH and test_preprocess_function, which the
# test-prediction section defines with different values.
MAX_LENGTH = 64


def test_preprocess_function(inputs):
    """Tokenize a batch of 'title' texts only.

    Every sequence is truncated/padded to exactly MAX_LENGTH tokens; no
    'label' entry is added.

    Args:
        inputs: batch mapping that provides the 'title' column.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        inputs['title'],
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
    )


val_prep = val_ds.map(test_preprocess_function, batched=True)

Map:   0%|          | 0/27062 [00:00<?, ? examples/s]

In [None]:
# Making predictions on validation data

from tqdm.notebook import tqdm

# Put the model into inference mode explicitly before predicting.
model.eval()

val_preds = np.zeros(len(val_ds))
val_loader = torch.utils.data.DataLoader(
    val_prep, batch_size=32, shuffle=False, collate_fn=transformers.default_data_collator, num_workers=2
)

# Position in val_preds where the next batch of predictions goes. A running
# offset keeps the write-back correct for any loader batch size.
offset = 0

for batch in tqdm(val_loader, total=len(val_loader)):

    batch_input_ids = batch['input_ids'].to(device)
    batch_input_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        model_output = model(input_ids=batch_input_ids,
                             token_type_ids=None,
                             attention_mask=batch_input_mask,
                             labels=None)

    preds = model_output['logits'].detach().cpu().numpy()
    # Predicted class = index of the largest logit in each row.
    preds_flat = np.argmax(preds, axis=1).flatten()
    val_preds[offset: offset + len(preds_flat)] = preds_flat
    offset += len(preds_flat)

  0%|          | 0/846 [00:00<?, ?it/s]

In [None]:
# check for shape matching
len(val_ds['label']), len(val_preds)

(27062, 27062)

In [None]:
# Confusion-matrix counts and the metrics derived from them

valid_df = pd.DataFrame(val_ds)

# Boolean masks over the validation rows.
is_wrong = (valid_df['label'] != val_preds).to_numpy()
is_right = (valid_df['label'] == val_preds).to_numpy()
is_positive = (valid_df['label'] == 1).to_numpy()
is_negative = (valid_df['label'] == 0).to_numpy()

mistaked_df = valid_df[is_wrong]

FN = (is_wrong & is_positive).sum()
TN = (is_right & is_negative).sum()
FP = (is_wrong & is_negative).sum()
TP = (is_right & is_positive).sum()

n_negative, n_positive, n_total = TN + FP, TP + FN, TP + TN + FP + FN

print(f"0-labeled mistaked: {FP} of {n_negative}")
print(f"1-labeled mistaked: {FN} of {n_positive}\n")
print(f"Precision: {TP / (TP + FP):.4f}")
print(f"Recall: {TP / n_positive:.4f}")
print(f"Accuracy: { (TP + TN) / n_total:.4f}")

0-labeled mistaked: 144 of 23691
1-labeled mistaked: 248 of 3371

Precision: 0.9559
Recall: 0.9264
Accuracy: 0.9855


In [None]:
# Split the misclassified rows by true label: 0 -> false positives, 1 -> false negatives

fp_data, fn_data = (
    mistaked_df[mistaked_df['label'] == true_label] for true_label in (0, 1)
)

In [None]:
# Here we can look at False-negative mistakes (missed porn data) and come up with some ideas

idx = 10
print(fn_data['url'].iloc[idx])
print(fn_data['title'].iloc[idx])

fapreactor.com
Lolicon Hentai :: Megumin :: Oral Хентай :: Хентай с цензурой (Censored Hentai) :: Mukka :: KonoSuba :: Хентай (Hentai) :: Anime Art (Аниме арт, Аниме-арт) :: секретные разделы (скрытые разделы joyreactor) :: Anime (Аниме) / голые девки, члены, голые девк


In [None]:
# Show how the tokenizer splits a few domain names; groups are separated by a blank line.
probe_groups = (
    ("erokrad.net", "erokrad net"),
    ("fapreactor.com", "stackoverflow.com"),
)

for group_no, group in enumerate(probe_groups):
    if group_no:
        print()
    for text in group:
        print([tokenizer.decode(token_id) for token_id in tokenizer(text)['input_ids']])

['[CLS]', 'er', '##ok', '##rad', '.', 'net', '[SEP]']
['[CLS]', 'er', '##ok', '##rad', 'net', '[SEP]']

['[CLS]', 'fa', '##pre', '##act', '##or', '.', 'com', '[SEP]']
['[CLS]', 'st', '##ack', '##over', '##flow', '.', 'com', '[SEP]']


In [None]:
tokenizer_1 = transformers.BertTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Note on the false negatives --> the tokenizer handles all-caps words poorly

for text in ("яблоко", "ЯБЛОКО", "Яблоко", "порно", "ПОРНО"):
    print([tokenizer.decode(token_id) for token_id in tokenizer(text)['input_ids']])

['[CLS]', 'яблоко', '[SEP]']
['[CLS]', 'Я', '##БЛ', '##ОК', '##О', '[SEP]']
['[CLS]', 'Яблоко', '[SEP]']
['[CLS]', 'порно', '[SEP]']
['[CLS]', 'ПО', '##Р', '##НО', '[SEP]']


In [None]:
# Show how the tokenizer splits several adult-site host names.
probe_texts = (
    "kinosex",
    "webcamsbabe.com",
    "fapreactor.com",
    "sexpics.sexviptube.com",
    "onlyindianporn.net",
    "webcamsbabe.com",
    "webcamsbabe.com",
)

for text in probe_texts:
    print([tokenizer.decode(token_id) for token_id in tokenizer(text)['input_ids']])

['[CLS]', 'ki', '##nos', '##ex', '[SEP]']
['[CLS]', 'web', '##ca', '##ms', '##ba', '##be', '.', 'com', '[SEP]']
['[CLS]', 'fa', '##pre', '##act', '##or', '.', 'com', '[SEP]']
['[CLS]', 'sex', '##pi', '##cs', '.', 'sex', '##vi', '##pt', '##ube', '.', 'com', '[SEP]']
['[CLS]', 'only', '##ind', '##ian', '##por', '##n', '.', 'net', '[SEP]']
['[CLS]', 'web', '##ca', '##ms', '##ba', '##be', '.', 'com', '[SEP]']
['[CLS]', 'web', '##ca', '##ms', '##ba', '##be', '.', 'com', '[SEP]']
