In [3]:
import sys
# Directory added to the import search path; presumably where the project-local
# `bert` package imported in the next cell lives.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable project root so the notebook runs on other machines.
BERT_LIB_DIR = '/home/saiko/ML/mipt/premoderation_txt/toxic_bert/lib'

# Guarded so that re-running this cell does not stack duplicate entries.
if BERT_LIB_DIR not in sys.path:
    sys.path.append(BERT_LIB_DIR)

In [4]:
import numpy as np
import pandas as pd
import re
import torch 

from bert.bert_dataset import BertDatset
from bert.trainer import BertPredictor
from bert.bert_model import MyBertModel
from bert.optim import configure_optimizers
from bert.loader import get_dataloader
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Raise pandas display limits so wide frames and long text cells are shown in full.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 500

# Print numpy floats with three decimal places.
np.set_printoptions(precision=3)


In [5]:
# Device that the model is moved to in the cells below.
DEVICE = torch.device("cuda")

# Alternative corpus, currently unused:
# ispras_df = pd.read_csv('./data/input/ispras/live_journal.csv')

# Load the concatenated, re-labelled dataset and keep only rows that have text.
concat_df = (
    pd.read_pickle('../../data/intermediate/concat_df_new_labels.pickle')
    .loc[lambda frame: frame['text'].notnull()]
)

In [4]:
# Checkpoint shared by the encoder and its tokenizer, so the two cannot drift apart.
PRETRAINED_CHECKPOINT = 'DeepPavlov/rubert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT).to(DEVICE)

### Model

In [6]:
# Optimiser settings; `lr` is passed to configure_optimizers() further down.
model_params = dict(lr=1e-5)

# Train/validation split settings.
#   split_rand_state -- random_state handed to train_test_split (and to the trainer)
#   split_ratio      -- the split below uses test_size = 1 - split_ratio
kfold_params = dict(
    split_rand_state=33,
    split_ratio=.35,
)

In [7]:
# Binary cross-entropy on raw logits. Placed via DEVICE (not a hard-coded
# .cuda()) so that every module in the notebook follows the same device constant.
criterion = BCEWithLogitsLoss().to(DEVICE)

# Wrap the pretrained encoder in the project's classifier with one output.
bert_model = MyBertModel(model, n_classes=1).to(DEVICE)

# test_size is 1 - split_ratio, i.e. split_ratio is the fraction kept for TRAINING.
# NOTE(review): with split_ratio = .35 the validation set ends up larger than
# the training set (recorded run: 22178 train vs 41188 val rows) — confirm
# that this is intended.
train_df, val_df = train_test_split(
    concat_df,
    test_size=(1 - kfold_params['split_ratio']),
    random_state=kfold_params['split_rand_state'],
)

# Both loaders read the target from the `toxic` column and use batches of 4.
train_loader = get_dataloader(
    train_df,
    tokenizer,
    label_column='toxic',
    batch_size=4,
)

val_loader = get_dataloader(
    val_df,
    tokenizer,
    label_column='toxic',
    batch_size=4,
)

optimizer = configure_optimizers(bert_model, model_params['lr'])

# Sanity check: row counts and the sum of the `toxic` column in each split.
print(f'train --> {train_df.shape[0]}, val --> {val_df.shape[0]}')
print(f"labels: train --> {train_df['toxic'].sum()}, val --> {val_df['toxic'].sum()}")

train --> 22178, val --> 41188
labels: train --> 7168.0, val --> 13369.0


In [8]:
# Free-text tag for this run, passed to the trainer as `description`.
description = 'toxic + insult new labels'

# Wire the model, data, objective and bookkeeping settings into the trainer.
trainer = BertPredictor(
    # model and data
    model=bert_model,
    train_loader=train_loader,
    val_loader=val_loader,
    # objective, optimisation and evaluation
    criterion=criterion,
    optimizer=optimizer,
    metric=roc_auc_score,
    epochs_count=2,
    num_labels=1,
    # run bookkeeping
    split_rand_state=kfold_params['split_rand_state'],
    description=description,
    result_dir='../../data/result/bert_output/',
)

# Run training; the recorded output shows a validation metric after each epoch.
trainer.fit()

  0%|          | 0/5545 [00:00<?, ?it/s]

../../data/result/bert_output/07-06-13


[1 / 2] Train: Loss = 0.31987: 100%|██████████| 5545/5545 [26:06<00:00,  3.54it/s]
100%|██████████| 10297/10297 [14:00<00:00, 12.25it/s]
  0%|          | 0/5545 [00:00<?, ?it/s]

val_metric_score --> 0.9612731994970716


[2 / 2] Train: Loss = 0.17915: 100%|██████████| 5545/5545 [26:10<00:00,  3.53it/s]
100%|██████████| 10297/10297 [14:09<00:00, 12.12it/s]

val_metric_score --> 0.9607309277080982



