In [1]:
import numpy as np
import os
import pandas as pd
import re
from sklearn import metrics
from sklearn.model_selection import train_test_split
import string
import torch
from torch import nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
import transformers
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel, BertForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModel

In [11]:
def remove_emoji(text):
    '''Replace every run of characters from the emoji ranges below with one space.

    Adjacent matching characters are collapsed into a single space because the
    character class is quantified with ``+``.
    '''
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        # NOTE(review): this range runs from U+24C2 to U+1F251 and therefore
        # also matches many non-emoji characters (e.g. CJK ideographs).
        "\U000024C2-\U0001F251",
        "\U0001F923",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    matcher = re.compile(character_class, flags=re.UNICODE)
    return matcher.sub(' ', text)

def clean_text(text):
    '''Normalise a raw comment for tokenisation.

    Steps, applied in this order (each match is replaced by a single space):
      1. lowercase the text;
      2. drop anything enclosed in square brackets;
      3. drop http(s):// and www. links;
      4. drop angle-bracket tags such as <br>;
      5. drop ASCII punctuation (``string.punctuation``);
      6. replace newlines;
      7. replace every digit (digits only; the letters around them are kept);
      8. collapse runs of two or more spaces into one.

    Leading/trailing spaces are not stripped.
    '''
    # Raw strings keep regex escapes such as \[ \S \d from being parsed as
    # (invalid) Python string escapes, which emit warnings on modern Python.
    text = text.lower()
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r' {2,}', ' ', text)
    return text

class CustomDataset(Dataset):

    '''Map-style dataset that pairs each sentence with its label.

    Items are returned as ``{'text': <str>, 'label': <label>}``; tokenisation is
    not done here.
    '''

    def __init__(self, sentences, labels):
        # Both containers are stored as given and indexed positionally.
        self.sentences = sentences
        self.labels = labels
        # Size is captured once, from the sentences container.
        self.len = len(sentences)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # str() guarantees a string even if the stored item is not one.
        return dict(
            text=str(self.sentences[index]),
            label=self.labels[index],
        )


def train(model, tokenizer, training_loader, device, epoch, optimizer=None, max_length=512):

    '''Run one training epoch over ``training_loader``.

    Parameters
    ----------
    model : callable as ``model(ids, mask, labels=...)``; element 0 of the
        result is used as the loss.
    tokenizer : callable that turns a batch of strings into ``input_ids`` and
        ``attention_mask`` tensors.
    training_loader : iterable of batches shaped like
        ``{'text': [...], 'label': tensor}``.
    device : device the encoded batch and labels are moved to.
    epoch : epoch number, used only in the progress message.
    optimizer : optimizer to step. When omitted, the module-level ``optimizer``
        is used so existing five-argument calls keep working.
    max_length : token limit applied via truncation so over-long texts cannot
        exceed the model's position limit (512 is assumed for BERT-base;
        confirm for other checkpoints). ``None`` defers to the tokenizer's own
        configured maximum.

    Raises
    ------
    ValueError : if no optimizer is passed and none exists at module level.
    '''

    if optimizer is None:
        # Legacy behaviour: the original implementation read a global.
        optimizer = globals().get('optimizer')
        if optimizer is None:
            raise ValueError(
                "train() needs an optimizer: pass optimizer=... or define a "
                "module-level `optimizer` before calling it")

    model.train()
    for i, data in enumerate(tqdm(training_loader)):

        # Pad to the longest text in the batch; truncate anything too long.
        encoding = tokenizer(
            data['text'],
            add_special_tokens=True,
            padding='longest',
            truncation=True,
            max_length=max_length,
            return_tensors='pt').to(device)

        ids = encoding['input_ids']
        mask = encoding['attention_mask']
        targets = data['label'].to(device, dtype = torch.long)
        loss = model(ids, mask, labels=targets)[0]

        # Lightweight progress log: first batch and every 500th after it.
        if i%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        

        
def validate(model, tokenizer, testing_loader, device, max_length=512):

    '''Evaluate ``model`` on ``testing_loader`` and print loss, recall and precision.

    Parameters
    ----------
    model : callable as ``model(ids, mask, labels=...)``; the first two elements
        of the result are used as ``(loss, logits)``.
    tokenizer : callable that turns a batch of strings into ``input_ids`` and
        ``attention_mask`` tensors.
    testing_loader : iterable of batches shaped like
        ``{'text': [...], 'label': tensor}``.
    device : device the encoded batch and labels are moved to.
    max_length : token limit applied via truncation so over-long texts cannot
        exceed the model's position limit (512 is assumed for BERT-base;
        confirm for other checkpoints). ``None`` defers to the tokenizer's own
        configured maximum.

    The reported loss is the mean of the per-batch losses. Recall and precision
    are computed by scikit-learn with its default settings. Nothing is returned.

    Raises
    ------
    ValueError : if ``testing_loader`` yields no batches.
    '''

    model.eval()
    eval_loss = 0.0
    predictions, true_labels = [], []
    nb_eval_steps = 0
    with torch.no_grad():
        for data in testing_loader:

            # Pad to the longest text in the batch; truncate anything too long.
            encoding = tokenizer(
                data['text'],
                add_special_tokens=True,
                padding='longest',
                truncation=True,
                max_length=max_length,
                return_tensors='pt').to(device)

            ids = encoding['input_ids']
            mask = encoding['attention_mask']
            targets = data['label'].to(device, dtype = torch.long)
            output = model(ids, mask, labels=targets)
            loss, logits = output[:2]
            logits = logits.detach().cpu().numpy()
            label_ids = targets.to('cpu').numpy()
            # Predicted class = index of the largest logit in each row.
            predictions.extend(np.argmax(logits, axis=1))
            true_labels.extend(label_ids)
            eval_loss += loss.mean().item()
            nb_eval_steps += 1

    # Fail with a clear message instead of a ZeroDivisionError below.
    if nb_eval_steps == 0:
        raise ValueError("testing_loader yielded no batches; nothing to validate")

    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Recall: {}".format(metrics.recall_score(true_labels, predictions)))
    print("Precision: {}".format(metrics.precision_score(true_labels, predictions)))
        
class BERTClassifier(torch.nn.Module):
    '''Thin ``nn.Module`` wrapper around ``BertForSequenceClassification``.

    Parameters
    ----------
    model_name : checkpoint identifier or path handed to ``from_pretrained``.
    num_labels : number of output classes (default 2, the previous fixed value).
    '''

    def __init__(self, model_name, num_labels=2):

        super().__init__()
        self.model = model_name
        # Attention maps and hidden states are switched off in the outputs.
        self.l1 = BertForSequenceClassification.from_pretrained(
            self.model,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )

    def forward(self, ids, mask, labels=None):
        '''Forward ``ids``/``mask`` (and optional ``labels``) to the wrapped model.

        ``labels`` may be omitted for inference; the wrapped model's output is
        returned unchanged. Per the Transformers documentation, a loss is only
        included in that output when labels are supplied.
        '''
        return self.l1(input_ids=ids, attention_mask=mask, labels=labels)

In [3]:
# Parameters

# -- Data location and column names --
dir_data = "./"
col_text = "comment"
col_target = "toxic"

# -- Train/test split --
TEST_SIZE = 0.3
RANDOM_STATE = 2021

# -- Hardware: first CUDA device when available, otherwise CPU --
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# -- Batching --
# NOTE(review): the loaders visible in this notebook use BATCH_SIZE only;
# TRAIN_BATCH_SIZE / VALID_BATCH_SIZE are not referenced by the cells shown.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
BATCH_SIZE = 4

# -- Optimisation and checkpoint --
EPOCHS = 2
LEARNING_RATE = 2e-05
MODEL_NAME = 'DeepPavlov/rubert-base-cased'

In [4]:
# OK toxic classification challenge dataset
# Each non-empty line is tab-separated: the last field is the comment text and
# the fields between the first and the last carry the label(s).
with open(os.path.join(dir_data, "okcup_train.txt"), encoding='utf8') as f:
    raw_lines = f.read().split("\n")

texts = []
classes = []
for line in raw_lines:
    # Skip empty lines (typically the one after the final newline) instead of
    # unconditionally dropping the last row, which lost a real sample whenever
    # the file did not end with a newline.
    if not line:
        continue
    row = line.split("\t")
    classes.append(int(" ".join(row[1:-1])!='__label__NORMAL')) # At least one class is toxic
    texts.append(row[-1])

df_part1 = pd.DataFrame({col_text: texts, col_target: classes})

# https://www.kaggle.com/blackmoon/russian-language-toxic-comments
# NOTE(review): assumes this CSV exposes the same two columns as
# col_text / col_target — confirm against the file.
df_part2 = pd.read_csv(os.path.join(dir_data, "toxic_labeled.csv"))
df_part2[col_target] = df_part2[col_target].astype('int64')

df = pd.concat([df_part1, df_part2]).reset_index(drop=True)

# Preprocess dataset: emoji removal first, then text normalisation
df[col_text] = df[col_text].apply(remove_emoji)
df[col_text] = df[col_text].apply(clean_text)

In [5]:
# Work on a small random subset to keep the run short. The sample is seeded so
# a re-run selects the same rows, and capped at the frame size so it cannot
# fail when fewer than 1000 rows are available.
df = df.sample(n=min(1000, len(df)), random_state=RANDOM_STATE)

In [6]:
# Hold out TEST_SIZE of the rows for evaluation, stratified on the target
# column and seeded with RANDOM_STATE.
df_train, df_test = train_test_split(
    df,
    stratify=df[col_target],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [7]:
# To use TPU and seqeval metrics
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev
# !pip -q install seqeval

In [8]:
# Texts become plain Python lists; labels stay as the column's array.
train_labels = df_train[col_target].values
train_sentences = df_train[col_text].tolist()

test_labels = df_test[col_target].values
test_sentences = df_test[col_text].tolist()

In [9]:
# Seed torch before anything random happens (the classifier head is newly
# initialised and the training loader shuffles) so a re-run is repeatable.
torch.manual_seed(RANDOM_STATE)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BERTClassifier(MODEL_NAME)
model.to(DEVICE)

print("TRAIN Dataset: {}".format(len(train_sentences)))
print("TEST Dataset: {}".format(len(test_sentences)))

# Reshuffle the training data every epoch; evaluation order stays fixed.
training_loader = DataLoader(
    CustomDataset(train_sentences, train_labels),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0)

testing_loader = DataLoader(
    CustomDataset(test_sentences, test_labels),
    batch_size=BATCH_SIZE,
    num_workers=0)

# train() picks this optimizer up from the notebook namespace.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    train(model, tokenizer, training_loader, DEVICE, epoch)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAIN Dataset: 700
TEST Dataset: 300


HBox(children=(IntProgress(value=0, max=175), HTML(value='')))

Epoch: 0, Loss:  0.6728735566139221



HBox(children=(IntProgress(value=0, max=175), HTML(value='')))

Epoch: 1, Loss:  0.16329050064086914



In [12]:
 # Report loss, recall and precision on the held-out split.
 validate(model=model, tokenizer=tokenizer, testing_loader=testing_loader, device=DEVICE)

Validation loss: 0.23974343885978064
Recall: 0.7083333333333334
Precision: 0.7083333333333334
