## Установка библиотек

Для BERT нужна библиотека

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

## Импорт библиотек и данных

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
np.random.seed(13)

In [3]:
path = '/content/drive/MyDrive/Contur/Data/'
name_train = 'train.tsv'

df = pd.read_csv(path + name_train, delimiter='\t')
df.head()

Unnamed: 0,title,is_fake
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1
1,Агент Кокорина назвал езду по встречке житейск...,0
2,Госдума рассмотрит возможность введения секрет...,1
3,ФАС заблокировала поставку скоростных трамваев...,0
4,Против Навального завели дело о недоносительст...,1


In [5]:
df['is_fake'] = df.is_fake.astype('int8')
df.is_fake.describe()

count    5758.000000
mean        0.500000
std         0.500043
min         0.000000
25%         0.000000
50%         0.500000
75%         1.000000
max         1.000000
Name: is_fake, dtype: float64

## Работа с моделью

Найдем максимальную длину заголовка, чтобы отправить ее в BERT

In [4]:
MAX_LENGTH = df.title.apply(len).max()
MAX_LENGTH

211

В качестве основной модели BERT был выбран RuBERT от deeppavlov, дообученный на датасете деликатных тем (классификация на политику, наркотики, проституцию, порнографию и т.п.):

[Skoltech / Russian Sensitive Topics](https://huggingface.co/Skoltech/russian-sensitive-topics)

Выбор пал на данную модель, так как заголовки статей чаще всего освещают именно деликатные темы, нежели что-то обыденное.

In [6]:
import torch
import numpy as np
from transformers import BertTokenizer

# Подгружаем предобученный токенизатор
MODEL_NAME = 'Skoltech/russian-sensitive-topics'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Для данных понадобится класс датасета
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        self.labels = [label for label in df['is_fake']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length=MAX_LENGTH+3, truncation=True,
                                return_tensors="pt") for text in df['title']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/524 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

Делим датасет на train/valid/test в соотношении: 80%/10%/10%

In [7]:
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=13), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train), len(df_val), len(df_test))

4606 576 576


In [8]:
from torch import nn
from transformers import BertModel

# Наша модель, по умолчанию дропаут 0.2,
# Функция активации - сигмоида (т.к. бинарная классификация)
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.2):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask,
                                     return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer_sigma = self.sigmoid(linear_output)

        return final_layer_sigma

In [9]:
from torch.optim import Adam
from tqdm import tqdm


# Функция тренировки модели
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Binary Cross Entropy
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            
            train_label = train_label.unsqueeze(1).float()
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.round() == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                val_label = val_label.unsqueeze(1).float()
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.round() == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 10
model = BertClassifier(dropout=0.2)
LR = 1e-5

Downloading:   0%|          | 0.00/680M [00:00<?, ?B/s]

Some weights of the model checkpoint at Skoltech/russian-sensitive-topics were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 2303/2303 [11:03<00:00,  3.47it/s]


Epochs: 1 | Train Loss:  0.166             | Train Accuracy:  0.856             | Val Loss:  0.119             | Val Accuracy:  0.915


100%|██████████| 2303/2303 [11:09<00:00,  3.44it/s]


Epochs: 2 | Train Loss:  0.044             | Train Accuracy:  0.971             | Val Loss:  0.111             | Val Accuracy:  0.920


100%|██████████| 2303/2303 [11:17<00:00,  3.40it/s]


Epochs: 3 | Train Loss:  0.016             | Train Accuracy:  0.992             | Val Loss:  0.125             | Val Accuracy:  0.913


100%|██████████| 2303/2303 [11:04<00:00,  3.46it/s]


Epochs: 4 | Train Loss:  0.008             | Train Accuracy:  0.995             | Val Loss:  0.179             | Val Accuracy:  0.917


100%|██████████| 2303/2303 [11:04<00:00,  3.46it/s]


Epochs: 5 | Train Loss:  0.006             | Train Accuracy:  0.998             | Val Loss:  0.219             | Val Accuracy:  0.917


100%|██████████| 2303/2303 [11:11<00:00,  3.43it/s]


Epochs: 6 | Train Loss:  0.004             | Train Accuracy:  0.998             | Val Loss:  0.219             | Val Accuracy:  0.924


100%|██████████| 2303/2303 [11:08<00:00,  3.44it/s]


Epochs: 7 | Train Loss:  0.006             | Train Accuracy:  0.998             | Val Loss:  0.285             | Val Accuracy:  0.892


100%|██████████| 2303/2303 [11:07<00:00,  3.45it/s]


Epochs: 8 | Train Loss:  0.008             | Train Accuracy:  0.996             | Val Loss:  0.179             | Val Accuracy:  0.915


100%|██████████| 2303/2303 [11:07<00:00,  3.45it/s]


Epochs: 9 | Train Loss:  0.006             | Train Accuracy:  0.997             | Val Loss:  0.264             | Val Accuracy:  0.903


100%|██████████| 2303/2303 [11:07<00:00,  3.45it/s]


Epochs: 10 | Train Loss:  0.004             | Train Accuracy:  0.998             | Val Loss:  0.272             | Val Accuracy:  0.905


In [11]:
def evaluate(model, test_data):
    '''
        Функция для оценки модели
        на тестовой выборке

        Input: model, test_data
        Output: test_labels, predictions
    '''
    test = Dataset(test_data)
    outputs = []
    test_labels = []

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            test_labels.append(test_label.numpy())

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            out = output.round().cpu().numpy()
            outputs.append(out)
            
            test_label = test_label.unsqueeze(1).float()
            acc = (output.round() == test_label).sum().item()

            total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

    assert len(outputs) == len(test_labels), 'Something is wrong'

    return test_labels, outputs

In [12]:
true_labels, predictions = evaluate(model, df_test)

Test Accuracy:  0.924


In [13]:
from sklearn.metrics import classification_report

print('Test classification report: ')
print(classification_report(np.array(true_labels).flatten(),
                            np.array(predictions).flatten()))

Test classification report: 
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       315
           1       0.88      0.97      0.92       261

    accuracy                           0.92       576
   macro avg       0.92      0.93      0.92       576
weighted avg       0.93      0.92      0.92       576



Сохраняем модель, не дай Бог колаб снова вылетит

In [14]:
path = '/content/drive/MyDrive/Contur/Models'
model_name = 'RuBERT_from_SKOLKOVO_ru_news.pth'

torch.save(model, path + '/' + model_name)

Если вылетел, подгрузим модель из файла

In [None]:
load = False

In [None]:
if load:
    path = '/content/drive/MyDrive/Contur/Models'
    model_name = 'RuBERT_from_SKOLKOVO_ru_news.pth'
    model = torch.load(path + '/' + model_name)

In [20]:
from torch.utils.data import DataLoader

# Для данных понадобится класс датасета
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        self.labels = [label for label in df['is_fake']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length=MAX_LENGTH+3, truncation=True,
                                return_tensors="pt") for text in df['title']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

def inference(model, df_test):
    '''
        Функция для предсказаний
        Input: model, df_test
        Output: predictions
    '''
    test_data = Dataset(df_test)
    test_loader = DataLoader(dataset=test_data, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    outputs = []
    with torch.no_grad():

        for test_input, _ in test_loader:
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            out = output.round().cpu().numpy()
            outputs.append(out)

    return outputs

In [21]:
path = '/content/drive/MyDrive/Contur/Data/'
name_test = 'test.tsv'
df_test = pd.read_csv(path + name_test, delimiter='\t')

df_test.head()

Unnamed: 0,title,is_fake
0,Роскомнадзор представил реестр сочетаний цвето...,0
1,Ночью под Минском на президентской горе Белара...,0
2,Бывший спичрайтер Юрия Лозы рассказал о трудно...,0
3,"Сельская церковь, собравшая рекордно низкое ко...",0
4,Акции Google рухнули после объявления о переза...,0


Получаем предсказания и помещаем в DataFrame()

In [22]:
test_predictions = inference(model, df_test)

Unnamed: 0,title,is_fake
0,Роскомнадзор представил реестр сочетаний цвето...,[[1.0]]
1,Ночью под Минском на президентской горе Белара...,[[1.0]]
2,Бывший спичрайтер Юрия Лозы рассказал о трудно...,[[1.0]]
3,"Сельская церковь, собравшая рекордно низкое ко...",[[1.0]]
4,Акции Google рухнули после объявления о переза...,[[0.0]]


In [24]:
df_test['is_fake'] = np.array(test_predictions).flatten()
df_test['is_fake'] = df_test.is_fake.astype(np.int8)

df_test.head()

Unnamed: 0,title,is_fake
0,Роскомнадзор представил реестр сочетаний цвето...,1
1,Ночью под Минском на президентской горе Белара...,1
2,Бывший спичрайтер Юрия Лозы рассказал о трудно...,1
3,"Сельская церковь, собравшая рекордно низкое ко...",1
4,Акции Google рухнули после объявления о переза...,0


In [25]:
df_test.is_fake.describe()

count    1000.00000
mean        0.50000
std         0.50025
min         0.00000
25%         0.00000
50%         0.50000
75%         1.00000
max         1.00000
Name: is_fake, dtype: float64

Сохраняем в файл

In [27]:
path = '/content/drive/MyDrive/Contur/Data/'
df_test.to_csv(path + 'predictions.tsv',
               sep='\t',
               index=False)