# 1. Data Preprocessing

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import trange
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification, BertTokenizer

os.chdir('..')

warnings.simplefilter("ignore")

In [2]:
import json

# read classes.json
with open('data/classes.json', encoding='utf-8') as f:
    data = {key: value.split('/')[-1] for (key, value) in json.load(f).items()}
data

{'5908cb5da047d6c9e6dfea6337fb3189.doc': 'Договоры поставки',
 '14711e4fc8e56f0c75856c8837ec04cb.doc': 'Договоры поставки',
 '7eb67b5aecf3f3190aab0a5f8ea32172.docx': 'Договоры поставки',
 'b40a9d048b199d5f4db62a6a2335f2a0.pdf': 'Договоры поставки',
 '84fec112d02288861e7af59f468131fb.docx': 'Договоры поставки',
 'f6377999f8a5aa9a09b03e428ac93153.doc': 'Договоры поставки',
 'a525f050cef10dee3a42468daec064ff.doc': 'Договоры поставки',
 'bec0aa38d1383172690a18d16b07f154.doc': 'Договоры поставки',
 '214d620d9c54bc83111277dd872d3cb2.pdf': 'Договоры поставки',
 'd143c89d002fcef3e2bd2efdb4966f55.doc': 'Договоры поставки',
 '2fd747f38e30ae7ce1c9d6e3b907ac5d.doc': 'Договоры поставки',
 '4c2c295e81f4a6c3e669e8f76c6ce423.docx': 'Договоры поставки',
 '64f58bc6e1207a570a38d771609b2cf1.docx': 'Договоры поставки',
 '7ecd641f2ad81961c17455ed3ebeb2ab.doc': 'Договоры поставки',
 '4e583dc5a5f1499fd2408f3152589f2d.doc': 'Договоры поставки',
 '79104075f8b2ff971d51c495e67af52c.pdf': 'Договоры поставки',
 '19

In [3]:
classes = list(set(list(data.values())))
classes.sort()
classes

['Договоры аренды',
 'Договоры купли-продажи',
 'Договоры оказания услуг',
 'Договоры подряда',
 'Договоры поставки']

In [4]:
from src.neuro import summarize_file

summarize = lambda x: summarize_file(x, sentence_number=5)

df = pd.DataFrame({'label': [], 'text': []})
for key, value in tqdm(data.items(), total=len(data)):
    df.loc[len(df)] = pd.Series({'label': int(classes.index(value)),
                                 'text': str(summarize(
                                     os.path.join('data', 'docs', key)))})

[nltk_data] Downloading package punkt to /home/werserk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/120 [00:00<?, ?it/s]

In [5]:
df.head()

Unnamed: 0,label,text
0,4,"""Поставщик"" гарантирует ""Покупателю"" нормальну..."
1,4,Покупатель имеет право заявить Поставщику об о...
2,4,Копию железнодорожной квитанции Поставщик пере...
3,4,2.2.В случае отказа от заказного Товара до его...
4,4,В письменной форме известить Покупателя (Грузо...


In [6]:
df['label'] = df['label'].astype('uint8')
df['text'] = df['text'].astype('str')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 119
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   120 non-null    uint8 
 1   text    120 non-null    object
dtypes: object(1), uint8(1)
memory usage: 2.0+ KB


In [8]:
tokenizer = BertTokenizer.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    do_lower_case=False
)

In [9]:
input_ids = []
attention_masks = []
texts = df['text'].values
labels = df['label'].values

for text in texts:
    encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=512, pad_to_max_length=True,
                                         return_attention_mask=True, return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
input_ids.shape, attention_masks.shape, labels.shape 

(torch.Size([120, 64]), torch.Size([120, 64]), torch.Size([120]))

In [11]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 10

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=val_ratio,
    shuffle=True,
    stratify=labels)

# Train and validation sets
train_set = TensorDataset(input_ids[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(input_ids[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
    train_set,
    sampler=RandomSampler(train_set),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_set,
    sampler=SequentialSampler(val_set),
    batch_size=batch_size
)

In [12]:
len(train_dataloader), len(validation_dataloader)

(12, 3)

In [13]:
def b_metrics(preds, labels):
    preds = np.argmax(preds, axis=1).flatten()
    labels = labels.flatten()
    b_accuracy = (preds == labels).mean()
    return b_accuracy

In [14]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    num_labels=len(classes),
)
model.config.problem_type = 'single_label_classification'

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=5e-5,
                              eps=1e-08
                              )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 3

for _ in trange(epochs, desc='Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids,
                                attention_mask=b_input_mask,
                                labels=b_labels)
            val_loss += eval_output.loss.item()
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation loss: {:.4f}'.format(val_loss / len(validation_dataloader)))
    print('\t - Validation Accuracy: {:.4f}'.format(
        sum(val_accuracy) / len(val_accuracy)))

Epoch:  33%|███▎      | 1/3 [00:02<00:05,  2.72s/it]


	 - Train loss: 1.6537
	 - Validation loss: 1.5373
	 - Validation Accuracy: 0.3167


Epoch:  67%|██████▋   | 2/3 [00:04<00:02,  2.22s/it]


	 - Train loss: 1.3244
	 - Validation loss: 0.7701
	 - Validation Accuracy: 1.0000


Epoch: 100%|██████████| 3/3 [00:06<00:00,  2.15s/it]


	 - Train loss: 0.6676
	 - Validation loss: 0.2184
	 - Validation Accuracy: 1.0000





In [16]:
# Save the model
from src.config import MODEL_PATH

model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

('../data/model_weights/tokenizer_config.json',
 '../data/model_weights/special_tokens_map.json',
 '../data/model_weights/vocab.txt',
 '../data/model_weights/added_tokens.json')