In [98]:
import os
from preprocessing import TextCleaner
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import eli5
import json
import joblib
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
import random
# Handle doc/docx/pdf/rtf
import textract
import docx2txt
import fitz
import nltk
import re
import string
from striprtf.striprtf import rtf_to_text

In [99]:
def read_file(filename: str):
    if filename.endswith("docx"):
        text = docx2txt.process(filename)
    elif filename.endswith("pdf"):
        doc = fitz.open(filename)
        text = []
        for page in doc:
            text.append(page.get_text())
        text = " ".join(text)
    elif filename.endswith("doc"):
        text = reinterpret(textract.process(filename))
        text = remove_convert_info(text)
    elif filename.endswith("rtf"):
        with open(filename) as f:
            content = f.read()
            text = rtf_to_text(content)
    else:
        raise ValueError(
            "Does not support the current file extension, currently supported files: 'docx', 'doc' and 'pdf'"
        )
    return text


def read_json(filename: str):
    with open(filename, "r") as f:
        classes = json.load(f)
    return classes


def reinterpret(text: str):
    return text.decode('utf8')


def remove_convert_info(text: str):
    for i, s in enumerate(text):
        if s == ":":
            break
    return text[i + 6:]


def save_cls2index(filename: str = "hacka-aka-embedika/cls2index.json"):
    with open(filename, "w") as f:
        json.dump(cls2index, f, ensure_ascii=False)


def metrics(true, preds):
    acc = accuracy_score(true, preds)
    p = precision_score(true, preds, average='micro')
    f1 = f1_score(true, preds, average='micro')
    recall = recall_score(true, preds, average='micro')
    return {
        "accuracy_score": acc,
        "precision_score": p,
        "f1_score": f1,
        "recall_score": recall,
    }

def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
seed_all(42)

In [100]:
json_file = "hacka-aka-embedika/classes.json"
dataset_path = "hacka-aka-embedika/docs"
classes = read_json(json_file)

In [101]:
data = {"filename": [], "text": [], "class": []}
cls2index = {}
for filename, cls in classes.items():
    print(filename)
    data["filename"].append(filename)
    text = read_file(os.path.join(dataset_path, filename))
    data["text"].append(text)
    cls = cls.strip()
    data["class"].append(cls)
    if cls not in cls2index:
        cls2index[cls] = len(cls2index)

5908cb5da047d6c9e6dfea6337fb3189.doc
14711e4fc8e56f0c75856c8837ec04cb.doc
7eb67b5aecf3f3190aab0a5f8ea32172.docx
b40a9d048b199d5f4db62a6a2335f2a0.pdf
84fec112d02288861e7af59f468131fb.docx
f6377999f8a5aa9a09b03e428ac93153.doc
a525f050cef10dee3a42468daec064ff.doc
bec0aa38d1383172690a18d16b07f154.doc
214d620d9c54bc83111277dd872d3cb2.pdf
d143c89d002fcef3e2bd2efdb4966f55.doc
2fd747f38e30ae7ce1c9d6e3b907ac5d.doc
4c2c295e81f4a6c3e669e8f76c6ce423.docx
64f58bc6e1207a570a38d771609b2cf1.docx
7ecd641f2ad81961c17455ed3ebeb2ab.doc
4e583dc5a5f1499fd2408f3152589f2d.doc
79104075f8b2ff971d51c495e67af52c.pdf
19e2becdb0f10e1c16a5a2460f3a84a2.pdf
0f7f507d0af90aba3c35484de016d8b4.doc
69ab7557dee21939aa7432b23a54cb2b.doc
8b82f3c800e486d9da9a13c98f7a40d6.doc
856860329f573bbaf158e1eafa885ba5.docx
35b6a0f57d909507c5aa9a8972b15f35.pdf
f28d4a853be12515dae73a5912bc5b41.doc
2c758805e2917306e6cbb079e2adcfcf.rtf
4db6b233fda895c3bffcb5fdc5b8e1de.rtf
18d573815c15b4e798bdfbfb52fb2f43.docx
d54c0b06162cf7cf57e8b1e7356aa204

In [102]:
df = pd.DataFrame.from_dict(data)
df["cls"] = df["class"].apply(lambda row: cls2index[row])
df

Unnamed: 0,filename,text,class,cls
0,5908cb5da047d6c9e6dfea6337fb3189.doc,\n﻿\t\tДОГОВОР \n\nг. Москва\n«__» ________ 20...,Договоры поставки,0
1,14711e4fc8e56f0c75856c8837ec04cb.doc,"\n﻿\n\nДоговор №______________\n\n\n Дата, мес...",Договоры поставки,0
2,7eb67b5aecf3f3190aab0a5f8ea32172.docx,ДОГОВОР ЗАКУПКИ № __________/\n\n\n\nг.\t\t\t\...,Договоры поставки,0
3,b40a9d048b199d5f4db62a6a2335f2a0.pdf,\n \n \nДОГОВОР ПОСТАВКИ № 1 \nг. Москва \n ...,Договоры поставки,0
4,84fec112d02288861e7af59f468131fb.docx,Договор № {НомерДокумента}\n\n\n\n{ДатаДокуме...,Договоры поставки,0
...,...,...,...,...
114,f57fe87f15a6dee2b17e804421be63b5.pdf,Страница 1 из 8 \n \n \nДОГОВОР КУПЛИ-ПРОДАЖИ ...,Договоры купли-продажи,4
115,1ea8809d696a4bd6a2076fbc6fd28c23.doc,\n﻿ДОГОВОР\n\nг. Москва\n«_____» _____________...,Договоры купли-продажи,4
116,57962abd8dbe0ce0c7056896ee4501f1.doc,\n﻿ПРЕДВАРИТЕЛЬНЫЙ ДОГОВОР\n\nг. Москва\n«____...,Договоры купли-продажи,4
117,2c81df29db63aebf495106881a52188f.doc,\n﻿ДОГОВОР\n№ __\n\nг.________________\t\t\t\t...,Договоры купли-продажи,4


In [103]:
cls = read_json("index2cls.json")
cls

{'0': 'Договоры поставки',
 '1': 'Договоры оказания услуг',
 '2': 'Договоры подряда',
 '3': 'Договоры аренды',
 '4': 'Договоры купли-продажи'}

In [104]:
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
model = BertForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased-sentence", num_labels=len(cls)).to("cuda")

loading file vocab.txt from cache at /home/xrenya/.cache/huggingface/hub/models--DeepPavlov--rubert-base-cased-sentence/snapshots/78b5122d6365337dd4114281b0d08cd1edbb3bc8/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/xrenya/.cache/huggingface/hub/models--DeepPavlov--rubert-base-cased-sentence/snapshots/78b5122d6365337dd4114281b0d08cd1edbb3bc8/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/xrenya/.cache/huggingface/hub/models--DeepPavlov--rubert-base-cased-sentence/snapshots/78b5122d6365337dd4114281b0d08cd1edbb3bc8/tokenizer_config.json
loading configuration file config.json from cache at /home/xrenya/.cache/huggingface/hub/models--DeepPavlov--rubert-base-cased-sentence/snapshots/78b5122d6365337dd4114281b0d08cd1edbb3bc8/config.json
Model config BertConfig {
  "_name_or_path": "DeepPavlov/rubert-base-cased-sentence",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_p

In [105]:
for param in model.bert.parameters():
    param.requires_grad = False

In [106]:
X_train, X_test, y_train, y_test = train_test_split(df["text"].values.tolist(), df["cls"].values.tolist(), test_size=0.2, random_state=42)

In [107]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


def text_preprocessing(text):
    """
    Cleaning and parsing the text.

    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    #remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    combined_text = ' '.join(tokenized_text)
    return combined_text

In [108]:
X_train_clean = []
X_test_clean = []

for s in X_train:
    X_train_clean.append(text_preprocessing(s))

for s in X_test:
    X_test_clean.append(text_preprocessing(s))

In [76]:
tokens_train_ = tokenizer.batch_encode_plus(
    X_train_clean,
    padding=True,
    truncation=True
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [109]:
tokens_train = tokenizer.batch_encode_plus(
    X_train_clean,
    max_length=512,
    padding=True,
    truncation=True
)
tokens_test = tokenizer.batch_encode_plus(
    X_test_clean,
    max_length=512,
    padding=True,
    truncation=True
)

In [110]:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Data(tokens_train, y_train)
test_dataset = Data(tokens_test, y_test)

In [111]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='micro')
    return {'F1': f1}

In [112]:
training_args = TrainingArguments(
    output_dir='./results', #Выходной каталог
    num_train_epochs=150, #Кол-во эпох для обучения
    per_device_train_batch_size=96, #Размер пакета для каждого устройства во время обучения
    per_device_eval_batch_size=96, #Размер пакета для каждого устройства во время валидации
    weight_decay=0.00001, #Понижение весов
    logging_dir='./logs', #Каталог для хранения журналов
    load_best_model_at_end=True, #Загружать ли лучшую модель после обучения
    learning_rate=0.002, #Скорость обучения
    evaluation_strategy='epoch', #Валидация после каждой эпохи (можно сделать после конкретного кол-ва шагов)
    logging_strategy='epoch', #Логирование после каждой эпохи
    save_strategy='epoch', #Сохранение после каждой эпохи
    save_total_limit=1,
    seed=42)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [113]:
trainer = Trainer(model=model,
                  tokenizer=tokenizer,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=train_dataset,
                  compute_metrics=compute_metrics)

In [114]:
trainer.train()

***** Running training *****
  Num examples = 95
  Num Epochs = 150
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 150
  Number of trainable parameters = 3845


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 95
  Batch size = 96
Saving model checkpoint to ./results/checkpoint-1
Configuration saved in ./results/checkpoint-1/config.json
Model weights saved in ./results/checkpoint-1/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 95
  Batch size = 96
Saving model checkpoint to ./results/checkpoint-2
Configuration saved in ./results/checkpoint-2/config.json
Model weights saved in ./results/checkpoint-2/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 95
  Batch size = 96
Saving model checkpoint to ./results/checkpoint-3
Configuration saved in ./results/checkpoint-3/config.json
Model weights saved in ./results/chec

TrainOutput(global_step=150, training_loss=0.8601989014943441, metrics={'train_runtime': 705.6321, 'train_samples_per_second': 20.195, 'train_steps_per_second': 0.213, 'total_flos': 3749433530112000.0, 'train_loss': 0.8601989014943441, 'epoch': 150.0})

In [115]:
def get_prediction():
    test_pred = trainer.predict(test_dataset)
    labels = np.argmax(test_pred.predictions, axis=-1)
    return labels
pred = get_prediction()

***** Running Prediction *****
  Num examples = 24
  Batch size = 96


In [116]:
metrics(y_test, pred)

{'accuracy_score': 0.9583333333333334,
 'precision_score': 0.9583333333333334,
 'f1_score': 0.9583333333333334,
 'recall_score': 0.9583333333333334}

In [117]:
model_path = "finetunebert"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in finetunebert/config.json
Model weights saved in finetunebert/pytorch_model.bin
tokenizer config file saved in finetunebert/tokenizer_config.json
Special tokens file saved in finetunebert/special_tokens_map.json


('finetunebert/tokenizer_config.json',
 'finetunebert/special_tokens_map.json',
 'finetunebert/vocab.txt',
 'finetunebert/added_tokens.json')