In [121]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import torch
import torch.nn as nn

from transformers import BertTokenizer
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
from transformers import BertTokenizerFast as BertTokenizer

import pytorch_lightning as pl
from ModelModule import ClassificationModule
from DataModule import AppsDataModule

## Для Bert (база)

In [68]:
def txt_reading():

    data_path = '****'
    files = os.listdir(data_path)
    
    try:
        main_df = pd.read_csv(data_path+files[0])
    except:
        main_df = pd.read_excel(data_path+files[0])
    
    #for file in tqdm(files[1:]):
    #    buf_df = pd.read_csv(data_path+file)
    #    main_df = pd.concat([main_df, buf_df], ignore_index=True, axis=0)
    
    main_df.dropna(ignore_index=True, inplace=True)
    main_df['txt'] = main_df['txt'].apply(lambda x: x.split('Содержание обращения: \n')[1] if 'Содержание обращения: \n' in x else x)
    main_df['txt'] = main_df['txt'].apply(lambda x: x.split('Приложение')[0] if 'Приложение' in x else x)
    main_df['txt'] = main_df['txt'].apply(lambda x: x.replace('\n', ''))
    main_df['txt'] = main_df['txt'].apply(lambda x: x.strip('/'))
    
    return main_df   

In [69]:
df = txt_reading()

In [70]:
tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny2')



In [71]:
model = BertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny2')
out_features = model.bert.encoder.layer[1].output.dense.out_features
model.classifier = torch.nn.Linear(out_features, 2840)

classes = pd.read_excel('C:/Users/GorbachPP/PycharmProjects/BertFT/Классы.xlsx')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
model_state_dict = torch.load('C:/Users/GorbachPP/PycharmProjects/BertFT/checkpoints/exp_2/step=7245.ckpt')

In [73]:
bert_state_dict = {
    key.replace('model.', ''): value
    for key, value in model_state_dict["state_dict"].items()  if 'model.' in key
}

model.load_state_dict(bert_state_dict)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-

In [50]:
results = []

for i in tqdm(range(df.shape[0])):
    encoding = tokenizer.encode_plus(
            df.txt.iloc[i],
            add_special_tokens=True,
            max_length=1024,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
    
    out = {
              'text': df.txt.iloc[i],
              'input_ids': encoding['input_ids'].flatten(),
              'attention_mask': encoding['attention_mask'].flatten()
          }
    
    input_ids = out["input_ids"]
    attention_mask = out["attention_mask"]
    
    outputs = model(
        input_ids=input_ids.unsqueeze(0),
        attention_mask=attention_mask.unsqueeze(0)
    )
    
    prediction = torch.argmax(outputs.logits, dim=1).numpy()[0]
    
    result = classes[classes['Класс']==prediction]['Код'].iloc[0]
    results.append(result)

100%|████████████████████████████████████████████████████████████████████████████████| 956/956 [01:47<00:00,  8.87it/s]


In [439]:
df['results'] = results

In [440]:
df['code_parts'] = df['code'].apply(lambda x: x.split('.')[:-1])
df['res_parts'] = df['results'].apply(lambda x: x.split('.')[:-1])

In [441]:
good = 0
for i in range(df.shape[0]):
    if df['code_parts'].iloc[i][0] == df['res_parts'].iloc[i][0]:
        good+=1

print(f'first part accuracy: {good/df.shape[0]}')

good = 0
for i in range(df.shape[0]):
    if (df['code_parts'].iloc[i][0] == df['res_parts'].iloc[i][0]) and (df['code_parts'].iloc[i][1] == df['res_parts'].iloc[i][1]):
        good+=1

print(f'first+second part accuracy: {good/df.shape[0]}')

good = 0
for i in range(df.shape[0]):
    if (df['code_parts'].iloc[i][0] == df['res_parts'].iloc[i][0]) and (df['code_parts'].iloc[i][1] == df['res_parts'].iloc[i][1]) and (df['code_parts'].iloc[i][2] == df['res_parts'].iloc[i][2]):
        good+=1

print(f'first+second+third part accuracy: {good/df.shape[0]}')

print(f'full accuracy: {df[df['code']==df['results']].shape[0]/df.shape[0]}')

first part accuracy: 0.6705020920502092
first+second part accuracy: 0.5679916317991632
first+second+third part accuracy: 0.25732217573221755
full accuracy: 0.1997907949790795


In [442]:
#Значения предыдущего чекпоинта
#first part accuracy: 0.6746861924686193
#first+second part accuracy: 0.5679916317991632
#first+second+third part accuracy: 0.26778242677824265
#full accuracy: 0.2102510460251046

## Топ-5

In [443]:
top_5_results = []

for i in tqdm(range(df.shape[0])):
    encoding = tokenizer.encode_plus(
            df.txt.iloc[i],
            add_special_tokens=True,
            max_length=1024,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
    
    out = {
              'text': df.txt.iloc[i],
              'input_ids': encoding['input_ids'].flatten(),
              'attention_mask': encoding['attention_mask'].flatten()
          }
    
    input_ids = out["input_ids"]
    attention_mask = out["attention_mask"]
    
    outputs = model(
        input_ids=input_ids.unsqueeze(0),
        attention_mask=attention_mask.unsqueeze(0)
    )
    
    prediction = torch.topk(outputs.logits, 5, dim=1).indices.numpy()[0]

    predictions = []
    for pred in prediction:
        predictions.append(classes[classes['Класс']==pred]['Код'].iloc[0])
        
    top_5_results.append(predictions)

100%|████████████████████████████████████████████████████████████████████████████████| 956/956 [02:53<00:00,  5.50it/s]


In [444]:
top_5_results

[['1.1.8.1.', '1.1.8.', '1.1.8.2.', '1.1.8.4.', '19.3.2.1.'],
 ['1.1.11.1.', '1.1.10.1.', '1.1.10.2.', '2.1.6.1.', '1.2.11.1.'],
 ['1.1.17.', '10.4.24.', '10.4.3.', '1.1.13.3.', '15.3.17.1.'],
 ['2.1.3.1.', '2.1.3.', '2.1.4.', '1.1.8.1.', '1.1.8.'],
 ['1.2.8.2.', '1.2.8.1.', '1.2.8.', '1.2.11.1.', '1.2.9.'],
 ['3.4.1.', '3.4.7.', '3.4.4.1.', '3.4.9.', '3.4.13.1.'],
 ['1.1.12.1.', '1.1.9.', '1.1.17.', '1.1.12.2.', '1.1.1.3.'],
 ['10.4.9.', '10.4.24.', '10.4.3.', '10.3.7.', '10.4.20.'],
 ['1.2.7.', '1.1.7.', '1.2.13.1.', '1.2.22.', '3.4.7.'],
 ['6.9.7.', '6.4.7.', '6.5.7.', '10.4.10.', '1.1.7.'],
 ['1.2.9.', '1.2.1.3.', '1.2.8.1.', '1.2.8.', '1.2.8.2.'],
 ['1.2.8.1.', '1.2.8.2.', '1.2.9.', '1.2.8.', '1.2.1.3.'],
 ['1.2.8.1.', '1.2.8.2.', '1.2.9.', '1.2.8.', '1.2.1.3.'],
 ['1.1.12.1.', '2.1.16.', '5.1.5.1.', '1.1.17.', '1.1.36.'],
 ['1.2.19.3.', '1.2.19.2.', '1.1.19.3.', '1.1.19.2.', '1.1.19.1.'],
 ['1.2.11.3.', '1.2.8.2.', '1.2.11.1.', '1.2.8.1.', '1.2.8.'],
 ['1.1.8.1.', '1.1.8.2.', '1.

In [445]:
df['top_5_results'] = top_5_results

In [446]:
good = 0
for i in range(df.shape[0]):
    if df['code'].iloc[i] in df['top_5_results'].iloc[i]:
        good+=1

print(f'"top 5" acc: {good/df.shape[0]}')

"top 5" acc: 0.42573221757322177
