# Классификация договоров по риску - нет риска, есть риск, аренда (втф)

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from typing import *
import re, gc

In [3]:
from sklearn.model_selection import train_test_split
from babel.dates import format_date, format_datetime, format_time
import time

In [4]:
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import get_linear_schedule_with_warmup, AdamW

In [5]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch
import torch.optim as optim

In [6]:
import nltk
import pymystem3
import gensim
import pymorphy2
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import SnowballStemmer

In [8]:
tokenizer_sber = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
model_sber = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru")

* здесь используются выжимки из python-docx абзац наиболее важной части контракта (не шаблонной)

In [9]:
data = pd.read_excel(r'model_out_total.xlsx')
data = data[['Шаблон', 'Флаг']]

In [10]:
morph = pymorphy2.MorphAnalyzer()

In [11]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stanislavilusin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stanislavilusin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

In [14]:
russian_stopwords.extend(['один','два',"три","четыре","пять","шесть","семь","восемь","девять","сто","двести",
                          "триста","четыреста","пятьсот","шестьсот","восемьсот","девятьсот","миллион","рубль","копейка",
                         'семьсот',"десять","двадцать","тридцать","сорок","пятьдесят","шестьдесят","семьдесят","восемьдесят",
                          "девяносто","тысяча","одиннадцать","двенадцать","тринадцать","четырнадцать","пятнадцать",
                          "шестнадцать","семнадцать","восемнадцать", "девятнадцать"])

In [15]:
def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [morph.parse(token)[0].normal_form for token in tokens]
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=|_|__|___)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower())+ ' '.join(emoticons).replace('-', '')).replace('_', '')
    
    tokens = []
    for sent in nltk.sent_tokenize(text, language = 'russian'):
        for word in nltk.word_tokenize(sent, language = 'russian'):
            if len(word) <= 3:
                continue
            word = morph.parse(word.lower())[0].normal_form
            tokens.append(word.lower())
    return tokens


In [16]:
for index, row in data.iterrows():
    data['Шаблон'].loc[index] = ' '.join(preprocess_text(row['Шаблон']))

In [17]:
data['Флаг'].value_counts()

постоплата    252
предоплата    244
помесячно     141
Name: Флаг, dtype: int64

In [18]:
y_risk = np.array(data['Флаг'])
y_risk = np.where(y_risk == 'постоплата' ,int(0), y_risk)
y_risk = np.where(y_risk == 'предоплата' ,int(1), y_risk)
y_risk = np.where(y_risk == 'помесячно' ,int(2), y_risk)
y_risk = y_risk.astype('int')
data['Флаг'] = y_risk

In [19]:
val_text, test_text, val_labels, test_labels = train_test_split(data['Шаблон'], data['Флаг'], 
                                                                random_state=2022, 
                                                                test_size=0.2, 
                                                                stratify=data['Флаг'])

In [26]:
text = ["Привет меня зовут Стас", "Это модель сбербанка", "Здесь я выведу хрень хрень что-то"]
sent_id = tokenizer_sber.batch_encode_plus(text, padding=True, return_token_type_ids = True, max_length = 2)

In [27]:
print(sent_id)

{'input_ids': [[101, 6571, 1024, 10160, 791, 381, 102, 0, 0, 0, 0], [101, 736, 7537, 83492, 9408, 102, 0, 0, 0, 0, 0], [101, 1640, 119, 14077, 724, 46779, 46779, 693, 133, 696, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [28]:
max_seq_len = 512 #?

In [23]:
# add_special_tokens=True 
# padding="longest"
# return_attention_mask=True
# pad_to_max_length=True

In [29]:
tokens_val = tokenizer_sber.batch_encode_plus(val_text.tolist(), max_length = max_seq_len, padding=True,
    truncation=True,
    return_token_type_ids=False)
tokens_test = tokenizer_sber.batch_encode_plus(test_text.tolist(), max_length = max_seq_len,padding=True,
    truncation=True,
    return_token_type_ids=False)

In [30]:
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [31]:
batch_size = 8 #?


# tensor
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler loader
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [32]:
# tensor
test_data = TensorDataset(test_seq, test_mask, test_y)

# sampler loader
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size=batch_size)

In [33]:
for param in model_sber.parameters():
    param.requires_grad = False

In [82]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert 
        self.conv = nn.Conv2d(in_channels=25, out_channels= 25, kernel_size= (3,1024),stride = (4,1), padding = (2,1)) #, stride = (5,2), padding = (3,3)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=1, padding = 1)#, padding = (1,1)
        self.flat = nn.Flatten()
        self.dropout = nn.Dropout(.05)
        self.relu =  nn.SELU()
        self.fc1 = nn.Linear(2475,256)
        self.fc2 = nn.Linear(256,3)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        _, _, all_layers = self.bert(sent_id, attention_mask=mask, output_hidden_states=True, return_dict=False) #return_dict=False)
        
        global test_bert
        
        test_bert =  self.bert(sent_id, attention_mask=mask, output_hidden_states=True, return_dict=False)
        
        a1,a2,a3 = self.bert(sent_id, attention_mask=mask, output_hidden_states=True, return_dict=False)
        print(a1.shape,a2.shape) # (8,8,1024) (8,1024)
        x = torch.cat(all_layers, 0)
        x = torch.transpose(torch.cat(tuple([t.unsqueeze(0) for t in all_layers]), 0), 0, 1)
        del all_layers
        gc.collect()
        torch.cuda.empty_cache()
        x = self.conv(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.pool(x)
        x = self.flat(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        #x = self.pool(self.dropout(self.relu(self.conv(self.dropout(x)))))
        #x = self.fc(self.dropout(self.flat(self.dropout(x))))
        return x

In [83]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda:0')

In [84]:
model = BERT_Arch(model_sber)
model = model.to(device)

In [85]:
optimizer = AdamW(model.parameters(), lr = 1e-3)

In [86]:
from sklearn.utils.class_weight import compute_class_weight

class_wts = compute_class_weight(class_weight = 'balanced', classes = np.unique(val_labels), y = val_labels)

print(class_wts)

[0.84411277 0.87008547 1.50147493]


In [87]:
weights= torch.tensor(class_wts,dtype=torch.float)
weights = weights.to(device)
cross_entropy  = nn.NLLLoss(weight=weights) 
epochs = 70

In [88]:
def train():
  
    model.train()

    total_loss, total_accuracy = 0, 0
  
  # empty list to save model predictions
    total_preds=[]
  
  # iterate over batches
    for step,batch in enumerate(val_dataloader):
    
    # progress update after every 50 batches.
       if step % 10 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    # push the batch to gpu
       batch = [r.to(device) for r in batch]
 
       sent_id, mask, labels = batch
        
    
    # clear previously calculated gradients 
       model.zero_grad()        

    # get model predictions for the current batch
       
        
       preds = model(sent_id, mask)


    # compute the loss between actual and predicted values
       loss = cross_entropy(preds, labels)

    # add on to the total loss
       total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
       loss.backward()

    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
       torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update parameters
       optimizer.step()

    # model predictions are stored on GPU. So, push it to CPU
       preds=preds.detach().cpu().numpy()

    # append the model predictions
       total_preds.append(preds)

  # compute the training loss of the epoch
    avg_loss = total_loss / len(val_dataloader)
  
  # predictions are in the form of (no. of batches, size of batch, no. of classes).
  # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

  #returns the loss and predictions
    return avg_loss, total_preds

def evaluate():
  
    print("\nEvaluating...")
    
    t0 = time.time()
  
  # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0
  
  # empty list to save the model predictions
    total_preds = []

  # iterate over batches
    for step,batch in enumerate(test_dataloader):
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0, format = 'medium', locale='rus')
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(test_dataloader)))
            print(elapsed)

    # push the batch to gpu
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch
    
    

    # deactivate autograd
    with torch.no_grad():
        preds = model(sent_id, mask)
        loss = cross_entropy(preds,labels)
        total_loss = total_loss + loss.item()
        preds = preds.detach().cpu().numpy()
        total_preds.append(preds)

  # compute the validation loss of the epoch
    avg_loss = total_loss / len(test_dataloader) 
    avg_loss = np.log(avg_loss)

  # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
best_valid_loss = float('inf')

train_losses=[]
valid_losses=[]

for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    train_loss, _ = train()
    valid_loss, _= evaluate()
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        print('YES!')
        torch.save(model.state_dict(), 'saved_weights_sber_16_100_new_cnn_linear.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [203]:
path = 'saved_weights_sber_16_100_new_cnn_linear.pt'
model.load_state_dict(torch.load(path))
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

### Final - test

In [204]:
from sklearn.metrics import classification_report, confusion_matrix
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.63      0.80      0.71        51
           1       0.78      0.71      0.74        49
           2       0.89      0.57      0.70        28

    accuracy                           0.72       128
   macro avg       0.77      0.70      0.72       128
weighted avg       0.74      0.72      0.72       128



In [205]:
print(confusion_matrix(test_y, preds))

[[41  8  2]
 [14 35  0]
 [10  2 16]]


In [41]:
with torch.no_grad():
    preds_val = model(val_seq.to(device), val_mask.to(device))
    preds_val = preds_val.detach().cpu().numpy()

In [43]:
from sklearn.metrics import classification_report, confusion_matrix
preds_val = np.argmax(preds_val, axis = 1)
print(classification_report(val_y, preds_val))

              precision    recall  f1-score   support

           0       0.80      0.69      0.74       118
           1       0.78      0.87      0.82       127
           2       0.83      0.85      0.84        59

    accuracy                           0.80       304
   macro avg       0.80      0.80      0.80       304
weighted avg       0.80      0.80      0.79       304



In [44]:
print(confusion_matrix(val_y, preds_val))

[[ 82  28   8]
 [ 15 110   2]
 [  6   3  50]]


In [2]:
### white board

In [8]:
def conv_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1):
    
    if type(h_w) is not tuple:
        h_w = (h_w, h_w)
    
    if type(kernel_size) is not tuple:
        kernel_size = (kernel_size, kernel_size)
    
    if type(stride) is not tuple:
        stride = (stride, stride)
    
    if type(pad) is not tuple:
        pad = (pad, pad)
    
    h = (h_w[0] + (2 * pad[0]) - (dilation * (kernel_size[0] - 1)) - 1)// stride[0] + 1
    w = (h_w[1] + (2 * pad[1]) - (dilation * (kernel_size[1] - 1)) - 1)// stride[1] + 1
    
    return h, w

In [9]:
conv_output_shape(h_w = (256,1), kernel_size = (8,1024), stride = (4,1), pad = (2,1), dilation = 0)

(65, 3)

In [14]:
conv_p = nn.Conv2d(8, 16, kernel_size = (3,3))
hh = torch.randn(5,8,34,34)
conv_ = conv_p(hh)
conv_.shape

torch.Size([5, 16, 32, 32])

In [23]:
bn2 = nn.BatchNorm2d(num_features=128)
bn2_ = bn2(conv_)
bn2_.shape

torch.Size([16, 128, 56, 56])

In [17]:
max_p = nn.MaxPool2d(kernel_size = 2)
hh = torch.randn(5, 32, 14 , 14)

out_max = max_p(hh)

flat_p = nn.Flatten()

out_max.size()

torch.Size([5, 32, 7, 7])

In [186]:
max_p(hh).shape

torch.Size([256, 1024, 1, 1])

In [102]:
conv_t = nn.ConvTranspose2d(in_channels = 448 , out_channels = 512, kernel_size=4, stride=4, padding=1)

In [103]:
ten = torch.rand(16,448,4,4)

In [104]:
conv_t(ten).shape

torch.Size([16, 512, 14, 14])