## Обучение BERT-модели для решения задачи классификации

In [None]:
import pandas as pd
import numpy as np
import spacy
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report

from spacy.training import Example

from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer

# Load the labelled comments and shuffle the rows.
comment_data = pd.read_excel('values_df.xlsx')
comment_data = shuffle(comment_data)

# Data preparation: map the string sentiment labels to integer class ids.
encoder = LabelEncoder()
comment_data['class'] = encoder.fit_transform(comment_data['sentiment'])

X = comment_data['text']
y = comment_data['class']
# Stratified hold-out split: 67% train / 33% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42, stratify = comment_data['class'])
# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

# Compute class frequencies (disabled experiment with a class-weighted loss)
# class_counts = Counter(y_train)
# total_samples = len(y_train)
# class_weights = torch.tensor([total_samples / (class_counts[i] * len(class_counts)) for i in range(len(class_counts))], dtype = torch.float)

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
# Size the classification head from the labels actually present in the data
# instead of hard-coding the number of classes.
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels = len(encoder.classes_))

# `truncation = True` on its own is ignored for this tokenizer (the run log
# reports "no maximum length is provided ... Default to no truncation"), so the
# limit is passed explicitly, taken from the model's position-embedding size.
max_length = model.config.max_position_embeddings

# Tokenize the comments
train_encodings = tokenizer(list(X_train), truncation = True, padding = True, max_length = max_length)
test_encodings = tokenizer(list(X_test), truncation = True, padding = True, max_length = max_length)

# Build PyTorch tensors for the model inputs
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids'], dtype = torch.long),
                              torch.tensor(train_encodings['attention_mask'], dtype = torch.long),
                              torch.tensor(y_train.values, dtype = torch.long),
                            #   class_weights[y_train.values]
                              )

test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids'], dtype = torch.long),
                             torch.tensor(test_encodings['attention_mask'], dtype = torch.long),
                             torch.tensor(y_test.values, dtype = torch.long))

# Data loaders for the training and test sets
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 16, shuffle = False)

# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# loss_fn is not called below: the training loop uses the loss the model
# returns when `labels` are passed (outputs.loss).
loss_fn = torch.nn.CrossEntropyLoss()
# loss_fn = torch.nn.CrossEntropyLoss(weight = class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)

# Model training
for epoch in range(10):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = (item.to(device) for item in batch)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation on the held-out test set
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, _ = (item.to(device) for item in batch)
        outputs = model(input_ids, attention_mask = attention_mask)
        # The predicted class is the index of the largest logit.
        _, predicted_labels = torch.max(outputs.logits, dim = 1)
        predictions.extend(predicted_labels.cpu().numpy())

accuracy = accuracy_score(y_test, predictions)
accuracy

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0.8341708542713567

In [None]:
# Map the integer class ids back to the original sentiment strings.
# This rebinds y_test and predictions to string labels, so re-running the cell
# without re-running the evaluation would feed strings back into
# inverse_transform.
y_test = encoder.inverse_transform(y_test)
predictions = encoder.inverse_transform(predictions)

# One row per test comment: the text, its true label and the predicted label.
results_df = pd.DataFrame(
    {
        'Comment': X_test,
        'True_Label': y_test,
        'Predicted_Label': predictions,
    }
)
print(
    classification_report(
        results_df['True_Label'],
        results_df['Predicted_Label'],
    )
)

              precision    recall  f1-score   support

    negative       0.82      0.94      0.88        82
     neutral       0.94      0.52      0.67        29
    positive       0.83      0.84      0.84        88

    accuracy                           0.83       199
   macro avg       0.86      0.77      0.79       199
weighted avg       0.84      0.83      0.83       199



In [None]:
# Save the model and the tokenizer to the 'saved_model' directory.
model.save_pretrained('saved_model')
# Kept as the cell's last expression: its return value (the written file
# paths) is what the notebook displays.
tokenizer.save_pretrained('saved_model')

('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/vocab.txt',
 'saved_model/added_tokens.json')

## Дообучение BERT-модели

In [None]:
# Fit the label encoder on the base dataset.
# NOTE(review): presumably this is the dataset the loaded model was originally
# trained on, so class ids keep their meaning — confirm.
name = '\\values_df.xlsx'
comment_data = pd.read_excel(path + name)
comment_data = shuffle(comment_data)
encoder = LabelEncoder()
comment_data['class'] = encoder.fit_transform(comment_data['sentiment'])

# Load the additional training data and encode it with the ALREADY fitted
# encoder. Re-fitting here (fit_transform) would throw away the mapping fitted
# above and could assign different ids if a class is absent from this file;
# transform raises on labels the encoder has never seen instead.
name = '\\bert-further-train.xlsx'
comment_data = pd.read_excel(path + name)
comment_data['class'] = encoder.transform(comment_data['sentiment'])

X = comment_data['text']
y = comment_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42, stratify = comment_data['class'])

name = '\\bert_new'
tokenizer = BertTokenizer.from_pretrained(path + name)
model = BertForSequenceClassification.from_pretrained(path + name)

# Pass the limit explicitly so that truncation is actually applied; it is
# taken from the model's position-embedding size.
max_length = model.config.max_position_embeddings

# Tokenize the comments
train_encodings = tokenizer(list(X_train), truncation = True, padding = True, max_length = max_length)
test_encodings = tokenizer(list(X_test), truncation = True, padding = True, max_length = max_length)

train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids'], dtype=torch.long),
                              torch.tensor(train_encodings['attention_mask'], dtype=torch.long),
                              torch.tensor(y_train.values, dtype=torch.long),
                            #   class_weights[y_train.values]
                              )

test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids'], dtype=torch.long),
                             torch.tensor(test_encodings['attention_mask'], dtype=torch.long),
                             torch.tensor(y_test.values, dtype=torch.long))
# Data loaders for the training and test sets
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 16, shuffle = False)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# loss_fn is not called below: the loop uses the loss returned by the model.
loss_fn = torch.nn.CrossEntropyLoss()
# loss_fn = torch.nn.CrossEntropyLoss(weight = class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)

# Model training
for epoch in range(6): # The number of epochs is critical, better not to touch it
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = (item.to(device) for item in batch)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation on the held-out part of the additional data
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, _ = (item.to(device) for item in batch)
        outputs = model(input_ids, attention_mask = attention_mask)
        # The predicted class is the index of the largest logit.
        _, predicted_labels = torch.max(outputs.logits, dim = 1)
        predictions.extend(predicted_labels.cpu().numpy())

accuracy = accuracy_score(y_test, predictions)
accuracy

In [None]:
# Save the further-trained model and its tokenizer to the same directory.
# NOTE(review): `path` is not defined in this part of the notebook, and the
# '\\' separator assumes Windows-style paths — confirm.
name = '\\bert_april'
model.save_pretrained(path + name)
tokenizer.save_pretrained(path + name)

## Проверка точности модели

In [None]:
# Load the tokenizer and the model that were saved under 'bert_april'.
name = '\\bert_april'
tokenizer = BertTokenizer.from_pretrained(path + name)
model = BertForSequenceClassification.from_pretrained(path + name)

# Fit the label encoder on the labelled dataset so that predicted class ids
# can be mapped back to sentiment strings.
name = '\\values_df.xlsx'
comment_data = shuffle(pd.read_excel(path + name))
encoder = LabelEncoder().fit(comment_data['sentiment'])
comment_data['class'] = encoder.transform(comment_data['sentiment'])

def get_prediction(text):
    """Predict the sentiment label of a single text.

    Uses the module-level ``tokenizer``, ``model`` and ``encoder``.

    Args:
        text: The text to classify.

    Returns:
        The result of ``encoder.inverse_transform`` for the one predicted
        class id; callers take element ``[0]`` to get the label itself.
    """
    # Truncate to the model's position-embedding size so that long texts do
    # not overflow the embedding table, and put the tensors on the same
    # device as the model.
    inputs = tokenizer(
        text,
        return_tensors = 'pt',
        truncation = True,
        max_length = model.config.max_position_embeddings,
    ).to(model.device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so the arg-max of the logits is the most
    # probable class.
    predicted_class = torch.argmax(logits, dim = -1).item()
    return encoder.inverse_transform([predicted_class])

def evaluate_model(comment_data):
    """Return the share of rows whose predicted sentiment equals the true one.

    Args:
        comment_data: DataFrame with 'text' and 'sentiment' columns.

    Returns:
        Number of correct predictions divided by the number of rows.
    """
    total_predictions = len(comment_data)
    labelled_texts = zip(comment_data['text'], comment_data['sentiment'])
    # get_prediction returns an array-like with one label, hence the [0].
    correct_predictions = sum(
        1
        for text, true_class in labelled_texts
        if get_prediction(text)[0] == true_class
    )
    return correct_predictions / total_predictions

# Accuracy of the loaded model over every row of comment_data.
# NOTE(review): comment_data is read from values_df.xlsx, which looks like the
# same file the first training cell splits into train/test; if so, this figure
# includes rows the model was trained on — confirm.
accuracy = evaluate_model(comment_data[['text', 'sentiment']])
accuracy # recorded result: 0.9435215946843853

## Дообучение NLP-модели

In [None]:
# Annotated examples used to fine-tune the entity recogniser.
data_to_train = pd.read_excel('train_data.xlsx')

# Load the Russian spaCy pipeline and register the custom 'PROD' entity label
# on its NER component.
nlp = spacy.load('ru_core_news_md')
ner = nlp.get_pipe('ner')
ner.add_label('PROD')

# ФУНКЦИЯ ПРИВЕДЕНИЯ ДАННЫХ К ТИПУ ТЕКСТ - СУЩНОСТЬ
def get_train_data(data_to_train):
    """Convert annotated rows into (text, {'entities': [...]}) training pairs.

    Args:
        data_to_train: DataFrame with a 'text' column and a 'selected_text'
            column holding comma-separated entity fragments.

    Returns:
        A list of ``(text, {'entities': [(start, end, 'PROD'), ...]})``
        tuples, one per usable row. Each span is the first occurrence of the
        fragment in the text.
    """
    train_data = []
    for _, row in data_to_train.iterrows():
        text = row.text
        selected_text = row.selected_text
        # Rows with a missing (non-string) text or annotation cannot be
        # turned into character spans; skip them instead of crashing.
        if not isinstance(text, str) or not isinstance(selected_text, str):
            continue
        entity_list = []
        for elem in selected_text.split(sep = ','):
            elem = elem.strip()
            if not elem:
                # An empty fragment (e.g. a trailing comma) would give a
                # zero-length span.
                continue
            start = text.find(elem)
            if start == -1:
                # The fragment is not present in the text; str.find's -1
                # would otherwise be used as a bogus span start.
                continue
            entity_list.append((start, start + len(elem), 'PROD'))
        train_data.append((text, {'entities': entity_list}))
    return train_data

# ФУНКЦИЯ ОЧИСТКИ ДАННЫХ
def datacleaner(text):
    """Keep only alphabetic tokens and the ',', '.', '+' tokens of a text.

    The text is tokenised with the module-level ``nlp`` pipeline and the kept
    tokens are joined with single spaces.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a string.
    """
    # `type(text) = str` was an assignment inside a condition (a syntax
    # error); a type check is what the branch needs.
    if isinstance(text, str):
        words = []
        doc = nlp(text)
        for token in doc:
            if token.is_alpha or (token.text == ',') or (token.text == '.') or (token.text == '+'):
                words.append(token.text)
        return ' '.join(words)
    return text

# Preliminary data cleaning.
# NOTE(review): only 'text' is cleaned, not 'selected_text'; fragments
# containing characters that datacleaner drops may no longer be found in the
# cleaned text — confirm this is acceptable.
data_to_train['text'] = data_to_train['text'].apply(datacleaner)

# Build the (text, annotations) training pairs.
train_data = get_train_data(data_to_train)

# Training: 10 passes over the reshuffled examples, one update per example.
for i in range(10):
    train_data = shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        # The try/except is per example: one example that raises ValueError
        # is reported and skipped instead of aborting the rest of the epoch.
        try:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.1, losses=losses)
        except ValueError as e:
            print(f'Ошибка при обновлении модели: {e}')
    print('Эпоха', i, ':', losses)

In [None]:
# Save the fine-tuned spaCy pipeline to the 'nlp_trained' directory under `path`.
name = '\\nlp_trained'
nlp.to_disk(path + name)

## Использование двух моделей на боевых данных

In [None]:
# Load the BERT model and its tokenizer.
name = '\\bert_april'
model = BertForSequenceClassification.from_pretrained(path + name)
tokenizer = BertTokenizer.from_pretrained(path + name)

# Load the spaCy pipeline from the directory it was saved to with nlp.to_disk
# ('nlp_trained'). Reusing `name` from above would point spaCy at the BERT
# model directory.
name = '\\nlp_trained'
nlp = spacy.load(path + name)

def datacleaner(text):
    """Keep only alphabetic tokens and the ',', '.', '+' tokens of a text.

    The text is tokenised with the module-level ``nlp`` pipeline; the kept
    tokens are returned joined with single spaces.
    """
    kept_punctuation = (',', '.', '+')
    return ' '.join(
        token.text
        for token in nlp(text)
        if token.is_alpha or token.text in kept_punctuation
    )

# Keyword spellings recognised for each product category.
# Deposit
deposit_list = ['деп', 'депоз', 'депозит', 'депы', 'депозы']
# Credit
credit_list = [
    'кик', 'вкл', 'нкл', 'кредитный', 'кредит', 'кредитоваться', 'овер', 'овера', 'оверу', 'овердрафт',
    'ВКЛ', 'кредитование', 'кд', 'линия', 'кк',
]
# Cash and settlement services (РКО)
bill_list = ['рс', 'счет', 'рко']
# Bank guarantee
bg_list = ['гарантия', 'бг', 'Гарантия']
# Leasing
lease_list = ['лизинг']
# Acquiring
ekviring_list = ['эквайрингу', 'эквайринг']
# Products
product_list = ['продукт']
# Insurance
secure_list = ['страховать', 'страхование', 'страх', 'страховка']
# Payroll project (ЗПП)
salary_list = ['зп', 'зпп']
# Financing (not used by rebuild_col)
finance_list = ['финансирование']
# Factoring
factoring_list = ['факторинг', 'факторингу', 'фактор']
# Foreign trade (ВЭД)
ved_list = ['вэд', 'вэду', 'вед', 'веду']
# Balances
ostatok_list = ['остаток', 'остатки']


def rebuild_col(column_dict):
    """Collapse keyword keys into product-category keys.

    Every key of ``column_dict`` found in one of the keyword lists is replaced
    by that list's category name. For each category only the first matching
    key (in the dict's iteration order) contributes its value; keys matching
    no list are dropped.

    Args:
        column_dict: Mapping from keyword to value.

    Returns:
        A new dict mapping category name to value.
    """
    # Looked up on every call so the current module-level lists are used.
    # finance_list is intentionally not part of the table.
    categories = (
        ('Депозит', deposit_list),
        ('Кредит', credit_list),
        ('Гарантия', bg_list),
        ('Лизинг', lease_list),
        ('Эквайринг', ekviring_list),
        ('Продукты', product_list),
        ('Страхование', secure_list),
        ('ЗПП', salary_list),
        ('Факторинг', factoring_list),
        ('РКО', bill_list),
        ('ВЭД', ved_list),
        ('Остатки', ostatok_list),
    )
    newborn = {}
    for key, value in column_dict.items():
        for category, keywords in categories:
            if category not in newborn and key in keywords:
                newborn[category] = value
    return newborn

# ОСТАВЛЯЕМ ТОЛЬКО СУЩНОСТИ И ИХ ОКРУЖЕНИЕ
def entlist_to_form(comment):
    """Pair each 'PROD' entity of a comment with the fragment that contains it.

    The comment is lower-cased and processed with the module-level ``nlp``
    pipeline. Its tokens are split into fragments at ',', '.' and '+' tokens;
    inside a fragment every token is followed by a single space.

    Args:
        comment: The comment text.

    Returns:
        A list of one-item dicts ``{entity lemma: fragment}``, ordered by
        fragment and then by entity, for every 'PROD' entity whose text
        occurs in the fragment.
    """
    separators = (',', '.', '+')
    doc = nlp(comment.lower())

    fragments = []
    pieces = []
    for token in doc:
        if token.text in separators:
            # A separator closes the current fragment (even an empty one).
            fragments.append(''.join(pieces))
            pieces = []
        else:
            pieces.append(token.text + ' ')
    # Trailing text after the last separator forms the final fragment.
    if pieces:
        fragments.append(''.join(pieces))

    # Entity-to-fragment format: one single-key dict per match.
    return [
        {ent.lemma_: fragment}
        for fragment in fragments
        for ent in doc.ents
        if ent.label_ == 'PROD' and ent.text in fragment
    ]

def get_prediction(text):
    """Predict the sentiment label of a single text.

    Uses the module-level ``tokenizer``, ``model`` and ``encoder``.

    Args:
        text: The text to classify.

    Returns:
        The result of ``encoder.inverse_transform`` for the one predicted
        class id (an array-like holding a single label).
    """
    # Truncate to the model's position-embedding size so that long texts do
    # not overflow the embedding table, and put the tensors on the same
    # device as the model.
    inputs = tokenizer(
        text,
        return_tensors = 'pt',
        truncation = True,
        max_length = model.config.max_position_embeddings,
    ).to(model.device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so the arg-max of the logits is the most
    # probable class.
    predicted_class = torch.argmax(logits, dim = -1).item()
    return encoder.inverse_transform([predicted_class])

def form_sentiment_data(comment_data):
    """Predict a sentiment for the first fragment seen for every entity.

    Args:
        comment_data: A list of dicts mapping entity to text fragment.

    Returns:
        A dict mapping each entity to the value returned by
        ``get_prediction`` for its first fragment; later fragments of an
        already seen entity are not classified.
    """
    sentiment_dict = {}
    for entity_map in comment_data:
        for entity, fragment in entity_map.items():
            if entity in sentiment_dict:
                # Only the first mention of an entity is classified.
                continue
            sentiment_dict[entity] = get_prediction(fragment)
    return sentiment_dict

def form_comment_data(comment_data):
    """Merge a list of entity-to-fragment dicts, keeping first mentions.

    Args:
        comment_data: A list of dicts mapping entity to text fragment.

    Returns:
        A dict mapping each entity to the first fragment it appeared with.
    """
    comment_dict = {}
    for entity_map in comment_data:
        for entity, fragment in entity_map.items():
            # setdefault stores the value only when the key is new, so the
            # first mention wins.
            comment_dict.setdefault(entity, fragment)
    return comment_dict

test_data = result[['ИНН', 'Дата изменения', 'Описание']].copy()

# Pre-processing of the comments from the database: drop rows without a
# description. The copy keeps the column assignments below from writing to a
# slice of the previous frame.
test_data = test_data[~test_data['Описание'].isna()].copy()

# Fit the label encoder used by get_prediction to map class ids to labels.
name = '\\values_df.xlsx'
comment_data = pd.read_excel(path + name)
comment_data = shuffle(comment_data)
encoder = LabelEncoder()
comment_data['class'] = encoder.fit_transform(comment_data['sentiment'])

# Extract the {entity lemma: fragment} dicts for every comment. This column is
# read below but was never created.
# NOTE(review): the NER training data was passed through datacleaner; consider
# cleaning 'Описание' the same way before this step — confirm.
test_data['entlist'] = test_data['Описание'].apply(entlist_to_form)

# Sentiment per entity, then per product category; rows that end up with no
# entity / no known category are dropped.
test_data['sentiment'] = test_data['entlist'].apply(form_sentiment_data)
test_data = test_data[test_data['sentiment'].apply(bool)].copy()
test_data['sentiment'] = test_data['sentiment'].apply(rebuild_col)
test_data = test_data[test_data['sentiment'].apply(bool)].copy()

# Text fragment per product category.
test_data['comment'] = test_data['entlist'].apply(form_comment_data)
test_data['comment'] = test_data['comment'].apply(rebuild_col)

# The identifier column is named 'ИНН' (Cyrillic) in test_data; selecting
# 'INN' raised a KeyError.
test_data[['ИНН', 'Дата изменения', 'Описание', 'comment', 'sentiment']]