In [None]:
!pip install tensorflow_text natasha

Collecting tensorflow_text
  Downloading tensorflow_text-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting natasha
  Downloading natasha-1.6.0-py3-none-any.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy2 (from natasha)
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting razdel>=0.5.0 (from natasha)
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Collecting navec>=0.9.0 (from natasha)
  Downloading navec-0.10.0-py3-none-any.whl (23 kB)
Collecting slovnet>=0.6.0 (from natasha)
  Downloading slovnet-0.6.0-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import re
import time

import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup  # Для удаления HTML тегов

import tensorflow as tf
from tensorflow.keras.saving import load_model
import tensorflow_hub as hub
import tensorflow_text as text

from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    PER,
    NamesExtractor,
    DatesExtractor,
    MoneyExtractor,
    AddrExtractor,
    Doc
)
from google.colab import drive
from matplotlib import pyplot as plt
import joblib

# Dataset import

In [None]:
# Mount Google Drive, then read the input spreadsheet from the notebooks folder.
drive.mount("/content/drive")
DIR = '/content/drive/My Drive/Colab Notebooks/'  # trailing slash: later cells append file names directly
df = pd.read_excel(f"{DIR}CRA_train_1200.xlsx")

Mounted at /content/drive


# Models loading
<b style="color:red">Warning! This can take a lot of time!</b>

In [None]:
# Load the category and rating models; hub.KerasLayer is registered as a custom
# object so Keras can resolve it while deserializing the saved files.
model_cat, model_rat = (
    load_model(DIR + model_file, custom_objects={'KerasLayer': hub.KerasLayer})
    for model_file in ('model_cat_3.keras', 'model_rat_3.keras')
)

In [None]:
# Restore the label encoders used to map predicted class indices back to labels.
# joblib files are pickle-based: only load files that come from a trusted source.
le_cat, le_rat = (
    joblib.load(DIR + encoder_file)
    for encoder_file in ('label_encoder_7.joblib', 'label_encoder_17.joblib')
)

Security note: joblib files are pickle-based, so load only files you trust. See https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# Initialize class

In [None]:
class Nlp:
    """Text cleaning, feature extraction and prediction pipeline built on Natasha.

    The pipeline relies on notebook-level globals defined in earlier cells:
    ``model_cat``, ``model_rat`` (Keras models), ``le_cat``, ``le_rat``
    (label encoders) and ``DIR`` (output folder prefix).
    """

    # Part-of-speech tags whose tokens are ignored when computing word statistics.
    SKIPPED_POS = ("ADP", "PUNCT", "NUM", "CCONJ", "PROPN")
    # Named-entity labels that are counted as features.
    NER_LABELS = ("ORG", "LOC", "PER")

    def __init__(self):
        """Create the Natasha components needed for text processing."""
        self.segmenter = Segmenter()  # Text segmentation.
        self.morph_vocab = MorphVocab()  # Morphological vocabulary (used for normalization).
        self.emb = NewsEmbedding()  # Word embeddings shared by the taggers.
        self.morph_tagger = NewsMorphTagger(self.emb)  # Morphological tagging.
        self.ner_tagger = NewsNERTagger(self.emb)  # Named-entity recognition.
        self.names_extractor = NamesExtractor(self.morph_vocab)  # Name extraction.
        # Normalized entity strings that must not be counted as named entities.
        self.stop_words = ['АО «Эксперт РА', 'АКРА', 'Компания', 'Группа', 'Эксперт РА', 'Рейтинговое агентство', 'АО Эксперт  РА', 'Кредитные', 'Оценка внешнее влияние', 'Группа.']

    def clear_text(self, text):
        """Strip markup, URLs and unwanted characters from ``text``.

        Args:
            text: Raw text. Non-string values are tolerated: missing values
                (None/NaN) become an empty string, anything else is passed
                through ``str()``.

        Returns:
            The cleaned string containing only word characters, digits,
            single spaces, dots, commas and double quotes.
        """
        if not isinstance(text, str):
            # Empty spreadsheet cells arrive as NaN/None; BeautifulSoup cannot parse those.
            text = "" if text is None or pd.isna(text) else str(text)
        # NOTE(review): no parser is named, so bs4 picks the best one installed and may
        # behave differently across environments. Left unchanged so cleaning stays
        # identical to whatever the models were trained on — consider pinning a parser.
        soup = BeautifulSoup(text)
        text = soup.get_text()
        # Drop URLs.
        text = re.sub(r'(http\S+)|(www\S+)|([\w\d]+www\S+)|([\w\d]+http\S+)', '', text)
        # Newlines, tabs and opening guillemets become spaces.
        text = re.sub(r'[\n\t\«]', ' ', text).strip()
        # Keep only word characters, digits, whitespace, dots, commas and double quotes.
        text = re.sub(r'[^\w\d\s\.\,\"]', ' ', text)
        # Collapse runs of whitespace.
        text = re.sub(r'\s+', ' ', text)
        # Remove whitespace that precedes dots/commas (backslashes were already
        # replaced above, so the character class only needs '.' and ',').
        text = re.sub(r"\s+([.,]+)", r"\1", text)
        text = re.sub(r"\s{2,}", " ", text)
        return text

    @staticmethod
    def _word_stats(words):
        """Return ``(count, average_length, unique_percent)`` for a list of words.

        An empty list yields ``(0, 0.0, 0)`` instead of dividing by zero.
        """
        count = len(words)
        if count == 0:
            return 0, 0.0, 0
        average = sum(len(word) for word in words) / count
        uniq = round(100 * len(set(words)) / count)  # % of unique words
        return count, average, uniq

    def get_features(self, text):
        """Compute the numeric features for one cleaned text.

        Args:
            text: Cleaned text (see ``clear_text``).

        Returns:
            ``[count, average, uniq, org, loc, per]`` where the first three are
            word statistics over tokens whose POS is not in ``SKIPPED_POS`` and
            the last three are counts of named entities by label. Blank text
            yields all zeros.
        """
        if not text or not text.strip():
            # Nothing to tag; avoids running the taggers on empty input.
            return [0, 0.0, 0, 0, 0, 0]

        doc = Doc(text)
        doc.segment(self.segmenter)
        doc.tag_morph(self.morph_tagger)
        doc.tag_ner(self.ner_tagger)

        words = [token.text for token in doc.tokens if token.pos not in self.SKIPPED_POS]
        # Previously `average`/`uniq` were undefined when no words were left.
        count, average, uniq = self._word_stats(words)

        for span in doc.spans:
            span.normalize(self.morph_vocab)  # Normalize named entities.
        ners = [(span.normal, span.type) for span in doc.spans if span.normal not in self.stop_words]
        counted_ners = self.ners_counter(ners)
        return [count, average, uniq, counted_ners["ORG"], counted_ners["LOC"], counted_ners["PER"]]

    def ners_counter(self, ners):
        """Count named entities per label.

        Args:
            ners: Iterable of ``(normalized_text, label)`` pairs.

        Returns:
            Dict with keys ``"ORG"``, ``"LOC"``, ``"PER"``. Entities carrying
            any other label are ignored rather than raising ``KeyError``.
        """
        counted_ners = {label: 0 for label in self.NER_LABELS}
        for _, label in ners:
            if label in counted_ners:
                counted_ners[label] += 1
        return counted_ners

    def prediction_pipeline(self, df):
        """Predict category and rating for every row and save them to Excel.

        Args:
            df: DataFrame with a ``pr_txt`` column holding the raw texts.

        Side effects:
            Writes ``DIR + 'submit.xlsx'`` with the original text and the two
            predicted columns. Uses the global models and label encoders.
        """
        copied_df = df["pr_txt"].copy()
        cleared_text_list = list(copied_df.map(self.clear_text))
        df_text = pd.DataFrame(cleared_text_list, columns=['text'])
        # Use this instance, not the notebook-level `nlp` variable.
        x_features = np.array(list(df_text['text'].map(self.get_features)))
        df_features = pd.DataFrame(x_features, columns=['count', 'average', 'uniq', 'org', 'loc', 'per'])

        pred_cat_v = model_cat.predict([df_text, df_features])
        pred_rat_v = model_rat.predict([df_text, df_features])
        pred_cat_y = le_cat.inverse_transform(np.argmax(pred_cat_v, axis=1))
        pred_rat_y = le_rat.inverse_transform(np.argmax(pred_rat_v, axis=1))

        answer = pd.DataFrame({'Категория': pred_cat_y, 'Уровень рейтинга': pred_rat_y})
        # concat aligns on index: reset it so predictions line up positionally
        # even when `df` was filtered or re-indexed before the call.
        out = pd.concat([copied_df.reset_index(drop=True), answer], axis=1)
        out.to_excel(DIR + 'submit.xlsx', index=False)

In [None]:
# Build the helper once; its constructor creates the Natasha segmenter, embeddings and taggers.
nlp = Nlp()

In [None]:
# Run cleaning, feature extraction and both model predictions; results are written to DIR + 'submit.xlsx'.
nlp.prediction_pipeline(df)