In [None]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, default_data_collator, DebertaV2Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
from tqdm.notebook import tqdm
import torch 
from sentence_transformers import SentenceTransformer
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostClassifier
import datetime
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import numpy as np
import requests
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
import wordcloud
import string

import re
from functools import reduce
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from spacy.lang.en import English
from sklearn.model_selection import train_test_split

import spacy


In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Preview the first 10 entries of the cleaned-text column.
# NOTE(review): `train` and its 'text_Clean' column are only created in later cells of this
# notebook, so this cell fails on a fresh "Restart & Run All" unless it is moved below them.
train['text_Clean'].values[:10]

In [None]:
# Word cloud of the raw text for each value of the `type` column (classes 1-5).
# The frame displayed in this notebook has a `type` column and no `label` column,
# so rows are selected on `type`.
from wordcloud import WordCloud
for label, cmap in zip([1, 2, 3, 4, 5],
                       ['winter', 'autumn', 'magma', 'viridis', 'plasma']):
    text = train.loc[train['type'] == label, 'text'].str.cat(sep=' ')
    if not text.strip():
        # WordCloud cannot be generated from text that contains no words;
        # skip classes that have no rows instead of aborting the loop.
        print(f"No text for type {label}, skipping word cloud")
        continue
    plt.figure(figsize=(10, 6))
    wc = WordCloud(width=1000, height=600, background_color="#f8f8f8", colormap=cmap)
    wc.generate_from_text(text)
    plt.imshow(wc)
    plt.axis("off")
    plt.title(f"Words Commonly Used in ${label}$ Messages", size=20)
    plt.show()

In [None]:
# Multi-class CatBoost model evaluated on the held-out split while training.
# NOTE(review): X_train / X_test / y_train / y_test are created in a later cell
# of this notebook, so this cell depends on execution order.
reg = CatBoostClassifier(
    task_type='CPU',
    loss_function='MultiClass',
    eval_metric='TotalF1',
    iterations=1000,
    learning_rate=0.05,
    depth=7,
    l2_leaf_reg=1,
    min_data_in_leaf=32,
    grow_policy='Lossguide',
    verbose=10,
)

reg.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    use_best_model=True,
    plot=False,
)

In [None]:
from sklearn import preprocessing
import seaborn as sns
def plot_importance(feature_importance):
    """Draw a horizontal bar chart of the features with positive importance.

    Parameters
    ----------
    feature_importance : pandas.Series
        Importance values indexed by feature name.

    Only entries greater than zero are plotted, sorted from most to least
    important. Nothing is returned; the figure is shown with ``plt.show()``.
    """
    important_features = feature_importance[feature_importance > 0].sort_values(ascending=False)
    if important_features.empty:
        # Nothing to draw; an empty bar plot would only produce a blank figure.
        print('No features with positive importance to plot')
        return

    # Figure height grows with the number of bars. Integer division alone gives
    # a height of 0 for fewer than 4 features, which matplotlib rejects.
    height = max(2, len(important_features) // 4)
    plt.figure(figsize=(15, height))
    sns.barplot(x=important_features.values, y=important_features.index, orient='h')
    plt.title('Top important features')
    plt.show()

from sklearn.pipeline import Pipeline

# NOTE(review): `tf` is not defined anywhere in the visible notebook; it is
# presumably an already-fitted TfidfVectorizer - confirm before running.
tfidf_pipe = Pipeline([('vecrotizing', tf)])

# Each entry of `steps` is a (name, estimator) tuple: index 1 is the vectorizer
# itself, index 0 is only its name (a plain string without feature names).
vectorizer = tfidf_pipe.steps[0][1]
feature_importance_1 = pd.Series(reg.get_feature_importance(), index=vectorizer.get_feature_names_out())
plot_importance(feature_importance_1)

In [None]:
# Display the training labels cast to int. The result is not assigned,
# so y_train itself keeps its original dtype.
y_train.astype(int)

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Parameters
    ----------
    model_output : sequence
        Model output whose first element holds the token embeddings; the code
        indexes it as (batch, sequence, hidden).
    attention_mask : torch.Tensor
        (batch, sequence) mask; positions with 0 do not contribute to the mean.

    Returns
    -------
    torch.Tensor
        CPU tensor of shape (batch, hidden) with one pooled vector per input row.
    """
    # Embeddings are detached and moved to the CPU before pooling.
    token_embeddings = model_output[0].detach().cpu()
    # Bring the mask to the same device so a mask that still lives on the GPU
    # does not trigger a device-mismatch error in the multiplication below.
    mask = attention_mask.to(token_embeddings.device)
    mask = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp the token count so a fully masked row does not divide by zero.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts


def make_features_transformers(df, model_name, df_model, col, max_len):
  """Embed every text in ``df[col]`` with a pretrained transformer.

  Each text is tokenized (padded / truncated to ``max_len`` tokens), passed
  through the model and mean-pooled into a single vector by ``mean_pooling``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the texts.
  model_name : str
      Name passed to ``AutoTokenizer`` / ``AutoModel.from_pretrained``.
  df_model : str
      Prefix used in the generated column names.
  col : str
      Name of the text column in ``df``.
  max_len : int
      Maximum token length used for padding and truncation.

  Returns
  -------
  pandas.DataFrame
      One row per row of ``df`` (sharing ``df.index`` so it can be joined back
      without misalignment) and one column per embedding dimension, named
      ``{df_model}_{col}_feature_{i}``.
  """
  # Fall back to the CPU when no GPU is available instead of failing in .cuda().
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  # NOTE(review): do_lower_case=True is kept from the original even though the
  # default model name contains "cased" - confirm this is intended.
  tokenizer = AutoTokenizer.from_pretrained(model_name,  do_lower_case=True)
  model = AutoModel.from_pretrained(model_name).to(device)
  model.eval()
  text_features = []
  for sentence in tqdm(df[col]):
    encoded_input = tokenizer([sentence], padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
    with torch.no_grad():
      # The inputs are padded to max_len, so the attention mask must be passed
      # to the model; otherwise the padding tokens are attended to and leak
      # into the embeddings of the real tokens.
      model_output = model(input_ids=encoded_input['input_ids'].to(device),
                           attention_mask=encoded_input['attention_mask'].to(device))
    sentence_embeddings = list(mean_pooling(model_output, encoded_input['attention_mask']).numpy())
    text_features.extend(sentence_embeddings)
  # With an empty input there is no first vector to measure, so take the
  # embedding width from the model configuration instead.
  n_dims = len(text_features[0]) if text_features else model.config.hidden_size
  columns = [f'{df_model}_{col}_feature_{i}' for i in range(n_dims)]
  text_features_df = pd.DataFrame(text_features, columns=columns, index=df.index)
  return text_features_df
     


In [None]:
import string
import nltk  # only nltk.stem names are imported at the top of the notebook

# Load the training data and drop rows with any missing value.
train = pd.read_csv('/content/df_for_train.csv', encoding='utf-8')
train  = train.dropna()

# Russian stopwords from NLTK; fetch the resource on first use if it is missing.
try:
    stopwords = nltk.corpus.stopwords.words('russian')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('russian')

# Extra words to drop from the texts.
# Fixed: 'договором ' had a trailing space and 'договорy' ended with a Latin
# "y", so neither could ever match a token.
# NOTE(review): the multi-word phrases and '...' are kept as written, but they
# cannot match single tokens produced by splitting on non-word characters.
stopwords.extend(['настоящий договор', 'настоящего договора', 'настоящим договором', 'настоящему договору',
                  'настоящего', 'договора', 'настоящим', 'договором', 'договору', 'к', 'на', '...'])

def del_stopwors_text(text, stop_words=None):
    """Normalize a text and return its tokens without stopwords.

    Steps: drop ASCII punctuation characters and lowercase, blank out tokens
    starting with ``http``, blank out digit runs, split on non-word characters
    and remove stopwords and empty tokens.

    Parameters
    ----------
    text : str
        Raw input text.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stopwords`` list.

    Returns
    -------
    list of str
        Remaining lowercase tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership checks for every token.
    blocked = set(stop_words)

    text = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    #remove links starting with http
    text = re.sub(r'http\S+', ' ', text)
    #remove digits
    text = re.sub(r'\d+', ' ', text)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # Splitting leaves empty strings at the edges of the text; drop them too.
    return [token for token in tokens if token and token not in blocked]

# Token list (stopwords removed) for every document in the raw text column.
train['text_Clean'] = train['text'].apply(del_stopwors_text)

In [None]:
# Preview the first rows of the frame after adding the 'text_Clean' column.
train.head()

In [None]:
# Collapse each token list back into one space-separated string.
# Entries that are already strings are left untouched: joining a string would
# put a space between every character, which is what happened when this cell
# was run a second time.
corpus_data = [
    ' '.join(tokens) if isinstance(tokens, (list, tuple)) else tokens
    for tokens in train['text_Clean'].values
]
train['text_Clean'] = corpus_data

In [None]:
# Pretrained models whose mean-pooled embeddings are appended to `train`.
models = ['DeepPavlov/rubert-base-cased-conversational']
for m in models:
  print(m)
  # Column prefix = part of the model name after the organisation; [-1] also
  # works for names without a slash.
  features = make_features_transformers(train, m, m.split('/')[-1], 'text', 256)
  # `train` has gaps in its index after dropna(); join() aligns on the index,
  # so give the features the same index to keep every vector on its own row.
  features.index = train.index
  train = train.join(features)
  # Save after every model so finished embeddings survive a later failure.
  train.to_csv('rbk_transformers_features.csv', index=False)

DeepPavlov/rubert-base-cased-conversational


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/116 [00:00<?, ?it/s]

In [None]:
# Preview the first three rows, now including the transformer embedding columns.
train.head(3)

Unnamed: 0,name,text,keywords,type,text_Clean,rubert-base-cased-conversational_text_feature_0,rubert-base-cased-conversational_text_feature_1,rubert-base-cased-conversational_text_feature_2,rubert-base-cased-conversational_text_feature_3,rubert-base-cased-conversational_text_feature_4,...,rubert-base-cased-conversational_text_feature_758,rubert-base-cased-conversational_text_feature_759,rubert-base-cased-conversational_text_feature_760,rubert-base-cased-conversational_text_feature_761,rubert-base-cased-conversational_text_feature_762,rubert-base-cased-conversational_text_feature_763,rubert-base-cased-conversational_text_feature_764,rubert-base-cased-conversational_text_feature_765,rubert-base-cased-conversational_text_feature_766,rubert-base-cased-conversational_text_feature_767
0,855c8450351c98a25ea2ac22a0256c7f,г договор г москва г нижеподписавшиеся граждан...,аренды квартиры договора аренды получает време...,4,"[г, договор, г, москва, г, нижеподписавшиеся, ...",-0.336127,-0.492062,0.388523,-0.091527,0.059756,...,-0.404316,-0.121761,0.071768,-0.773399,0.604747,-0.056796,0.007781,0.301086,0.245351,-0.358273
1,35b6a0f57d909507c5aa9a8972b15f35,приложение приказу ав договор возмездного оказ...,оказания услуг настоящего договора оказать усл...,2,"[приложение, приказу, ав, договор, возмездного...",-0.445444,-0.710986,0.515844,0.316864,-0.08741,...,-0.02213,0.343179,0.4788,-0.982635,0.702125,-0.110526,0.201609,0.371411,0.434384,-0.393936
2,6babd660f1a9192017a06b7e93bea919,договор г москва общество ограниченной ответст...,настоящего договора настоящему договору сдачи ...,3,"[договор, г, москва, общество, ограниченной, о...",-0.36739,-0.380517,0.190718,-0.070568,-0.064334,...,-0.402182,-0.163525,-0.011978,-0.972786,0.667193,0.084245,0.007796,0.467209,0.317579,-0.419649


In [None]:
# Columns that are not model inputs: the document id, raw / cleaned text,
# keywords and the label itself. Every remaining column is used as a feature.
target_features = ['name', 'text', 'type', 'keywords', 'text_Clean']
X = train.drop(target_features, axis=1)
y = train['type']



In [None]:
# Hold out 10% of the rows as a test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [None]:
good_ft = None

# Features whose importance (as reported by CatBoost) exceeds this value are
# kept for the second, smaller model.
IMPORTANCE_THRESHOLD = 0.7

# First-pass model on all features, used only to rank feature importance.
cls = CatBoostClassifier(iterations = 300,
                      loss_function='MultiClass',
                      l2_leaf_reg=1,
                      learning_rate=0.5,
                      depth=7,
                      task_type='CPU', eval_metric='TotalF1',
                      min_data_in_leaf = 32,
                      grow_policy='Lossguide',
                      verbose=10
                      )

# use_best_model is not requested: it needs an eval_set, and without one
# CatBoost only prints a warning and switches the option off.
cls.fit(X_train, y_train, plot = False)
ftt = cls.get_feature_importance(prettified=True)
good_ft = ftt['Feature Id'][ftt['Importances'] > IMPORTANCE_THRESHOLD]

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.5532401	total: 28.6ms	remaining: 8.54s
10:	learn: 0.9903454	total: 510ms	remaining: 13.4s
20:	learn: 0.9903454	total: 1.02s	remaining: 13.6s
30:	learn: 0.9904136	total: 1.65s	remaining: 14.3s
40:	learn: 0.9903454	total: 2.2s	remaining: 13.9s
50:	learn: 0.9904136	total: 2.73s	remaining: 13.3s
60:	learn: 0.9904136	total: 3.25s	remaining: 12.7s
70:	learn: 0.9903454	total: 3.82s	remaining: 12.3s
80:	learn: 0.9903454	total: 4.42s	remaining: 12s
90:	learn: 0.9903454	total: 4.91s	remaining: 11.3s
100:	learn: 0.9903454	total: 5.42s	remaining: 10.7s
110:	learn: 0.9904136	total: 5.87s	remaining: 9.99s
120:	learn: 0.9904136	total: 6.57s	remaining: 9.72s
130:	learn: 0.9904136	total: 7.02s	remaining: 9.06s
140:	learn: 0.9904136	total: 7.61s	remaining: 8.58s
150:	learn: 0.9904136	total: 8.1s	remaining: 7.99s
160:	learn: 0.9904136	total: 8.72s	remaining: 7.53s
170:	learn: 0.9904136	total: 9.3s	remaining: 7.01s
180:	learn: 0.9904136	total: 9.78s	remaining: 6.43s
190:	learn: 0.9904136	total

In [None]:
# Second model, trained only on the features selected into `good_ft`.
cls2 = CatBoostClassifier(iterations = 200,
                      loss_function='MultiClass',
                      l2_leaf_reg=0.5,
                      learning_rate=0.05,
                      depth=12,
                      task_type='CPU', eval_metric='TotalF1',
                      min_data_in_leaf = 32,
                      grow_policy='Lossguide',
                      verbose=10
                      )

# use_best_model is not requested: it needs an eval_set, and without one
# CatBoost only prints a warning and switches the option off.
cls2.fit(X_train[good_ft], y_train, plot = False)


You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.3819937	total: 1.68ms	remaining: 334ms
10:	learn: 0.8830143	total: 23.6ms	remaining: 406ms
20:	learn: 0.9517973	total: 44.2ms	remaining: 376ms
30:	learn: 0.9516632	total: 64ms	remaining: 349ms
40:	learn: 0.9614063	total: 84.6ms	remaining: 328ms
50:	learn: 0.9614063	total: 106ms	remaining: 309ms
60:	learn: 0.9807367	total: 126ms	remaining: 286ms
70:	learn: 0.9807367	total: 146ms	remaining: 264ms
80:	learn: 0.9807367	total: 185ms	remaining: 271ms
90:	learn: 0.9903454	total: 237ms	remaining: 284ms
100:	learn: 0.9903454	total: 350ms	remaining: 343ms
110:	learn: 0.9903454	total: 404ms	remaining: 324ms
120:	learn: 0.9903454	total: 498ms	remaining: 325ms
130:	learn: 0.9903454	total: 577ms	remaining: 304ms
140:	learn: 0.9903454	total: 617ms	remaining: 258ms
150:	learn: 0.9903454	total: 649ms	remaining: 210ms
160:	learn: 0.9903454	total: 686ms	remaining: 166ms
170:	learn: 0.9903454	total: 722ms	remaining: 122ms
180:	learn: 0.9903454	total: 803ms	remaining: 84.3ms
190:	learn: 0.99034

<catboost.core.CatBoostClassifier at 0x7ff549b781f0>

In [None]:
from sklearn.metrics import classification_report
# Predict with the same feature subset the model was fitted on, and flatten
# the result to a 1-D label vector.
pred = cls2.predict(X_test[good_ft]).ravel()
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall trade places and support counts predictions.
res_1 = classification_report(y_test, pred)
print(res_1)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         2
           4       0.50      0.50      0.50         2
           5       0.67      0.67      0.67         3

    accuracy                           0.83        12
   macro avg       0.83      0.83      0.83        12
weighted avg       0.83      0.83      0.83        12



In [None]:
# Persist the second classifier in CatBoost's binary "cbm" format.
cls2.save_model(
    'model_test.cbm',
    format="cbm",
    export_parameters=None,
    pool=None,
)

In [None]:
import pickle