In [7]:
#!L
import argparse
import os
import sys

PROJ_PATH = "../.."
SRC_PATH = os.path.join(PROJ_PATH, "intent_complex_model")
sys.path.append(SRC_PATH)

import re
from collections import OrderedDict
from itertools import chain
import logging
import math
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')
import random

import compress_fasttext
import spacy
import h5py
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
import sklearn.metrics as skm
from sklearn.preprocessing import LabelBinarizer

import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from src.data.text_augmentation import augment_df

In [2]:
#!L
pd.__version__

'1.2.4'

In [3]:
#!L
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [4]:
DATA_PATH = os.path.join(SRC_PATH, "Data")
RAW_PATH = os.path.join(DATA_PATH, "raw")
PROCESSED_PATH = os.path.join(DATA_PATH, "processed")
INTERIM_PATH = os.path.join(DATA_PATH, "interim")
MODELS_PATH = os.path.join(SRC_PATH, "models")
# Pretrained FastText embeddings can be downloaded from
# http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin
FASTTEXT_PATH = os.path.join(MODELS_PATH, 'compressed_ft_native_300_ru_twitter_nltk_word_tokenize.bin')

In [69]:
#!L
# train = pd.read_csv(os.path.join(PROCESSED_PATH, 'intents_train_context.csv'))
train = pd.read_excel(os.path.join(RAW_PATH, 'gipoteza_1407.xlsx'))[['phrases', 'intent']]
train.columns = ['text', 'intent']
train = train[~train.intent.apply(lambda x: 'default_agreement_faq' in x)]
train = train[train.intent.apply(lambda x: 'cntxt' not in x)]


valid = pd.read_csv(os.path.join(PROCESSED_PATH, 'intents_valid.csv'))
test = pd.read_csv(os.path.join(PROCESSED_PATH, 'intents_test.csv'))
test = test[test.intent.isin(train.intent.unique())]
train = train[~train.text.isin(valid.text)]
train = train[~train.text.isin(test.text)]
# train.drop_duplicates(inplace=True)
train = train.reset_index(drop=True)
train.head(2)

Unnamed: 0,text,intent
0,не помогает,default_problem_tech
1,не приходит,default_problem_tech


In [70]:
changes = {
    'employee_transfer': 'support_chat_faq',
    'default_problem_tech': 'default_error_tech'
}
train['intent'] = train.intent.replace(changes)
test['intent'] = test.intent.replace(changes)
valid['intent'] = valid.intent.replace(changes)

In [71]:
train.intent.value_counts()[-20:]

credit_card_grace_cashtrans_faq     294
service_close_faq                   275
account_cashorder_faq               271
account_avbalance_faq               268
account_notenter_faq                249
default_moneycheeting_tech          241
card_hasabonuses_info               227
account_brokerage_transfer_faq      219
credit_card_installment_faq         211
card_debit_prioritypass_tech        205
default_greeting_faq                193
card_cashback_metro_faq             182
credit_loanrefusal_faq              146
insurance_contractnumber_tech       105
card_cobrand_miles_term_faq          80
default_unsuccessfultransfer_faq     71
default_error_tech                   61
default_registration_office_faq      33
default_problem_transfer_tech        31
default_questionbot_faq               8
Name: intent, dtype: int64

In [72]:
#!L
very_small_intents = list(train.intent.value_counts()[train.intent.value_counts()<40].index)
med_small_intents = list(train.intent.value_counts()[(train.intent.value_counts()>=40)&(train.intent.value_counts()<150)].index)
small_intents = list(train.intent.value_counts()[(train.intent.value_counts()>=150)&(train.intent.value_counts()<300)].index)
big_intents = list(train.intent.value_counts()[(train.intent.value_counts()>=300)].index)
print(len(very_small_intents), len(med_small_intents), len(small_intents), len(big_intents))

print(time.ctime())
vs = augment_df(train[train.intent.isin(very_small_intents)], frac=15, bert_path=None)
print(time.ctime())
ms = augment_df(train[train.intent.isin(med_small_intents)], frac=2.5, bert_path=None)
print(time.ctime())
ms = augment_df(ms, frac=2, bert_path=None)
print(time.ctime())
s = augment_df(train[train.intent.isin(small_intents)], frac=3, bert_path=None)
print(time.ctime())
b = train[train.intent.isin(big_intents)].copy()

train = pd.concat([vs, ms, s, b])
train.drop_duplicates(inplace=True)

3 5 13 80
Fri Nov 12 09:16:58 2021
Fri Nov 12 09:16:58 2021
Fri Nov 12 09:16:58 2021
Fri Nov 12 09:16:58 2021
Fri Nov 12 09:16:58 2021


In [73]:
train.shape, train.intent.nunique(), test.intent.nunique(), valid.intent.nunique()

((86528, 2), 101, 92, 92)

In [74]:
ft = compress_fasttext.models.CompressedFastTextKeyedVectors.load(FASTTEXT_PATH)
EMBED_DIM = ft.vector_size

In [75]:
MAX_LEN = max(train.text.apply(lambda x: len(word_tokenize(x))))
print(MAX_LEN)

def get_ft_embedding(txt, ft=ft, max_len=MAX_LEN):
    tokens = word_tokenize(txt)
    
    if len(tokens) > MAX_LEN:
        tokens = tokens[:MAX_LEN]
    
    else:
        tokens = ['<PAD>'] * (max_len - len(tokens)) + tokens
    
    return np.stack(map(ft.get_vector, tokens))

57


In [76]:
#!L
def evaluate(data, model, criterion, device, batch_size, best_f1=False):
    """
    Evaluation, return accuracy and loss
    """
    total_loss = 0.
    y_true = []
    y_pred = []

    model.eval()  # Set mode to evaluation to disable dropout & freeze BN
    data_loader = DataLoader(data, batch_size=batch_size)
    sigmoid = nn.Sigmoid()
    with torch.no_grad():
        for x_batch, y_batch in data_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            output = model(x_batch)
            total_loss += criterion(output, y_batch)
            y_pred.extend(sigmoid(output).cpu().numpy())  # don't forget to execute sigmoid function on logits
            y_true.extend(y_batch.cpu().numpy())
    y_true = np.asarray(y_true, dtype=np.uint8)
    y_pred = np.asarray(y_pred)
    # finding the best threshold with highest f1 score
    if best_f1:
        thresholds = np.linspace(0.2, 1, 80)
        f1s = [skm.f1_score(np.argmax(y_true, axis=1), np.array(y_pred > thr, dtype=np.uint8), average='macro') for thr in thresholds]
        best_index = np.argmax(f1s)
        return {
            'f1': f1s[best_index],
            'threshold': thresholds[best_index],
            'loss': total_loss / len(data)
        }
    else:
        accuracy = skm.f1_score(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), average='micro')
        f1 = skm.f1_score(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), average='macro', zero_division=1)
        f1_val = skm.f1_score(
            np.argmax(y_true, axis=1),
            np.argmax(y_pred, axis=1),
            average='macro',
            zero_division=1,
            labels=lb.transform(valid.intent.unique()).argmax(axis=1),
        )
        return {
            'accuracy': accuracy,
            'f1': f1,
            'f1_val': f1_val,
            'loss': total_loss / len(data)
        }

In [77]:
#!L
def predict_proba(data, model, device, batch_size):
    """
    Prediction, return numpy matrix of predictions (batch_size * n_classes)
    """
    y_pred = []

    model.eval()  # Set mode to evaluation to disable dropout & freeze BN
    data_loader = DataLoader(data, batch_size=batch_size)
    softmax = nn.Softmax()
    with torch.no_grad():
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(device)
            output = model(x_batch)
            y_pred.extend(softmax(output).cpu().numpy())  # don't forget to execute sigmoid function on logits
    y_pred = np.asarray(y_pred)
    return y_pred

In [78]:
#!L
class ModuleParallel(nn.Module):
    """
    Execute multiple modules on the same input and concatenate the results
    """
    def __init__(self, modules: list, axis=1):
        super().__init__()
        self.modules_ = nn.ModuleList(modules)
        self.axis = axis

    def forward(self, input):
        return torch.cat([m(input) for m in self.modules_], self.axis)


class GlobalMaxPooling(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.max(2)[0]

In [79]:
#!L
class EarlyStopping:
    """
    Identify whether metric has not been improved for certain number of epochs
    """

    def __init__(self,
                 mode: str = 'min',
                 min_delta: float = 0,
                 patience: int = 20):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience

        self.is_better = None
        if patience == 0:
            self.is_better = lambda *_: True
        else:
            self._init_is_better(mode, min_delta)

        self.best = None
        self.num_bad_epochs = 0

    def step(self, current) -> bool:
        """
        Make decision whether to stop training

        :param current: new metric value
        :return: whether to stop
        """
        if isinstance(current, torch.Tensor):
            current = current.cpu()
        if np.isnan(current):
            return True

        if self.best is None:
            self.best = current
        else:
            if self.is_better(current, self.best):
                self.num_bad_epochs = 0
                self.best = current
            else:
                self.num_bad_epochs += 1

        if self.num_bad_epochs >= self.patience:
            return True
        else:
            return False

    def _init_is_better(self, mode, min_delta):
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')
        if mode == 'min':
            self.is_better = lambda value, best: value < best - min_delta
        if mode == 'max':
            self.is_better = lambda value, best: value > best + min_delta

In [80]:
#!L
class CNNTextClassifier(nn.Module):
    """
    CNN-based text classifier

    It can be used for both multi-class and multi-label classification problem,
     because loss is not specified
    """

    def __init__(self,
                 num_classes,
                 embed_dim=EMBED_DIM,
                 filters=(600,),
                 kernel_sizes=(4,),
                 pooling_dropout=0.8,
                 dense_sizes=(1000,),
                 dense_dropout=0.8,
                 **kwargs):
        """
        :param num_classes: number of outputs (classes)
        :param word_to_id: dictionary used to compose lookup table
        :param use_pretrained_word_vectors: whether to use pre-trained word vectors
        :param word_vectors_path: path to word vectors file (should be in compatible format)
        :param trainable_word_vectors: whether to train (change) vectors
        :param embed_dim: embedding dimensionality in case of `use_pretrained_word_vectors=False`
        :param filters: number of filters (output channels) for each kernel size of the 1st CNN layer
        :param kernel_sizes: kernel sizes of the 1st CNN layer
        :param pooling_dropout: dropout coefficient after pooling layer
        :param dense_sizes: sizes of fully-connected layers
        :param dense_dropout: dropout coefficient after each fully-connected layer
        :param kwargs: ignored arguments
        """
        super().__init__()
    
        self.convs0 = ModuleParallel([
            nn.Sequential(OrderedDict([
                ('conv0_{}'.format(k), nn.Conv1d(embed_dim, f, k)),
                ('conv0_{}_bn'.format(k), nn.BatchNorm1d(f)),
                ('conv0_{}_relu'.format(k), nn.ReLU()),
                ('conv0_{}_pool'.format(k), GlobalMaxPooling()),
                ('conv0_{}_dp'.format(k), nn.Dropout(pooling_dropout)),
            ]))
            for k, f in zip(kernel_sizes, filters)
        ])

        dense_sizes_in = [sum(filters)] + list(dense_sizes)[:-1]
        self.fcs = nn.Sequential(OrderedDict(chain(*[
            [
                ('fc{}'.format(i), nn.Linear(dense_sizes_in[i], dense_sizes[i])),
                ('fc{}_bn'.format(i), nn.BatchNorm1d(dense_sizes[i])),
                ('fc{}_relu'.format(i), nn.ReLU(inplace=True)),
                ('fc{}_dp'.format(i), nn.Dropout(dense_dropout))
            ] for i in range(len(dense_sizes))
        ])))
        self.fc_last = nn.Linear(dense_sizes[-1], num_classes)

    def forward(self, x):
        # Conv1d takes in (batch, channels, seq_len), but raw embedded is (batch, seq_len, channels)
        x = self.convs0(x.permute(0, 2, 1))
        x = self.fcs(x)
        x = self.fc_last(x)
        return x

In [81]:
#!L
all_labels = sorted(train.intent.unique())

lb = LabelBinarizer()
lb.classes_ = all_labels
y_train = lb.transform(train["intent"].values)
y_val = lb.transform(valid["intent"].values)
y_test = lb.transform(test["intent"].values)

print("Number of labels: {}".format(len(all_labels)))
print("Train dataset size: {}".format(y_train.shape[0]))
print("Test dataset size: {}".format(y_test.shape[0]))
print("{}/{} - train/validation split".format(y_train.shape[0], y_val.shape[0]))

Number of labels: 101
Train dataset size: 86528
Test dataset size: 6388
86528/6501 - train/validation split


In [82]:
%%time
X_train_encoded = np.stack(train["text"].apply(get_ft_embedding))
X_val_encoded = np.stack(valid["text"].apply(get_ft_embedding))
X_test_encoded = np.stack(test["text"].apply(get_ft_embedding))

CPU times: user 3min 2s, sys: 1.8 s, total: 3min 4s
Wall time: 3min 4s


In [83]:
X_train_encoded.shape

(86528, 57, 100)

In [84]:
train_data = TensorDataset(torch.FloatTensor(X_train_encoded), torch.FloatTensor(y_train))
val_data = TensorDataset(torch.FloatTensor(X_val_encoded), torch.FloatTensor(y_val))
test_data = TensorDataset(torch.FloatTensor(X_test_encoded), torch.FloatTensor(y_test))

In [97]:
device = torch.device('cuda')
batch_size = 128

In [209]:
#!L
model = CNNTextClassifier(
        num_classes=y_train.shape[1],
        kernel_sizes=[1, 2, 3, 4],
        filters=[800, 800, 800, 800],
        dense_sizes=[2000, 800],
        pooling_dropout=0.45,
        dense_dropout=0.45,
        trainable_word_vectors=False
    ).to(device)

In [210]:
#!L
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 8,898,901 trainable parameters


In [211]:
#!L
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                        lr=0.00035)
criterion = nn.BCEWithLogitsLoss(reduction='sum')   # sigmoid
# criterion = nn.CrossEntropyLoss(reduction='sum')  # softmax

# scheduler = optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer, 
#     patience=4,
#     verbose=True, 
#     factor=0.15
# )
scheduler  = optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.00035, max_lr=0.015, mode='triangular2', cycle_momentum=False)
early_stopping = EarlyStopping(mode='max', patience=10)

In [212]:
#!L
best_valid_f1 = 0

In [213]:
#!L
for epoch in range(70):
    model.train()
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)

    for idx, (x_batch, y_batch) in enumerate(train_loader):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        loss = criterion(model(x_batch), y_batch)
        loss.backward()

        # clipping gradients
        torch.nn.utils.clip_grad_norm_([p for p in model.parameters() if p.requires_grad], 1)

        optimizer.step()

    train_metrics = evaluate(train_data, model, criterion, device, batch_size)
    val_metrics = evaluate(val_data, model, criterion, device, batch_size)
    
    if val_metrics['f1_val'] > best_valid_f1:
        best_valid_f1 = val_metrics['f1_val']
        torch.save(model.state_dict(), 'tmp_cnn_st_dict.pt')
    
    scheduler.step(val_metrics['f1_val'])
    
    print('Epoch {:3}, {}, {}, {}'
                .format(epoch + 1, time.ctime(), ' '.join(['train_{}: {:<6.4f}'.format(k, v) for k, v in train_metrics.items()]),
                        ' '.join(['val_{}: {:<6.4f}'.format(k, v) for k, v in val_metrics.items()])))

    if early_stopping.step(val_metrics['f1_val']):
        break

model.load_state_dict(torch.load('tmp_cnn_st_dict.pt'))
        
model.eval()
test_metrics = evaluate(val_data, model, criterion, device, batch_size)
print('Finally: {}'.format(' '.join(['test_{}: {:<6.4f}'.format(k, v) for k, v in test_metrics.items()])))

Epoch   1, Fri Nov 12 15:19:22 2021, train_accuracy: 0.8576 train_f1: 0.8190 train_f1_val: 0.8139 train_loss: 1.0344, val_accuracy: 0.8519 val_f1: 0.7239 val_f1_val: 0.7947 val_loss: 1.0397
Epoch   2, Fri Nov 12 15:19:35 2021, train_accuracy: 0.9057 train_f1: 0.8895 train_f1_val: 0.8861 train_loss: 0.7182, val_accuracy: 0.8863 val_f1: 0.7848 val_f1_val: 0.8445 val_loss: 0.8148
Epoch   3, Fri Nov 12 15:19:48 2021, train_accuracy: 0.9295 train_f1: 0.9196 train_f1_val: 0.9168 train_loss: 0.5401, val_accuracy: 0.8977 val_f1: 0.8017 val_f1_val: 0.8627 val_loss: 0.7135
Epoch   4, Fri Nov 12 15:20:01 2021, train_accuracy: 0.9426 train_f1: 0.9360 train_f1_val: 0.9341 train_loss: 0.4490, val_accuracy: 0.9052 val_f1: 0.8157 val_f1_val: 0.8777 val_loss: 0.6426
Epoch   5, Fri Nov 12 15:20:13 2021, train_accuracy: 0.9528 train_f1: 0.9482 train_f1_val: 0.9469 train_loss: 0.3994, val_accuracy: 0.9088 val_f1: 0.8281 val_f1_val: 0.8911 val_loss: 0.6353
Epoch   6, Fri Nov 12 15:20:25 2021, train_accurac

In [None]:
val_accuracy: 0.9265 val_f1_val: 0.9001
val_accuracy: 0.9209 val_f1: 0.8237
al_accuracy: 0.9180 val_f1: 0.8359

In [81]:
#!L
model.eval()
val_metrics = evaluate(val_data, model, criterion, device, batch_size)
print('{}'.format('\n'.join(['Val {}: {:<6.4f}'.format(k, v) for k, v in val_metrics.items()])))

Val accuracy: 0.8394
Val f1: 0.6809
Val f1_val: 0.7327
Val loss: 1.1584


In [26]:
with pd.ExcelWriter('100821_CNN_scoring.xlsx') as writer:
    pd.DataFrame(predict_proba(train_data, model, device, batch_size)).to_excel(writer, index=None, sheet_name='train_scores')
    train.to_excel(writer, sheet_name='train_data', index=None)
    
    pd.DataFrame(predict_proba(val_data, model, device, batch_size)).to_excel(writer, index=None, sheet_name='val_scores')
    valid.to_excel(writer, sheet_name='val_data', index=None)
    
    pd.DataFrame(predict_proba(test_data, model, device, batch_size)).to_excel(writer, index=None, sheet_name='test_scores')
    test.to_excel(writer, sheet_name='test_data', index=None)
    
    pd.Series(lb.classes_).to_excel(writer, sheet_name='classes_', index=None)

In [77]:
#!L
# saving model
torch.save(model.state_dict(), os.path.join(MODELS_PATH, '4_intents_cnn_{}.pt'.format(len(lb.classes_))))
with open(os.path.join(MODELS_PATH, 'intents_{}.txt'.format(len(lb.classes_))), 'w') as f:
    f.write(';'.join(lb.classes_))

In [65]:
#!L
# loading model
model = CNNTextClassifier(
        y_train.shape[1],
        kernel_sizes=[1, 3],
        filters=[600, 600],
        dense_sizes=[2000, 800],
        pooling_dropout=0.5,
        dense_dropout=0.5,
        trainable_word_vectors=False
    ).to(device)
model.load_state_dict(torch.load(os.path.join(MODELS_PATH, '2_intents_cnn_{}.pt'.format(len(lb.classes_))), map_location='cpu'))

<All keys matched successfully>

In [53]:
#!L
pred_probas_val = predict_proba(val_data, model, device, batch_size)
preds_val = np.argmax(pred_probas_val, axis=1)
print(
    "Val top-3 accuracy: {}".format(skm.top_k_accuracy_score(np.argmax(y_val, axis=1), pred_probas_val, k=3, labels=range(len(lb.classes_)))),
    "Val top-5 accuracy: {}".format(skm.top_k_accuracy_score(np.argmax(y_val, axis=1), pred_probas_val, k=5, labels=range(len(lb.classes_)))),
    'Val F1-micro/Accuracy: {}'.format(skm.f1_score(np.argmax(y_val, axis=1), preds_val, average='micro')),
    'Val F1-macro: {}'.format(skm.f1_score(np.argmax(y_val, axis=1), preds_val, average='macro')),
    'Val F1-weighted: {}'.format(skm.f1_score(np.argmax(y_val, axis=1), preds_val, average='weighted')),
    'Val precision-macro: {}'.format(skm.precision_score(np.argmax(y_val, axis=1), preds_val, average='macro')),
    'Val recall-macro: {}'.format(skm.recall_score(np.argmax(y_val, axis=1), preds_val, average='macro')),
    'Val precision-weighted: {}'.format(skm.precision_score(np.argmax(y_val, axis=1), preds_val, average='weighted')),
    'Val recall-weighted: {}'.format(skm.recall_score(np.argmax(y_val, axis=1), preds_val, average='weighted')),
    sep='\n'
)
pred_probas = predict_proba(test_data, model, device, batch_size)
preds = np.argmax(pred_probas, axis=1)
print(
    "Test top-3 accuracy: {}".format(skm.top_k_accuracy_score(np.argmax(y_test, axis=1), pred_probas, k=3, labels=range(len(lb.classes_)))),
    "Test top-5 accuracy: {}".format(skm.top_k_accuracy_score(np.argmax(y_test, axis=1), pred_probas, k=5, labels=range(len(lb.classes_)))),
    'Test F1-micro/Accuracy: {}'.format(skm.f1_score(np.argmax(y_test, axis=1), preds, average='micro')),
    'Test F1-macro: {}'.format(skm.f1_score(np.argmax(y_test, axis=1), preds, average='macro')),
    'Test F1-weighted: {}'.format(skm.f1_score(np.argmax(y_test, axis=1), preds, average='weighted')),
    'Test precision-macro: {}'.format(skm.precision_score(np.argmax(y_test, axis=1), preds, average='macro')),
    'Test recall-macro: {}'.format(skm.recall_score(np.argmax(y_test, axis=1), preds, average='macro')),
    'Test precision-weighted: {}'.format(skm.precision_score(np.argmax(y_test, axis=1), preds, average='weighted')),
    'Test recall-weighted: {}'.format(skm.recall_score(np.argmax(y_test, axis=1), preds, average='weighted')),
    sep='\n'
)

Val top-3 accuracy: 0.9816844082654979
Val top-5 accuracy: 0.9904508453350032
Val F1-micro/Accuracy: 0.9106136505948653
Val F1-macro: 0.8195158723932594
Val F1-weighted: 0.91082959937294
Val precision-macro: 0.8163827635551293
Val recall-macro: 0.8289665895200922
Val precision-weighted: 0.9160887854826631
Val recall-weighted: 0.9106136505948653


ValueError: Number of classes in 'y_true' (94) not equal to the number of classes in 'y_score' (103).

In [54]:
#!L
#convert labels to intents
intents_true = list(map(lambda x: lb.classes_[x], np.argmax(y_val, axis=1)))
intents_pred = list(map(lambda x: lb.classes_[x], preds_val))

In [58]:
print(skm.classification_report(intents_true, intents_pred, #labels=list(set(intents_true))))
                                labels=['default_thanks_faq',
                                        'support_chat_faq',
                                        'card_operations_faq',
                                        'account_fees_faq',
                                        'credit_card_decrlimit_faq',
                                        'account_transtime_faq',
                                        'default_qestion_faq',
                                        'default_fraudcalls_faq',
                                        'support_number_faq',
                                        'credit_card_grace_info']
                               ))

                           precision    recall  f1-score   support

       default_thanks_faq       0.95      1.00      0.98        79
         support_chat_faq       0.60      0.73      0.66        45
      card_operations_faq       0.96      0.94      0.95       176
         account_fees_faq       0.98      0.78      0.87       317
credit_card_decrlimit_faq       0.98      0.99      0.99       193
    account_transtime_faq       0.83      0.96      0.89       125
      default_qestion_faq       0.83      0.65      0.73        46
   default_fraudcalls_faq       0.89      0.99      0.94        69
       support_number_faq       0.93      0.96      0.95       117
   credit_card_grace_info       0.96      0.92      0.94       238

                micro avg       0.93      0.90      0.91      1405
                macro avg       0.89      0.89      0.89      1405
             weighted avg       0.93      0.90      0.91      1405



In [63]:
train[train.intent=='default_thanks_faq'].sample(50)

Unnamed: 0,text,intent
53121,ясно всего хороего,default_thanks_faq
79814,поняла вас. спасибо за информацию. всего доброго!,default_thanks_faq
53186,а ок\nспасибо \nпоняла,default_thanks_faq
53258,спасибо вам за помощь,default_thanks_faq
79905,"все, поняла. спасибо. если до xx дней, то уже ...",default_thanks_faq
52722,аа понял,default_thanks_faq
79511,не работает рекомендация,default_thanks_faq
52807,спасибо 🙏🏻,default_thanks_faq
79682,"отлично, огромное спасибо вам!)))",default_thanks_faq
79650,спасибо за оперативность. и вам,default_thanks_faq


In [68]:
#!L
clf_report = pd.DataFrame(skm.classification_report(intents_true, intents_pred, output_dict=True)).T\
.drop(['accuracy', 'macro avg', 'weighted avg']).sort_values('support', ascending=False)
print(len(clf_report[clf_report['f1-score']<0.8]))
print(len(clf_report[clf_report['f1-score']<0.9]))

26
80


In [29]:
#!L
clf_report[clf_report['f1-score']<0.9]

Unnamed: 0,precision,recall,f1-score,support
default_problem_faq,0.912088,0.855670,0.882979,194.0
credit_total_debt_faq,0.919540,0.800000,0.855615,100.0
default_limit_info,0.951613,0.808219,0.874074,73.0
credit_channels_faq,0.920635,0.794521,0.852941,73.0
deposit_all_faq,0.804878,0.916667,0.857143,72.0
...,...,...,...,...
demp_credit_card_decrlimit_faq,0.333333,0.500000,0.400000,2.0
default_codeword_faq,0.000000,0.000000,0.000000,1.0
card_whatquestioncashback_faq,0.000000,0.000000,0.000000,1.0
temp_default_passport_update_faq,0.000000,0.000000,0.000000,1.0


In [None]:
#!L
# clf_report['N_samples'] = clf_report.support.values * 4
# clf_report.rename({'support': 'N_test_samples'}, axis=1, inplace=True)
# with pd.ExcelWriter('100221_CNN_clf_report.xlsx') as writer:
#     clf_report.sort_values('N_samples').to_excel(writer, sheet_name='0')