![](imgs/ensemble.png)

In [1]:
import os

# Project root. Defaults to the original author's checkout; set the
# OMNI_NLP_API_PATH environment variable to run the notebook elsewhere.
default_path = os.environ.get("OMNI_NLP_API_PATH", "/home/cloves/Desktop/omni_nlp_api")
# Work from the project root. NOTE(review): presumably this is what lets the
# later `elastic_db` / `models` imports resolve — confirm.
os.chdir(default_path)

In [2]:
from elastic_db.elasticsearch import elastic_conection, NLPmodelIndex

# Client object returned by the project's elastic_conection() helper.
es = elastic_conection()
# Index helper bound to a single workspace; later cells read its
# `index_name`, `index_alias` and `workspace_id` attributes.
index = NLPmodelIndex(es=es, workspace_id="dc1e7b3d-9137-4a20-a99c-d0d2029ef170")
# Display the alias exposed by the helper for this workspace.
index.index_alias

'nlp_model-dc1e7b3d-9137-4a20-a99c-d0d2029ef170'

In [3]:
# Fetch the document whose id is the workspace id from this workspace's index.
es.get(index=index.index_name, id=index.workspace_id)

{'_index': 'nlp_model-dc1e7b3d-9137-4a20-a99c-d0d2029ef170-19022021142434',
 '_type': '_doc',
 '_id': 'dc1e7b3d-9137-4a20-a99c-d0d2029ef170',
 '_version': 2,
 '_seq_no': 601,
 '_primary_term': 2,
 'found': True,
 '_source': {'workspace_id': 'dc1e7b3d-9137-4a20-a99c-d0d2029ef170',
  'customer_id': 'yara',
  'recipe': {'model_kind': 'BM25', 'BM25': {'b': 0.7, 'k1': 1.2}},
  'accuracies': {'cnn_accuracy': 0.8092485666275024,
   'bm25_accuracy': 0.7586705202312138,
   'svm_accuracy': 0.7413294797687862,
   'logit_accuracy': 0.5057803468208093}}}

# Ler Modelos

In [4]:
from joblib import load
from tensorflow.keras.models import load_model

path = "/home/cloves/Desktop/omni_nlp_api"

def load_models(workspace_id):
    """Load the trained SVM, logit and CNN models saved for one workspace.

    Every model file is looked up below the module-level ``path`` directory.

    Args:
        workspace_id: Identifier substituted into each model file name.

    Returns:
        Tuple ``(svm, logit, cnn)``: the two objects read with ``load``
        followed by the model read with ``load_model`` from the ``.h5`` file.
    """
    svm_file = path + '/models/polySVM/trained_models/svm-{workspace_id}.joblib'.format(workspace_id=workspace_id)
    svm_model = load(svm_file)

    logit_file = path + '/models/multinomialLogit/trained_models/logit-{workspace_id}.joblib'.format(workspace_id=workspace_id)
    logit_model = load(logit_file)

    cnn_file = path + '/models/cnn/trained_models/cnn-{workspace_id}.h5'.format(workspace_id=workspace_id)
    cnn_model = load_model(cnn_file)

    return svm_model, logit_model, cnn_model

svm, logit, cnn = load_models(index.workspace_id)

# Ler Dicionário de Intenções

In [5]:
import json 

# JSON file holding this workspace's intents dictionary (string index -> intent name,
# as used by the lookups in later cells).
file = "/home/cloves/Desktop/omni_nlp_api/feature_encoding/tfidf/resources/intents_dictionary-{workspace_id}.json".format(workspace_id=index.workspace_id)
# Decode explicitly as UTF-8 so reading the JSON does not depend on the
# machine's locale default.
with open(file, 'r', encoding="utf-8") as f:
    intents_dictionary = json.load(f)

# Texto para TF-IDF

In [6]:
import numpy as np
import pickle

# Request body for the Elasticsearch analyze call made in get_sentece_features:
# "classic" tokenizer followed by lowercase, ASCII folding, Portuguese
# stop-word removal and Brazilian stemming filters.
# "text" is left empty here; get_sentece_features supplies the sentence.
query = {
  "tokenizer" : "classic",
  "filter" : [
              "lowercase",
              "asciifolding",
              {"type": "stop", "stopwords": "_portuguese_"},
              {"type": "stemmer", "language": "brazilian"}],
  "text" : ""
}

# Pickled object used by get_sentece_features through its `.transform` method.
file = "/home/cloves/Desktop/omni_nlp_api/feature_encoding/tfidf/resources/tf_idf_corpus_{workspace_id}.pickle".format(workspace_id=index.workspace_id)
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts produced by this project.
# The context manager closes the file handle as soon as loading finishes.
with open(file, "rb") as corpus_file:
    tf_idf_corpus = pickle.load(corpus_file)

def get_sentece_features(sentence, index, es):
    """Turn a raw sentence into the feature array produced by ``tf_idf_corpus``.

    The sentence is first run through the Elasticsearch analyzer described by
    the module-level ``query`` template, the resulting tokens are joined with
    single spaces, and that string is passed to ``tf_idf_corpus.transform``.

    Args:
        sentence: Text to analyze.
        index: Object exposing ``index_name``, the index the analyze call targets.
        es: Elasticsearch client exposing ``indices.analyze``.

    Returns:
        The dense array returned by ``tf_idf_corpus.transform([...]).toarray()``
        (presumably a single row, since one sentence is transformed — confirm).
    """
    # Build a per-call request body instead of writing the sentence into the
    # shared module-level `query`, so the template is never left holding the
    # text of the previous call.
    analyze_body = dict(query, text=sentence)
    result = es.indices.analyze(index=index.index_name, body=analyze_body)
    new_sentence = " ".join(token["token"] for token in result["tokens"])
    sentence_features = tf_idf_corpus.transform([new_sentence]).toarray()
    return sentence_features

In [8]:
# Expected intent for this sentence: Duvida_Formularo_Adesao_Previdencia
sentence = "dúvidas sobre o preenchimento do formulário"
sentence_features = get_sentece_features(sentence, index, es)
result_vectors = []
# Fetch the per-model accuracies in this cell: no earlier visible cell defines
# `accuracies`, so on a fresh kernel the lines below would raise NameError.
accuracies = es.get(index=index.index_alias, id=index.workspace_id)["_source"]["accuracies"]
# Scale each model's output by that model's stored accuracy, then sum.
logit_result = logit.predict_proba(sentence_features) * accuracies["logit_accuracy"]
cnn_result = cnn.predict(sentence_features) * accuracies["cnn_accuracy"]
svm_result = svm.predict_proba(sentence_features) * accuracies["svm_accuracy"]
ensemble_result = logit_result + cnn_result + svm_result

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [9]:
# Look up the intent name stored under the index of the highest ensemble score.
intents_dictionary[str(np.argmax(ensemble_result))]

'Duvida_Formularo_Adesao_Previdencia'

In [10]:
# Index of the highest ensemble score.
np.argmax(ensemble_result)

87

In [11]:
# Inspect the ensemble score at column 36.
# NOTE(review): the reason for picking column 36 is not recorded — confirm.
ensemble_result[0][36]

0.002628881604858639

# BM25 result as a NumPy vector

In [12]:
from models.BM25.classifier import find_intents_bm25

# Ask the BM25 classifier for the intents matching `sentence` and display the raw response.
result = find_intents_bm25(index, sentence, es)
result

{'intents': [{'intent': 'Duvida_Formularo_Adesao_Previdencia',
   'confidence': 0.9380459315072991},
  {'intent': 'Resgatar_Previdencia', 'confidence': 0.8915409300668932},
  {'intent': 'Treinamento_Juridico', 'confidence': 0.8709480344514546},
  {'intent': 'Fornecedor_ME', 'confidence': 0.8474037735826023},
  {'intent': 'Solicitar_Procuracao', 'confidence': 0.8043615250332146},
  {'intent': 'Cancelar_Previdencia', 'confidence': 0.8016387220257288},
  {'intent': 'Download_Aplicativo_Convenio', 'confidence': 0.8007397338121957},
  {'intent': 'Divisao_Ferias', 'confidence': 0.747114411054499},
  {'intent': 'Agendamento_Alteracao_Ferias', 'confidence': 0.7090204306648935},
  {'intent': 'Duvidas_Despesas', 'confidence': 0.7045770090779644}]}

In [14]:
# Number of intents contained in the BM25 response.
len(result["intents"])

10

In [50]:
# Integer form of every key in the intents dictionary.
[int(key) for key in intents_dictionary]

[134,
 32,
 54,
 69,
 98,
 8,
 186,
 48,
 60,
 53,
 115,
 79,
 127,
 38,
 111,
 211,
 126,
 146,
 151,
 114,
 123,
 31,
 180,
 192,
 94,
 27,
 2,
 198,
 189,
 193,
 214,
 191,
 44,
 161,
 182,
 57,
 120,
 112,
 52,
 174,
 166,
 137,
 85,
 3,
 46,
 55,
 179,
 213,
 129,
 208,
 138,
 59,
 109,
 90,
 41,
 33,
 65,
 143,
 199,
 178,
 154,
 89,
 164,
 125,
 107,
 173,
 22,
 195,
 142,
 81,
 82,
 87,
 63,
 29,
 58,
 26,
 168,
 88,
 201,
 51,
 144,
 62,
 71,
 70,
 145,
 30,
 17,
 40,
 21,
 47,
 167,
 202,
 207,
 42,
 83,
 171,
 99,
 131,
 210,
 132,
 190,
 153,
 76,
 100,
 68,
 113,
 194,
 175,
 86,
 147,
 72,
 177,
 157,
 9,
 152,
 116,
 150,
 169,
 61,
 165,
 23,
 4,
 66,
 128,
 187,
 176,
 39,
 148,
 24,
 130,
 78,
 103,
 13,
 141,
 73,
 139,
 20,
 209,
 188,
 200,
 7,
 205,
 64,
 10,
 67,
 75,
 77,
 19,
 204,
 50,
 25,
 203,
 156,
 43,
 155,
 105,
 170,
 95,
 16,
 117,
 197,
 80,
 102,
 6,
 37,
 49,
 212,
 158,
 163,
 34,
 36,
 96,
 5,
 56,
 28,
 184,
 149,
 162,
 97,
 91,
 106,
 108,
 13

In [54]:
def bm25_result_to_vector(result, intents_dictionary):
    """Scatter BM25 confidences into a dense score row indexed by intent id.

    Args:
        result: Mapping with an ``"intents"`` list; each entry carries an
            ``"intent"`` name and a ``"confidence"`` value.
        intents_dictionary: Mapping from integer-like string keys to intent names.

    Returns:
        NumPy array of shape ``(1, max_key + 1)``, zero everywhere except at
        the positions of the intents listed in ``result``, which hold their
        confidence.

    Raises:
        KeyError: If ``result`` names an intent missing from ``intents_dictionary``.
        ValueError: If ``intents_dictionary`` is empty.
    """
    highest_index = max(int(key) for key in intents_dictionary.keys())
    # Reverse lookup: intent name -> its (string) index.
    index_by_intent = {name: key for key, name in intents_dictionary.items()}
    scores = np.zeros((1, highest_index + 1))
    for hit in result["intents"]:
        column = int(index_by_intent[hit["intent"]])
        scores[0][column] = hit["confidence"]
    return scores

In [55]:
# Convert the BM25 response into a dense score row and display it.
array_result_bm25 = bm25_result_to_vector(result=result, 
                                          intents_dictionary=intents_dictionary)
array_result_bm25

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70902043, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.80163872,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

# Ensemble

In [42]:
# Read the workspace's model document through the index alias and keep the
# per-model accuracies stored under "_source" -> "accuracies".
nlp_model_doc = es.get(index=index.index_alias, id=index.workspace_id)
accuracies = nlp_model_doc["_source"]["accuracies"]
accuracies

{'cnn_accuracy': 0.8092485666275024,
 'bm25_accuracy': 0.7586705202312138,
 'svm_accuracy': 0.7413294797687862,
 'logit_accuracy': 0.5057803468208093}

In [56]:
# Expected intent for this sentence: Duvida_Formularo_Adesao_Previdencia
sentence = "dúvidas sobre o preenchimento do formulário"
sentence_features = get_sentece_features(sentence, index, es)
# NOTE(review): result_vectors is not read in any visible cell.
result_vectors = []
# BM25 vote: scatter the returned confidences into a dense row, then scale by
# the stored BM25 accuracy.
bm25_result = find_intents_bm25(index, sentence, es)
array_result_bm25 = bm25_result_to_vector(result=bm25_result, 
                                          intents_dictionary=intents_dictionary) * accuracies["bm25_accuracy"]
# Each model's output is scaled by that model's stored accuracy.
logit_result = logit.predict_proba(sentence_features) * accuracies["logit_accuracy"]
cnn_result = cnn.predict(sentence_features) * accuracies["cnn_accuracy"]
svm_result = svm.predict_proba(sentence_features) * accuracies["svm_accuracy"]
# Accuracy-weighted sum of the four score arrays.
# NOTE(review): assumes all four arrays share the same shape and column order — confirm.
ensemble_result = logit_result + cnn_result + svm_result + array_result_bm25

In [57]:
# Display the combined (accuracy-weighted) score array.
ensemble_result

array([[5.19975406e-03, 1.13980139e-02, 4.61815585e-03, 6.26953292e-03,
        4.45334436e-03, 5.44834514e-03, 4.50384508e-03, 5.75790365e-03,
        6.78145590e-03, 5.04008608e-03, 4.19472830e-03, 3.31201479e-03,
        5.86015330e-03, 6.40587820e-03, 4.83348393e-03, 5.48388352e-01,
        3.73265818e-03, 3.81583563e-03, 8.13814379e-03, 8.78665282e-03,
        1.01197916e-03, 2.89089866e-03, 8.12643834e-03, 1.21406648e-02,
        5.01330328e-03, 4.90918878e-03, 6.11717559e-03, 5.06161988e-03,
        6.45490890e-03, 4.52549339e-03, 5.03889521e-03, 3.66515725e-03,
        4.54734932e-03, 5.15596836e-03, 6.11717153e-03, 2.15784973e-03,
        2.62888160e-03, 3.29366265e-03, 5.71948798e-03, 6.60611417e-03,
        4.59189187e-03, 2.88197209e-03, 5.46305340e-03, 5.76091897e-03,
        4.45141717e-03, 1.28408220e-02, 5.18480308e-03, 4.82225619e-03,
        6.17147015e-03, 6.19017032e-01, 5.11600911e-03, 3.57121551e-03,
        1.11038946e-02, 8.20382489e-03, 3.42832610e-03, 3.225734

In [58]:
# Intent name stored under the index of the highest combined score.
intents_dictionary[str(np.argmax(ensemble_result))]

'Duvida_Formularo_Adesao_Previdencia'