In [37]:
from elastic_db.elasticsearch import elastic_conection, NLPmodelIndex

# Workspace whose NLP model index is inspected in this notebook.
WORKSPACE_ID = "dc1e7b3d-9137-4a20-a99c-d0d2029ef170"

# Client object returned by the project helper, then the index wrapper built on it.
es = elastic_conection()
index = NLPmodelIndex(workspace_id=WORKSPACE_ID, es=es)
index.index_name

'nlp_model-dc1e7b3d-9137-4a20-a99c-d0d2029ef170-03012021112419'

# Read data

In [74]:
import pandas as pd


# Both splits are pipe-separated CSV files; load them with the same settings.
df_train, df_test = (
    pd.read_csv(csv_path, sep="|")
    for csv_path in ("data/train_data.csv", "data/test_data.csv")
)

In [75]:
# Preview the first three rows of the training frame.
df_train.head(3)

Unnamed: 0,example,intent
0,Como visualizo o meu holerite?,Solicitar_Holerite
1,Qual a essencia do CoE?,Foco_Cliente
2,Onde encontro informações sobre o processo par...,Alterar_EnquadramentoMerito


In [77]:
# Preview the first three rows of the test frame.
df_test.head(3)

Unnamed: 0,example,intent
0,quais os assuntos você trata,Bot_Capabilities
1,"preciso trocar uma peça do meu carro, como dev...",Veiculo_Problema
2,"Como modificar a senha do cartão Alelo VR, VA?",Esquecer_Senha_Vale_RefeicaoAlimentacao


# Using Elasticsearch to prepare the text data

In [79]:
import time
import spacy

start_time = time.time()

# Template body for the Elasticsearch analyze call: classic tokenizer followed by
# lowercasing, ASCII folding, Portuguese stop-word removal and Brazilian stemming.
# The "text" field is filled in per example.
query = {
  "tokenizer" : "classic",
  "filter" : [
              "lowercase",
              "asciifolding",
              {"type": "stop", "stopwords": "_portuguese_"},
              {"type": "stemmer", "language": "brazilian"}],
  "text" : ""
}


def _analyze_examples(examples):
    """Run each example through the analyzer and rejoin its tokens with spaces.

    Parameters
    ----------
    examples : iterable of str
        Raw example sentences.

    Returns
    -------
    list of str
        One analyzed string per input example, in input order.
    """
    analyzed_texts = []
    for example in examples:
        # Build a fresh request body so the shared `query` template is never mutated.
        body = dict(query, text=example)
        result = es.indices.analyze(index=index.index_name, body=body)
        analyzed_texts.append(" ".join(token["token"] for token in result["tokens"]))
    return analyzed_texts


# One request is issued per example, so this cell is slow on large frames.
train_examples_text_without_stopwords = _analyze_examples(df_train.example)
test_examples_text_without_stopwords = _analyze_examples(df_test.example)

end_time = time.time()
print("--- %s seconds ---" % round(end_time - start_time))
test_examples_text_without_stopwords[:1]

--- 120 seconds ---


['qua assunt voc trat']

# One-Hot Encoding the intents

In [80]:
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder

def intent_to_onehot(intents_names):
    """One-hot encode a collection of intent names.

    Parameters
    ----------
    intents_names : iterable
        Intent names to encode; each name becomes one row of the result.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per input name, in iteration order, as
        produced by ``OneHotEncoder.fit_transform``.
    """
    # The encoder expects 2-D input: one row per sample, a single feature column.
    data = asarray([[intent_name] for intent_name in intents_names])
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
    # Try the current keyword first and fall back for older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    return encoder.fit_transform(data)

In [81]:
# Intent label of every example: training rows first, then test rows.
intents_name = df_train.intent.tolist() + df_test.intent.tolist()
# Distinct intent labels across both splits.
set_intents_name = set(intents_name)

In [82]:
# Encode each distinct intent once, then pair every intent with its one-hot row.
# The same set object is iterated both times, so names and rows line up.
set_intents_name = set(intents_name)
intents_name_as_onehot = intent_to_onehot(set_intents_name)
dic_onehot_intents = dict(zip(set_intents_name, intents_name_as_onehot))
dic_onehot_intents["General_Negative_Feedback"]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [83]:
import numpy as np

# Reverse lookup: position of the hot column (as a string key) -> intent name.
intent_dictionary = {
    str(np.argmax(onehot)): intent
    for intent, onehot in dic_onehot_intents.items()
}

In [84]:
import json

# Persist the index -> intent lookup as JSON. The `with` block closes the file
# on exit, so no explicit close() is needed.
with open("data/yarin_intents.json", 'w') as f:
    json.dump(intent_dictionary, f)

# Apply Hash Encoding on Examples

"*The class FeatureHasher is a high-speed, low-memory vectorizer that uses a technique known as feature hashing, or the “hashing trick”. Instead of building a hash table of the features encountered in training, as the vectorizers do, instances of FeatureHasher apply a hash function to the features to determine their column index in sample matrices directly. The result is increased speed and reduced memory usage, at the expense of inspectability; the hasher does not remember what the input features looked like and has no inverse_transform method.*" [6.2.2. Feature hashing](https://scikit-learn.org/stable/modules/feature_extraction.html#feature-hashing)

In [33]:
from sklearn.feature_extraction.text import HashingVectorizer

# Build the full list of analyzed examples here so the cell runs on a fresh kernel.
# Training rows come first and test rows after, matching the order of `intents_name`.
examples_text_without_stopwords = (
    train_examples_text_without_stopwords + test_examples_text_without_stopwords
)

# create the transform
vectorizer = HashingVectorizer(n_features=20)
# encode document
vector = vectorizer.transform(examples_text_without_stopwords)
# summarize encoded vector
print(vector.shape)
encoded_examples = vector.toarray()
print(encoded_examples)

(625, 20)
[[ 0.         -0.43643578 -0.21821789 ... -0.21821789  0.43643578
   0.        ]
 [-0.37796447 -0.37796447 -0.37796447 ...  0.          0.37796447
   0.        ]
 [ 0.         -0.57735027  0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ... -0.57735027  0.
   0.        ]
 [ 0.          0.          0.         ... -0.70710678  0.
   0.        ]
 [ 0.         -0.57735027  0.57735027 ... -0.57735027  0.
   0.        ]]


In [34]:
# Features are the hashed examples; labels are the one-hot row of each example's
# intent, looked up in the same order as `intents_name`.
X = encoded_examples
y = [dic_onehot_intents[intent_label] for intent_label in intents_name]

print(len(X), len(y))

625 625


In [35]:
import numpy as np

# Turn the list of one-hot rows into a single 2-D array.
y = np.stack(y)

# Save the data. Each `with` block closes its file on exit, so no explicit
# close() call is needed.
with open('data/X_hash_all_intents.npy', 'wb') as f:
    np.save(f, X)

with open('data/y_hash_all_intents.npy', 'wb') as f:
    np.save(f, y)