In [2]:
from elastic_db.elasticsearch import elastic_conection, NLPmodelIndex

# Obtain the client object from the project helper (presumably an Elasticsearch
# connection, judging by the helper's name — confirm in elastic_db.elasticsearch).
es = elastic_conection()
# Bind the NLP-model index wrapper to a single workspace; the id is hard-coded here.
index = NLPmodelIndex(es=es, workspace_id="dc1e7b3d-9137-4a20-a99c-d0d2029ef170")
# Show the wrapper's `index_name` attribute as the cell output.
index.index_name

'nlp_model-dc1e7b3d-9137-4a20-a99c-d0d2029ef170-03012021112419'

# Read Data

In [3]:
import pandas as pd
import json

# JSON text is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's default locale encoding (the data is Portuguese
# text, so a wrong default would corrupt accented characters).
with open("data/intents_dictionary.json", "r", encoding="utf-8") as f:
    intents_dictionary = json.load(f)

# Both splits are pipe-delimited CSV files.
df_train = pd.read_csv("data/train_data.csv", sep="|")
df_test = pd.read_csv("data/test_data.csv", sep="|")

In [4]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,example,intent
0,Como visualizo o meu holerite?,Solicitar_Holerite
1,Qual a essencia do CoE?,Foco_Cliente
2,Onde encontro informações sobre o processo par...,Alterar_EnquadramentoMerito
3,O Valor depositado do Cartão Alelo veio no Ref...,Valor_Depositado_Vale_RefeicaoAlimentacao_Errado
4,Bom dia,General_Greetings


In [5]:
# (rows, columns) of the training split.
df_train.shape

(390, 2)

In [6]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,example,intent
0,quais os assuntos você trata,Bot_Capabilities
1,"preciso trocar uma peça do meu carro, como dev...",Veiculo_Problema
2,"Como modificar a senha do cartão Alelo VR, VA?",Esquecer_Senha_Vale_RefeicaoAlimentacao
3,"Estou com o contra-cheque, mas o pagamento ain...",Contracheque_Recebido_Sem_Constar_Conta
4,"o pont fechou e o sistema está bloqueado, como...",Solicitar_Ponto_Pos_FechamentoReembolso


In [7]:
# (rows, columns) of the test split.
df_test.shape

(235, 2)

# Get lemmas

In [15]:
import spacy

nlp = spacy.load('pt_core_news_lg')

def keep_token(token):
    """Tell whether a token should contribute a lemma.

    A token is kept only when it is purely alphabetic and none of the
    rejecting flags (whitespace, punctuation, stop word, number-like) is set.
    """
    rejecting_flags = ("is_space", "is_punct", "is_stop", "like_num")
    return token.is_alpha and not any(getattr(token, flag) for flag in rejecting_flags)

def to_lemas(doc):
    """Return the lemmas of the tokens in ``doc`` that pass ``keep_token``.

    The result is a list of lemma strings in document order; tokens rejected
    by ``keep_token`` contribute nothing, so the list may be empty.
    """
    kept_tokens = filter(keep_token, doc)
    return [kept.lemma_ for kept in kept_tokens]

In [16]:
# Sanity check: run one sample sentence through the pipeline and display the
# lemmas that survive the token filter.
doc = nlp("Como modificar a senha do cartão Alelo VR, VA?")
to_lemas(doc)

['modificar', 'o', 'senha', 'cartão', 'Alelo', 'VR', 'VA']

In [17]:
# Turn every example sentence into its list of lemmas, one list per row.
train_examples = list(map(to_lemas, map(nlp, df_train.example)))
# Training rows come first in `all_examples`, so their count is also the offset
# of the first test row (the name's spelling is kept because later cells use it).
test_stard_index = len(train_examples)
test_examples = list(map(to_lemas, map(nlp, df_test.example)))
all_examples = [*train_examples, *test_examples]

# Embedding Encoding


[UD Portuguese Bosque](https://universaldependencies.org/treebanks/pt_bosque/index.html)

[Creating TF-IDF Weighted Word Embeddings](http://dsgeek.com/2018/02/19/tfidf_vectors.html)

In [22]:
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full

# Build the token <-> integer-id vocabulary from every lemmatized example
# (training and test rows together).
docs_dict = Dictionary(all_examples)
# Frequency-based pruning is left disabled; the call is kept here for reference.
#docs_dict.filter_extremes(no_below=20, no_above=0.2)
# Reassign ids so they are contiguous. NOTE(review): presumably has no effect while
# the filter above stays disabled, since nothing has been removed — confirm.
docs_dict.compactify()

In [24]:
import numpy as np

# Bag-of-words counts for every example (training rows first, then test rows).
docs_corpus = [docs_dict.doc2bow(doc) for doc in all_examples]
# Fit the IDF statistics on the training rows only, so document frequencies from the
# held-out test set do not leak into the features. Terms that occur only in test
# examples get no IDF entry and therefore a zero weight, which is what an unseen
# word would receive at inference time.
model_tfidf = TfidfModel(docs_corpus[:test_stard_index], id2word=docs_dict)
# Apply the fitted weighting to all rows, then densify: one row per example,
# one column per dictionary term.
docs_tfidf  = model_tfidf[docs_corpus]
docs_vecs   = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])

In [26]:
# (number of examples, dictionary size) of the dense TF-IDF matrix.
docs_vecs.shape

(625, 940)

In [27]:
import spacy

# Reuse the `nlp` pipeline already loaded earlier in the notebook (pt_core_news_lg)
# instead of loading the large model a second time.
# Look up the model's vector for each dictionary term, stacked in term-id order so
# that row i here lines up with column i of the TF-IDF matrix.
tfidf_emb_vecs = np.vstack([nlp(docs_dict[i]).vector for i in range(len(docs_dict))])

In [28]:
# (dictionary size, embedding dimension) of the per-term vector matrix.
tfidf_emb_vecs.shape

(940, 300)

In [30]:
# To get a TF-IDF weighted we do matrix multiplication
docs_emb = np.dot(docs_vecs, tfidf_emb_vecs) 
docs_emb.shape

(625, 300)

In [31]:
# The first `test_stard_index` rows are the training examples; the rest are test rows.
X_train, X_test = docs_emb[:test_stard_index], docs_emb[test_stard_index:]
print(len(X_train), len(X_test))

390 235


# One-Hot Encoding the Intents

In [32]:
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder

def intent_to_onehot(intents_names):
    """One-hot encode a collection of intent names.

    Parameters
    ----------
    intents_names : iterable of str
        Intent labels. One output row is produced per element, in iteration order.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array with one row per input name and one column per distinct
        name; the column order is the one chosen by the encoder.
    """
    # The encoder expects a 2-D (n_samples, n_features) input, so wrap each name
    # in its own single-element row.
    data = asarray([[intent_name] for intent_name in intents_names])
    # Leave the encoder's output format at its default (sparse) and densify explicitly.
    # The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
    # removed in 1.4, so either spelling fails on some installed versions.
    encoder = OneHotEncoder()
    intents_names_as_onehot = encoder.fit_transform(data).toarray()
    return intents_names_as_onehot

In [33]:
# Pool the labels of both splits so the encoding covers every intent seen in either one.
intents_name = [*df_train.intent.tolist(), *df_test.intent.tolist()]
set_intents_name = {*intents_name}

In [34]:
set_intents_name = set(intents_name)
# Walk the labels in sorted order. Iteration order of a set of strings changes between
# interpreter runs (hash randomization), which would make the ordering of this mapping,
# and of anything serialized from it, differ from run to run.
ordered_intents = sorted(set_intents_name)
intents_name_as_onehot = intent_to_onehot(ordered_intents)
# Intent name -> its one-hot row.
dic_onehot_intents = dict(zip(ordered_intents, intents_name_as_onehot))
# Spot-check one intent's vector as the cell output.
dic_onehot_intents["General_Negative_Feedback"]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [35]:
import numpy as np

# Invert the lookup: the position of the 1 in each one-hot vector, rendered as a
# string, maps back to the intent name.
intent_dictionary = {
    str(np.argmax(onehot)): intent
    for intent, onehot in dic_onehot_intents.items()
}

In [36]:
import json

# Persist the index -> intent lookup as JSON. The `with` block closes the file on
# exit, so no explicit close() call is needed afterwards.
with open("data/yarin_intents.json", 'w') as f:
    json.dump(intent_dictionary, f)

In [39]:
import numpy as np

# Look up the one-hot row of every label and stack the rows into one matrix per
# split (one row per example).
y_train = np.stack([dic_onehot_intents[intent] for intent in df_train.intent])
y_test = np.stack([dic_onehot_intents[intent] for intent in df_test.intent])
print(len(y_train), len(y_test))

390 235


# Saving X and y data

In [40]:
# Save the feature and label matrices as .npy files.
# Each `with` block closes its file on exit, so the explicit close() calls the
# original made inside the blocks were redundant and have been dropped.
arrays_to_save = {
    'data/X_train_embedding.npy': X_train,
    'data/y_train_embedding.npy': y_train,
    'data/X_test_embedding.npy': X_test,
    'data/y_test_embedding.npy': y_test,
}
for path, array in arrays_to_save.items():
    with open(path, 'wb') as f:
        np.save(f, array)