In [1]:
from elastic_db.elasticsearch import elastic_conection, NLPmodelIndex

es = elastic_conection()
index = NLPmodelIndex(es=es, workspace_id="dc1e7b3d-9137-4a20-a99c-d0d2029ef170-cedae")
index.index_name

'nlp_model-dc1e7b3d-9137-4a20-a99c-d0d2029ef170-cedae-02022021092147'

In [2]:
index.workspace_id

'dc1e7b3d-9137-4a20-a99c-d0d2029ef170-cedae'

# Read Data

In [2]:
import pandas as pd


df_train = pd.read_csv("data/train_data_cedae.csv", sep=",")
df_test = pd.read_csv("data/test_data_cedae.csv", sep=",")

In [3]:
df_train.head(3)

Unnamed: 0,example,intent
0,A água está barrenta,Reportar_QualidadeDeAgua
1,quero meu extrato de pagamento dos últimos meses,Consultar_ContasPagas
2,Contaminação da agua,Reportar_QualidadeDeAgua


In [4]:
df_test.head(3)

Unnamed: 0,example,intent
0,quero ir numa agência da CEDAE,Localizar_AgenciasCEDAE
1,Boa Noite,General_Greetings
2,quero um relatório dos meus pagamentos,Consultar_ContasPagas


In [5]:
train_examples = df_train.example.tolist()
stard_idx_test = len(train_examples)
all_examples = train_examples + df_test.example.tolist()

# Elastic to prepare text data

In [6]:
query = {
  "tokenizer" : "classic",
  "filter" : [
              "lowercase",
              "asciifolding",
              {"type": "stop", "stopwords": "_portuguese_"},
              {"type": "stemmer", "language": "brazilian"}],
  "text" : ""
}

examples_text_without_stopwords = []
for example in all_examples:
    query["text"] = example
    result = es.indices.analyze(index=index.index_name, body=query)
    new_text = " ".join([token["token"] for token in result["tokens"]])
    examples_text_without_stopwords.append(new_text)

examples_text_without_stopwords[:5]

['agu barrent',
 'quer extrat pagament ultim mes',
 'contamin agu',
 'convers rob',
 'quer consult cpf']

# One Hot Enconding the intents

In [7]:
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder

def intent_to_onehot(intents_names):
    data = asarray([ [intent_name]  for intent_name in intents_names])
    # define one hot encoding
    encoder = OneHotEncoder(sparse=False)
    # transform data
    intents_names_as_onehot = encoder.fit_transform(data)
    return intents_names_as_onehot

In [8]:
intents_name = df_train.intent.tolist() + df_test.intent.tolist()
set_intents_name = set(intents_name)

In [9]:
set_intents_name = set(intents_name)
intents_name_as_onehot = intent_to_onehot(set_intents_name)
dic_onehot_intents = {intent: onehot for intent, onehot in zip(set_intents_name, intents_name_as_onehot)}
dic_onehot_intents["Localizar_AgenciasCEDAE"]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [10]:
import numpy as np

np.save('data/dic_onehot_intents_tfidf_cedae.npy',dic_onehot_intents)

In [11]:
intent_dictionary = {str(np.argmax(value)):key for key, value in zip(dic_onehot_intents.keys(),
                                            dic_onehot_intents.values())}

In [12]:
import json

with open("data/intents_dictionary_tfidf_cedae.json", 'w') as f:
      json.dump(intent_dictionary, f)
    
f.close()

# TF-IDF Feature encoding

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

def encode_tfidf(all_examples):
    # create the transform
    vectorizer = TfidfVectorizer()
    # tokenize and build vocab
    tf_idf_corpus = vectorizer.fit(all_examples)
    features = tf_idf_corpus.transform(all_examples)
    return tf_idf_corpus, features.toarray()

In [14]:
train_examples_text_without_stopwords = examples_text_without_stopwords[:stard_idx_test]
print(len(train_examples_text_without_stopwords))
print(train_examples_text_without_stopwords[-1])

221
previsa volt agu


In [15]:
test_examples_text_without_stopwords = examples_text_without_stopwords[stard_idx_test:]
print(len(test_examples_text_without_stopwords))
print(test_examples_text_without_stopwords[-1])

106
agu cor amarel


In [16]:
start_test_idx = len(train_examples_text_without_stopwords)
all_examples = train_examples_text_without_stopwords + test_examples_text_without_stopwords
tf_idf_corpus, tf_idf = encode_tfidf(all_examples)

In [17]:
tf_idf.shape

(327, 344)

In [18]:
import pickle

#  save tf_idf_corpus instance
pickle.dump(tf_idf_corpus, open("nlp_models/tf_idf_corpus_cedae.pickle", "wb"))

In [19]:
import numpy as np

X_train = np.stack(tf_idf[:start_test_idx])
y_train = np.stack([dic_onehot_intents[intent] for intent in df_train.intent])

X_test = np.stack(tf_idf[start_test_idx:])
y_test = np.stack([dic_onehot_intents[intent] for intent in df_test.intent])

In [20]:
print(X_train.shape[0], len(train_examples_text_without_stopwords))

221 221


In [21]:
print(X_test.shape[0], len(test_examples_text_without_stopwords))

106 106


# Saving prepared data

In [22]:
import numpy as np


with open('data/X_train_tdidf_cedae.npy', 'wb') as f:
    np.save(f, X_train)
    f.close()
    
with open('data/y_train_tdidf_cedae.npy', 'wb') as f:
    np.save(f, y_train)
    f.close()
    
with open('data/X_test_tdidf_cedae.npy', 'wb') as f:
    np.save(f, X_test)
    f.close()
    
with open('data/y_test_tdidf_cedae.npy', 'wb') as f:
    np.save(f, y_test)
    f.close()