In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from model_selection import train_test_split_per_class, cross_validation_per_class
from sklearn.metrics import classification_report
from joblib import dump
from statistics import *

In [2]:
tf.config.run_functions_eagerly(True)

In [3]:
df_v1 = pd.read_excel('preprocessed_lemmatization.xlsx')
df_v2 = pd.read_excel('preprocessed_stemming.xlsx')

In [4]:
df_v1.fillna('', inplace=True)
df_v2.fillna('', inplace=True)

In [5]:
df_v1.head()

Unnamed: 0,antecedente,comportamento,consequencia,processo,efeito
0,problema conflito relacionamento interpessoal,conversar aproximar expressar sentimento opinião,crítico represália,p+,tristeza
1,problema conflito relacionamento interpessoal,conversar aproximar expressar sentimento opinião,problema continuar,p+,insegurança
2,problema conflito relacionamento interpessoal,conversar aproximar expressar sentimento opinião,interessar atenção pessoa,ext,frustração
3,conflito relacionamento amoroso presença namorar,patrício converso namorar sobrar incomodar,mudo assunto preferir deixar acontecer,ext,tristeza frustração insegurança
4,conflito relacionamento amoroso presença namorar,patrício converso namorar sobrar incomodar,repetição problema ocorrência problema relacio...,p+,tristeza frustração insegurança


In [6]:
df_v2.head()

Unnamed: 0,antecedente,comportamento,consequencia,processo,efeito
0,problem conflit relacion interpesso,convers aproxim express sent opin,crít represál,p+,trist
1,problem conflit relacion interpesso,convers aproxim express sent opin,problem continu,p+,inseguranç
2,problem conflit relacion interpesso,convers aproxim express sent opin,inter atenç pesso,ext,frustr
3,conflit relacion amor presenç namor,patríc convers namor incomod,mud assunt pref deix cois acontec,ext,trist frustr inseguranç
4,conflit relacion amor presenç namor,patríc convers namor incomod,repet problem ocorr problem relacion,p+,trist frustr inseguranç


In [7]:
x_v1 = df_v1.drop('processo', axis=1)
y_v1 = df_v1['processo']

In [8]:
x_v2 = df_v2.drop('processo', axis=1)
y_v2 = df_v2['processo']

### Passo 1: Extração do atributo TF-IDF

In [9]:
tfidf_v1 = TfidfVectorizer()
tfidf_v2 = TfidfVectorizer()

In [10]:
x_concat_v1 = x_v1['antecedente'] + '\n' + x_v1['comportamento'] + '\n' + x_v1['consequencia'] + '\n' + x_v1['efeito']
x_concat_v2 = x_v2['antecedente'] + '\n' + x_v2['comportamento'] + '\n' + x_v2['consequencia'] + '\n' + x_v2['efeito']

In [11]:
tfidf_v1.fit(x_concat_v1)

TfidfVectorizer()

In [12]:
tfidf_v2.fit(x_concat_v2)

TfidfVectorizer()

In [13]:
x_tfidf_v1 = {
    'antecedente': tfidf_v1.transform(x_v1['antecedente']).toarray(),
    'comportamento': tfidf_v1.transform(x_v1['comportamento']).toarray(),
    'consequencia': tfidf_v1.transform(x_v1['consequencia']).toarray(),
    'efeito': tfidf_v1.transform(x_v1['efeito']).toarray()
}

In [14]:
x_tfidf_v2 = {
    'antecedente': tfidf_v2.transform(x_v2['antecedente']).toarray(),
    'comportamento': tfidf_v2.transform(x_v2['comportamento']).toarray(),
    'consequencia': tfidf_v2.transform(x_v2['consequencia']).toarray(),
    'efeito': tfidf_v2.transform(x_v2['efeito']).toarray()
}

In [15]:
x_features_v1 = np.stack(
    [
        x_tfidf_v1['antecedente'],
        x_tfidf_v1['comportamento'],
        x_tfidf_v1['consequencia'],
        x_tfidf_v1['efeito']
    ], 
    axis=2
)

In [16]:
new_shape = x_features_v1.shape + (1,)
x_features_v1 = np.reshape(x_features_v1, new_shape)

In [17]:
x_features_v2 = np.stack(
    [
        x_tfidf_v2['antecedente'],
        x_tfidf_v2['comportamento'],
        x_tfidf_v2['consequencia'],
        x_tfidf_v2['efeito']
    ], 
    axis=2
)

In [18]:
new_shape = x_features_v2.shape + (1,)
x_features_v2 = np.reshape(x_features_v2, new_shape)

In [19]:
print('Shape V1 = {}'.format(x_features_v1.shape))
print('Shape V2 = {}'.format(x_features_v2.shape))

Shape V1 = (185, 694, 4, 1)
Shape V2 = (185, 644, 4, 1)


### Passo 2: Mapear X para números

In [20]:
y_map = {
    'p-': 0,
    'p+': 1,
    'ext': 2,
    'r-': 3,
    'r+': 4
}

In [21]:
y_code_v1 = y_v1.map(y_map)
y_code_v2 = y_v2.map(y_map)

### Passo 3: Transformar em uma codificação "One hot"

In [22]:
x_train_v1, x_test_v1, y_train_v1, y_test_v1 = train_test_split_per_class(x_features_v1, y_code_v1.to_numpy())
x_train_v2, x_test_v2, y_train_v2, y_test_v2 = train_test_split_per_class(x_features_v2, y_code_v2.to_numpy())

In [23]:
y_train_one_hot_v1 = tf.one_hot(y_train_v1, 5)
y_test_one_hot_v1  = tf.one_hot(y_test_v1, 5)
y_train_one_hot_v2 = tf.one_hot(y_train_v2, 5)
y_test_one_hot_v2  = tf.one_hot(y_test_v2, 5)

### Passo 4.1: Rede Neural aplicada a base V1

In [24]:
model_v1 = tf.keras.Sequential([
    tf.keras.layers.Conv2D(64, 7, activation='relu', input_shape=(x_features_v1.shape[1],x_features_v1.shape[2], x_features_v1.shape[3]), padding='same'),
    tf.keras.layers.MaxPooling2D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax'),
])

In [25]:
model_v1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 694, 4, 64)        3200      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 347, 2, 64)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 44416)             0         
                                                                 
 dense (Dense)               (None, 128)               5685376   
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                        

In [26]:
model_v1.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])

In [27]:
model_v1.fit(x_train_v1, y_train_one_hot_v1, epochs=15, batch_size=32, verbose=1)

Epoch 1/15




Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x27dc32f4cd0>

In [28]:
results_v1 = model_v1.evaluate(x_test_v1, y_test_one_hot_v1)



### Passo 4.2: Rede Neural aplicada a base V2

In [29]:
model_v2 = tf.keras.Sequential([
    tf.keras.layers.Conv2D(64, 7, activation='relu', input_shape=(x_features_v2.shape[1],x_features_v2.shape[2], x_features_v2.shape[3]), padding='same'),
    tf.keras.layers.MaxPooling2D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax'),
])

In [30]:
model_v2.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
model_v2.fit(x_train_v2, y_train_one_hot_v2, epochs=15, batch_size=32, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x27dc50a1e50>

In [32]:
results_v2 = model_v2.evaluate(x_test_v2, y_test_one_hot_v2)



In [33]:
y_pred_v1 = np.argmax(model_v1.predict(x_test_v1), axis=1)
y_pred_v2 = np.argmax(model_v2.predict(x_test_v2), axis=1)

In [34]:
y_true_v1 = np.argmax(y_test_one_hot_v2.numpy(), axis=1)
y_true_v2 = np.argmax(y_test_one_hot_v2.numpy(), axis=1)

In [35]:
print(classification_report(y_true_v1, y_pred_v1))

              precision    recall  f1-score   support

           0       0.33      0.25      0.29         4
           1       0.72      0.87      0.79        15
           2       0.00      0.00      0.00         3
           3       0.93      0.93      0.93        14
           4       0.91      0.91      0.91        22

    accuracy                           0.81        58
   macro avg       0.58      0.59      0.58        58
weighted avg       0.78      0.81      0.79        58



In [36]:
print(classification_report(y_true_v2, y_pred_v2))

              precision    recall  f1-score   support

           0       0.33      0.25      0.29         4
           1       0.75      0.80      0.77        15
           2       0.00      0.00      0.00         3
           3       0.67      1.00      0.80        14
           4       0.88      0.68      0.77        22

    accuracy                           0.72        58
   macro avg       0.53      0.55      0.53        58
weighted avg       0.71      0.72      0.70        58



### Passo 5.1: Cross Validation com a base V1

In [37]:
results = cross_validation_per_class(x_features_v1, y_code_v1.to_numpy(), model_v1, 15, 'adam','binary_crossentropy',['accuracy'], 10, 32)

Fold 0/7




Fold 1/7
Fold 2/7
Fold 3/7
Fold 4/7
Fold 5/7
Fold 6/7


In [38]:
accuracy_v1 = list(map(list, zip(*results)))[1]

### Passo 5.2: Cross Validation com a base V2

In [39]:
results = cross_validation_per_class(x_features_v2, y_code_v2.to_numpy(), model_v2, 10, 'adam','binary_crossentropy',['accuracy'], 15, 32)

Fold 0/7
Fold 1/7
Fold 2/7
Fold 3/7
Fold 4/7
Fold 5/7
Fold 6/7


In [40]:
accuracy_v2 = list(map(list, zip(*results)))[1]

In [41]:
print("Média V1: {}".format(mean(accuracy_v1)))

Média V1: 0.813684710434505


In [42]:
print("Mediana V1: {}".format(median(accuracy_v1)))

Mediana V1: 0.8399999737739563


In [43]:
print("Desvio Padrão Populacional V1: {}".format(pstdev(accuracy_v1)))

Desvio Padrão Populacional V1: 0.07650151467457779


In [44]:
print("Desvio padrão amostral V1: {}".format(stdev(accuracy_v1)))

Desvio padrão amostral V1: 0.0826310799402304


In [45]:
print("Média V2: {}".format(mean(accuracy_v2)))

Média V2: 0.7899007371493748


In [46]:
print("Mediana V2: {}".format(median(accuracy_v2)))

Mediana V2: 0.800000011920929


In [47]:
print("Desvio Padrão Populacional V2: {}".format(pstdev(accuracy_v2)))

Desvio Padrão Populacional V2: 0.04870607197842037


In [48]:
print("Desvio padrão amostral V2: {}".format(stdev(accuracy_v2)))

Desvio padrão amostral V2: 0.05260857048835526


### Passo 6.1: Criar rede neural com base V1 completa (sem divisão de treino e teste)

In [49]:
y_one_hot_v1 = tf.one_hot(y_code_v1.to_numpy(), 5)

In [50]:
model_v1_final = tf.keras.models.clone_model(model_v1)

In [51]:
model_v1_final.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])

In [52]:
model_v1_final.fit(x_features_v1, y_one_hot_v1, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27dc4c239d0>

### Passo 6.2: Criar rede neural com base V2 completa (sem divisão de treino e teste)

In [53]:
y_one_hot_v2 = tf.one_hot(y_code_v2.to_numpy(), 5)

In [54]:
model_v2_final = tf.keras.models.clone_model(model_v2)

In [55]:
model_v2_final.compile(optimizer = 'adam', loss='binary_crossentropy', metrics=['accuracy'])

In [56]:
model_v2_final.fit(x_features_v2, y_one_hot_v2, epochs=15, batch_size=32, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x27dc337dd90>

### Passo 7: Salvar redes neurais

In [57]:
model_v1_final.save_weights('./checkpoints_lemmanization/my_model.h5')
model_v2_final.save_weights('./checkpoints_stemming/my_model.h5')

In [58]:
dump(tfidf_v1, './checkpoints_lemmanization/tfidf.joblib')
dump(tfidf_v2, './checkpoints_stemming/tfidf.joblib')

['./checkpoints_stemming/tfidf.joblib']