# Redes Neurais

Esse notebook tratará da implementação, do treino e da avaliação inicial de redes neurais recorrentes (especificamente de células LSTM) na busca pela solução do seguinte problema de classificação: identificação de de viés político em textos. O tamanho desses textos é algo a ser arbitrado e até mesmo utilizado como comparação: podemos utilizar somente as sentenças que foram marcadas por conter viés, ou mesmo o texto na íntegra da notícia.

In [118]:
import numpy as np
import pickle
import ast
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit 
from tensorflow import keras
import tensorflow as tf

- Carregamento do dataset e do vocabulário, totalmente perfilados pelo notebook de pré-processamento.

In [124]:
data = pd.read_csv('../data/final_dataset.csv')
data[['sentence','article','biased_words4']] = data[['sentence','article','biased_words4']].apply(lambda x:x.apply(ast.literal_eval))
data.loc[1,'sentence'] 



SyntaxError: invalid syntax (<unknown>, line 1)

In [90]:
data.columns

Index(['sentence', 'outlet_alternet', 'outlet_breitbart', 'outlet_federalist',
       'outlet_fox-news', 'outlet_huffpost', 'outlet_msnbc', 'outlet_reuters',
       'outlet_usa-today', 'topic_abortion', 'topic_coronavirus',
       'topic_elections-2020', 'topic_environment', 'topic_gender',
       'topic_gun-control', 'topic_immigration',
       'topic_international-politics-and-world-news', 'topic_middle-class',
       'topic_sport', 'topic_student-debt', 'topic_trump-presidency',
       'topic_vaccines', 'topic_white-nationalism', 'type_center', 'type_left',
       'type_right', 'num_sent', 'Label_bias_Biased',
       'Label_bias_No agreement', 'Label_bias_Non-biased', 'article',
       'biased_words4'],
      dtype='object')

In [91]:
with open('../data/vocabulary_table.pkl', 'rb') as f:
    vocab_dict = pickle.load(f)

keys = list(vocab_dict.keys())
values = list(vocab_dict.values())
n_oov = 5000

vocab_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(keys, values, tf.string, tf.int64),
    n_oov
)


## 1 - Separando o dataset

In [92]:
label_columns = ['Label_bias_Biased','Label_bias_No agreement', 'Label_bias_Non-biased']
labels = data[label_columns]
features = data.drop(label_columns, axis=1)

In [93]:
def train_valid_test_split(features, labels):
    """ Retorna uma lista de tuplas contendo os datasets de features e de labels para cada segmento (treino, validação, teste) """

    # Treino-val e Teste
    shuffle_train_test = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=892)
    train_val_indexes, test_indexes = next(shuffle_train_test.split(features.values, labels.values))
    train_val_df, train_val_labels = features.iloc[train_val_indexes], labels.iloc[train_val_indexes]
    test_df, test_labels = features.iloc[test_indexes], labels.iloc[test_indexes]

    # Treino e Validação
    shuffle_train_validate = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=124)
    train_indexes, validation_indexes = next(shuffle_train_validate.split(train_val_df.values,train_val_labels.values))
    train_df, train_labels = features.iloc[train_indexes], labels.iloc[train_indexes]
    validation_df, validation_labels = features.iloc[validation_indexes], labels.iloc[validation_indexes]


    return [(train_df, train_labels), (validation_df, validation_labels), (test_df, test_labels)]

In [94]:
train, validation, test = train_valid_test_split(features,labels)

train_df, train_labels = train
validation_df, validation_labels = validation
test_df, test_labels = test

print(f"Treino:{train_df.shape} ----- Validação: {validation_df.shape} ------ Teste: {test_df.shape}")

Treino:(1014, 29) ----- Validação: (338, 29) ------ Teste: (338, 29)


## 2 - Estruturando arquitetura da rede neural

In [95]:
embed_size = 128

# parte textual
text_input_length = len(train_df.loc[0,'article']) + len(train_df.loc[0,'biased_words4']) + len(train_df.loc[0,'sentence'])
text_input = keras.layers.Input(shape=(text_input_length,))
embedding_layer = keras.layers.Embedding(input_dim=int(vocab_table.size())+5000,
                                         output_dim=embed_size,
                                         mask_zero=True)(text_input)

embedding_output = keras.layers.Bidirectional(keras.layers.LSTM(embed_size))(embedding_layer) 


# outras features 
feature_input_length = len(train_df.columns) - 3
feature_input = keras.layers.Input(shape=(feature_input_length,))

concatenated = keras.layers.concatenate([embedding_output,feature_input], axis=-1)
reshaped = keras.layers.Reshape((-1, concatenated.shape[-1]))(concatenated)  # Add time dimension
gru_layer1 = keras.layers.GRU(64)(reshaped)
output = keras.layers.Dense(3, activation="softmax")(gru_layer1)

model = tf.keras.Model(inputs=[text_input,feature_input], outputs=output)
#model.summary()
model.compile(loss="categorical_crossentropy",optimizer="adam")


2023-06-14 01:32:20.659539: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-14 01:32:20.665815: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-14 01:32:20.667932: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [96]:
text_cols = ['sentence','article','biased_words4']
others = list(filter(lambda x: x if x not in text_cols else [],train_df.columns))

train_df = pd.concat([train_df[text_cols], train_df[others]],axis=1).astype(int)
validation_df = pd.concat([validation_df[text_cols], validation_df[others]],axis=1)
test_df = pd.concat([test_df[text_cols], test_df[others]],axis=1)

In [117]:
#train_df = train_df.explode('sentence')

SyntaxError: invalid syntax (<string>, line 1)

In [102]:
train_df['outlet_alternet'].head(2)

93     0
526    0
Name: outlet_alternet, dtype: int64

In [103]:
model.fit(train_df, train_labels, batch_size=32, epochs=10, validation_data=(validation_df, validation_labels))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).