# Redes Neurais e sua Implementação

In [923]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [924]:
import string
import re
import nltk
import ast
import pandas as pd
import numpy as np
import tensorflow as tf
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [925]:
df = pd.read_csv("./data/final_dataset.csv",converters={'biased_words4':ast.literal_eval})

In [926]:
df.columns

Index(['sentence', 'news_link', 'outlet', 'topic', 'type', 'group_id',
       'num_sent', 'Label_bias', 'Label_opinion', 'article', 'biased_words4',
       'full_article'],
      dtype='object')

## 1 - Features Importantes
- Label_bias (categórica): Indica se o texto foi classificado como enviesado, não-enviesado ou se não foi possível atingir um consenso quanto a classificação.
- Label_opinion (categórica): Indica de que modo o viés se manifesta na percepção dos entrevistados; especificamente separando casos de exposição de opinião do autor ou com fatos que corroborem um viés. (Um pouco fuzzy demais, talvez?)
- Biased_words(vetor): Indica as palavras marcadas como "denunciantes" da presença de viés. 
- Topic(categórica): Indica o assunto do texto, dentro das categorias PREENCHER AQUI   

## 2 - Pré-Processamento

### 2.1 - Básico textual (remoção de stopwords, filtro de expressões com números, etc)

- A priori, vamos considerar como tokens todas as palavras que:
    - tenham mais do que 3 letras
    - não possuam números
    - não estejam nas stopwords do inglês
    
- Vamos remover os acentos e, mediante a escolha do usuário, aplicar um stemmer.

In [927]:
digit_pattern = r'\d+(\.\d+)?'
solo_quotations_pattern = pattern = r"^(?:' '|\\" "|\`\`)$"

remove_quotation = r"'\b|\b'\s|\s'\b|\"\b|\b\"\s|\s\"\b|``\b|\b``\s|\s``\b" # como em "America is Great"
remove_symbols_only = r'\b[^\w\s]+\b' # como em '--'
remove_symbols_if_aside = r'\b[^\w\s]|[^\w\s]\b' # como em 'dog-'

remove_patterns = remove_quotation,remove_symbols_if_aside, remove_symbols_only

stop_words = set(stopwords.words('english')) # talvez o set deixe mais rápido

def preprocess_basic_text(data,col):
    
    prepped_texts = []

    for s in data[col]:
        # Remove acentos e põe tudo para lower case
        s = unidecode(s).lower()
        tokens = word_tokenize(s,language="english")

        tokens = [
            re.sub('|'.join(remove_patterns),'',t)
            for t in tokens 
            if not bool(re.search(digit_pattern,t))
            and t not in string.punctuation
            and t not in stop_words
        ]

        tokens = [
            t for t in tokens
            if not bool(re.match(solo_quotations_pattern,t)) 
            and len(t) >= 3
        ]

        prepped_texts.append(tokens)

    data[col] = prepped_texts

    return df 

df_functions_test_copy = df.copy()
df_functions_test_copy = preprocess_basic_text(df_functions_test_copy,'sentence')
df_functions_test_copy['sentence'].head()


0    YouTube is making clear there will be no “birt...
1    The increasingly bitter dispute between Americ...
2    So while there may be a humanitarian crisis dr...
3    A professor who teaches climate change classes...
4    The World Anti-Doping Agency on Tuesday said t...
Name: sentence, dtype: object

### 2.1 - Stemmers

- Temos o objetivo de avaliar como diferentes estratégias de stemming afetam os resultados do treinamento. A função a seguir visa abstrair esse passo para um etapa separada de pré-processamento. 
- Serão testados os stemmer de Porter e de Lancaster, além da lematização via WordNet. Vamos também obter os resultados do modelo quando utilizados sem qualquer stemmer. 

In [928]:
def apply_stemmer_individual(sentence, stemmer_type:str):

    # Função a ser executada para cada sentença, com expectativa de uso em algum filtro, por exemplo

    stemmer = {}
    lemmatizer = {}

    if stemmer_type == "Porter":
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in sentence]

    elif stemmer_type == "Lancaster":
        stemmer = LancasterStemmer()
        return [stemmer.stem(token) for token in sentence]
    
    else:  # "Wordnet":
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.stem(token) for token in sentence]


- Aplicando stemmer para teste da função 

In [929]:
df_functions_test_copy['sentence'] = df_functions_test_copy['sentence'].apply(lambda x: apply_stemmer_individual(x,"Porter"))
df_functions_test_copy['sentence'].head()

0    [y, o, u, t, u, b, e,  , i, s,  , m, a, k, i, ...
1    [t, h, e,  , i, n, c, r, e, a, s, i, n, g, l, ...
2    [s, o,  , w, h, i, l, e,  , t, h, e, r, e,  , ...
3    [a,  , p, r, o, f, e, s, s, o, r,  , w, h, o, ...
4    [t, h, e,  , w, o, r, l, d,  , a, n, t, i, -, ...
Name: sentence, dtype: object

In [930]:
# generalizando a aplicação pra todas as instâncias
def apply_stemmer_for_all(data,cols,stemmer_flag=False,stemmer_type=None):
  
  if stemmer_flag == False:
    return data

  for col in cols:
    data[col] = data[col].apply(lambda x: apply_stemmer_individual(x,stemmer_type))
  return data

- Aplicando stemmer nas palavras enviesadas também

In [931]:
df_functions_test_copy['biased_words4'] = df_functions_test_copy['biased_words4'].apply(lambda x: apply_stemmer_individual(x,"Porter"))
df_functions_test_copy['biased_words4'][:5]

0    [belat, birther]
1            [bitter]
2             [crisi]
3           [legitim]
4                  []
Name: biased_words4, dtype: object

### 2.3 - Separando o dataset

In [932]:
def train_valid_test_split(features, labels):
    """ Retorna uma lista de tuplas contendo os datasets de features e de labels para cada segmento (treino, validação, teste) """
    
    # Treino-val e Teste
    shuffle_train_test = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=892)
    train_val_indexes, test_indexes = next(shuffle_train_test.split(features.values, labels.values))
    train_val_df, train_val_labels = features.iloc[train_val_indexes], labels.iloc[train_val_indexes]
    test_df, test_labels = features.iloc[test_indexes], labels.iloc[test_indexes]

    # Treino e Validação
    shuffle_train_validate = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=124)
    train_indexes, validation_indexes = next(shuffle_train_validate.split(train_val_df.values,train_val_labels.values))
    train_df, train_labels = features.iloc[train_indexes], labels.iloc[train_indexes]
    validation_df, validation_labels = features.iloc[validation_indexes], labels.iloc[validation_indexes]


    return [(train_df, train_labels), (validation_df, validation_labels), (test_df, test_labels)]


In [933]:
train, val, test = train_valid_test_split(df_functions_test_copy,df_functions_test_copy['Label_bias'])
train[0].shape

(848, 12)

### 2.4 - Encoding (palavras)

- Para representar as palavras, vamos criar um encoding com base na frequência apresentada no corpus.
- Utilizaremos alguns números para representar determinadas semânticas:
  - padding("pad"), necessário para deixar todas as entradas com mesmo tamanho, terá o valor 0.
  - start-of-sequence("sos"), usado para identificar o ínicio de cada sentença, terá o número 1
  - unknown, para marcar palavras desconhecidas, terá o número 2

In [934]:
train_df_copy = train[0]

In [935]:
SOS = "<sos>"
PADDING = "<pad>"
UNKNOWN = "<ukn>"

In [936]:
def add_padding_and_sos(data):

    max_length = max(len(seq) for seq in data['sentence'])
    for index,row in data.iterrows():
      sentence = [SOS] + row['sentence']
      paddings = (max_length - len(row['sentence'])) * [PADDING]
      sentence += paddings
      data.at[index,'sentence'] = sentence
    
    return data

train_df_copy = add_padding_and_sos(train_df_copy)

In [937]:
print(train_df_copy.loc[190,'sentence'])

['<sos>', 'e', 'v', 'e', 'n', ' ', 't', 'h', 'o', 'u', 'g', 'h', ' ', 't', 'h', 'u', 'n', 'b', 'e', 'r', 'g', ' ', 'i', 's', ' ', 'c', 'a', 'p', 'a', 'b', 'l', 'e', ' ', 'o', 'f', ' ', 'd', 'o', 'i', 'n', 'g', ' ', 't', 'h', 'i', 'n', 'g', 's', ' ', 'm', 'a', 'n', 'y', ' ', 'o', 'n', ' ', 't', 'h', 'e', ' ', 'r', 'i', 'g', 'h', 't', ' ', 'r', 'e', 'f', 'u', 's', 'e', ' ', 't', 'o', ' ', 'd', 'o', ' ', '—', ' ', 's', 'u', 'c', 'h', ' ', 'a', 's', ' ', 'r', 'e', 'a', 'd', ' ', 'a', 'n', 'd', ' ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd', ' ', 's', 'c', 'i', 'e', 'n', 't', 'i', 'f', 'i', 'c', ' ', 'e', 'v', 'i', 'd', 'e', 'n', 'c', 'e', ' ', '—', ' ', 'r', 'i', 'g', 'h', 't', '-', 'w', 'i', 'n', 'g', 'e', 'r', 's', ' ', 'a', 'r', 'g', 'u', 'e', 'd', ' ', 't', 'h', 'a', 't', ' ', 't', 'h', 'u', 'n', 'b', 'e', 'r', 'g', ' ', 'a', 'n', 'd', ' ', 'h', 'e', 'r', ' ', 'f', 'e', 'l', 'l', 'o', 'w', ' ', 'y', 'o', 'u', 't', 'h', ' ', 'a', 'c', 't', 'i', 'v', 'i', 's', 't', 's', ' ', 'c', 

- Construindo o vocabulário com base na frequência de aparecimento.
  - *Obs*: note a colocação de determinadas tags logo no ínicio

In [938]:
def create_vocabulary(data):
  """ Retorna uma lookup table (StaticVocabularyTable) """  

  vocab = Counter()
  for sentence in train_df_copy['sentence']:  
      vocab.update(sentence)

  words = tf.constant([PADDING] + 
                      [SOS] + 
                      [UNKNOWN] + 
                      [word for word in vocab.keys() if word != PADDING and word != SOS]
  )
  words_ids = tf.range(len(vocab)+1, dtype=tf.int64) # +1 porque unknown é o único não mapeado
  vocab_init = tf.lookup.KeyValueTensorInitializer(words,words_ids)
  num_oov_buckets = 5000 # bucket para palavras desconhecidas
  vocab_table = tf.lookup.StaticVocabularyTable(vocab_init,num_oov_buckets)
  
  return vocab_table

- Demonstração de como estão os encodings de algumas palavras. 

In [939]:
vocab_table = create_vocabulary(train_df_copy)
vocab_table.lookup(tf.constant([b'nice trump move'.split(b' ')]))

<tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[2152, 3826,  846]])>

- Convertendo os tokens para o índice na tabela (tanto da sentença, como das palavras que denotam viés)

In [940]:
def convert_words_to_freq(data, vocab):

  for index, row in data.iterrows():
    
    sentence = tf.constant(row['sentence'])
    biased_words = tf.constant(row['biased_words4'])

    sentence_ided = vocab_table.lookup(sentence).numpy()
    
    biased_words_ided = vocab_table.lookup(biased_words).numpy() if len(biased_words) > 0 else []
    
    data.at[index, 'sentence'] = sentence_ided
    data.at[index, 'biased_words4'] = biased_words_ided


  return data

train_df_copy = convert_words_to_freq(train_df_copy, vocab_table)
train_df_copy.head(5)

Unnamed: 0,sentence,news_link,outlet,topic,type,group_id,num_sent,Label_bias,Label_opinion,article,biased_words4,full_article
129,"[1, 3, 4, 5, 6, 7, 8, 9, 4, 10, 3, 11, 6, 12, ...",https://www.breitbart.com/politics/2020/01/15/...,breitbart,student-debt,right,8,1,Non-biased,Entirely factual,A solid majority of student loan borrowers in ...,[],('ProPublica is a Pulitzer Prize-winning inves...
684,"[1, 3, 4, 28, 6, 7, 7, 4, 12, 17, 7, 17, 3, 5,...",https://www.breitbart.com/politics/2019/01/15/...,breitbart,abortion,right,41,1,Non-biased,Entirely factual,A poll released Tuesday finds 75 percent of Am...,[2779],"('Discover Thomson Reuters', 'By Reuters Staff..."
538,"[1, 8, 18, 4, 24, 25, 33, 29, 31, 4, 13, 23, 1...",https://www.huffpost.com/entry/muslim-teen-dis...,huffpost,sport,left,32,1,Non-biased,Entirely factual,HuffPost is part of Verizon Media. We and our ...,[5004],"('', '', ""Despite a pandemic, one-sided presid..."
573,"[1, 3, 7, 13, 23, 6, 16, 26, 23, 4, 16, 27, 5,...",https://www.alternet.org/2019/09/everything-yo...,alternet,sport,left,34,1,Biased,Expresses writer’s opinion,"Since 1983, Kim Karsh has helped baseball team...",[3372],('Chinese state media outlets have been bursti...
11,"[1, 19, 16, 13, 4, 6, 18, 17, 4, 26, 7, 3, 12,...",https://www.nbcnews.com/news/world/wave-vaccin...,msnbc,vaccines,left,1,1,Biased,No agreement,LONDON — The coronavirus crosses borders witho...,[2871],"('', 'Profile', 'Sections', 'tv', 'Featured', ..."


### 2.5 - One-hot encoding para features categóricas

In [941]:
def cats_to_one_hot(data,column_names):
  for col in column_names:
    df_encoded = pd.get_dummies(df[col], prefix=col)
    cat_col_index = data.columns.get_loc(col)
    data = pd.concat([data.iloc[:,:cat_col_index], df_encoded, data.iloc[:,cat_col_index+1:]],axis=1)
    

  return data

In [942]:
train_df_copy = cats_to_one_hot(train_df_copy, column_names=['topic','outlet','type','Label_bias','Label_opinion'])
train_df_copy.head(3)

Unnamed: 0,sentence,news_link,outlet_alternet,outlet_breitbart,outlet_federalist,outlet_fox-news,outlet_huffpost,outlet_msnbc,outlet_reuters,outlet_usa-today,...,Label_bias_Biased,Label_bias_No agreement,Label_bias_Non-biased,Label_opinion_Entirely factual,Label_opinion_Expresses writer’s opinion,Label_opinion_No agreement,Label_opinion_Somewhat factual but also opinionated,article,biased_words4,full_article
129,"[1, 3, 4, 5, 6, 7, 8, 9, 4, 10, 3, 11, 6, 12, ...",https://www.breitbart.com/politics/2020/01/15/...,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0,A solid majority of student loan borrowers in ...,[],('ProPublica is a Pulitzer Prize-winning inves...
684,"[1, 3, 4, 28, 6, 7, 7, 4, 12, 17, 7, 17, 3, 5,...",https://www.breitbart.com/politics/2019/01/15/...,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0,A poll released Tuesday finds 75 percent of Am...,[2779],"('Discover Thomson Reuters', 'By Reuters Staff..."
538,"[1, 8, 18, 4, 24, 25, 33, 29, 31, 4, 13, 23, 1...",https://www.huffpost.com/entry/muslim-teen-dis...,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,HuffPost is part of Verizon Media. We and our ...,[5004],"('', '', ""Despite a pandemic, one-sided presid..."


### 2.6 - Limpando features

In [943]:
def drop_columns(data, column_names):
  return data.drop(column_names,axis=1)

In [944]:
train_df_copy = drop_columns(train_df_copy,column_names=['news_link','group_id','article','full_article'])
train_df_copy.head(1)

Unnamed: 0,sentence,outlet_alternet,outlet_breitbart,outlet_federalist,outlet_fox-news,outlet_huffpost,outlet_msnbc,outlet_reuters,outlet_usa-today,topic_abortion,...,type_right,num_sent,Label_bias_Biased,Label_bias_No agreement,Label_bias_Non-biased,Label_opinion_Entirely factual,Label_opinion_Expresses writer’s opinion,Label_opinion_No agreement,Label_opinion_Somewhat factual but also opinionated,biased_words4
129,"[1, 3, 4, 5, 6, 7, 8, 9, 4, 10, 3, 11, 6, 12, ...",0,1,0,0,0,0,0,0,0,...,1,1.0,0,0,1,1,0,0,0,[]


### 2.8 - Compilando pipeline de pré-processamento

In [945]:
class VocabularyConversionTransformer(BaseEstimator,TransformerMixin):
  def __init__(self):
    self.vocab_table = None

  def fit(self, X, y=None):
    self.vocab_table = create_vocabulary(X)
    return self
  
  def transform(self,X,y=None):
    return convert_words_to_freq(X,vocab_table=self.vocab_table)

In [946]:
def preprocess(data,stemmer_flag=False,stemmer_type=None):
  
  one_hot_columns = ['topic','outlet','type','Label_bias','Label_opinion']
  drop_columns = ['news_link','group_id','article','full_article']

  preprocess_pipeline = ([
      ('Tokenização (sentenças)', FunctionTransformer(preprocess_basic_text,kw_args={'col':'sentence'})),
      #('Tokenização (biased_words)', FunctionTransformer(preprocess_basic_text,kw_args={'col':'biased_words4'})),
      #('Stemming', FunctionTransformer(apply_stemmer_for_all, kw_args={'cols':['sentence','biased_words4'],
      #                                                                'stemmer_flag': stemmer_flag, 
      #                                                                'stemmer_type': stemmer_type
      #                                                                })),
      #('Adding SOS and Padding', add_padding_and_sos),
      #('Converting words to its integer rep', VocabularyConversionTransformer()),
      #('One-hot-encoding', FunctionTransformer(cats_to_one_hot,kw_args={'column_names':one_hot_columns})),
      #('Dropping features', FunctionTransformer(cats_to_one_hot,kw_args={'column_names':drop_columns})),
  ])

  pipeline = Pipeline(preprocess_pipeline)

  return pipeline.transform(data)

In [947]:
data = preprocess(df.copy())

AttributeError: ignored