<a href="https://colab.research.google.com/github/Talendar/br_fake_news_detection/blob/main/br_fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Detection

In this notebook, we'll use *deep learning* to classify texts written in Brazilian Portuguese as true or fake. The *corpus* used was created by NILC researches and is available [here](https://github.com/roneysco/Fake.br-Corpus). Let's start by downloading the data directly from GitHub.

In [1]:
!git clone https://github.com/roneysco/Fake.br-Corpus
DATA_PATH = "./Fake.br-Corpus/size_normalized_texts"

fatal: destination path 'Fake.br-Corpus' already exists and is not an empty directory.


Dealing with the project's dependencies:

In [None]:
import warnings
warnings.filterwarnings(action='once')

import numpy as np
import pandas as pd
import os
import re
import zipfile

%tensorflow_version 2.x
import tensorflow as tf
from sklearn.utils import shuffle

from tensorflow.keras.callbacks import Callback
from IPython.display import clear_output
from gensim.models import KeyedVectors

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

# this class will be used later on
class ClearCallback(Callback):
    """ Handles the cleaning of the log during the training of a model. """

    def __init__(self, current_k, total_k):
        self._current_k = current_k
        self._total_k = total_k

    def on_epoch_end(self, epoch, logs=None):
        """ Clears the log. Called when a training epoch ends. """
        clear_output(wait=True)
        print("Running %d-folds cross-validation. Current fold: %d.\n" % (self._total_k, self._current_k))

TO_DO: Load and explore data

In [4]:
def load_txts(path):
    txts = []
    for filename in sorted(os.listdir(path), key=lambda x: int(re.match("[0-9]+", x).group())):
        with open(os.path.join(path, filename)) as f:
            txts.append(f.read())
    return txts


true_txts = load_txts(os.path.join(DATA_PATH, "true"))
fake_txts = load_txts(os.path.join(DATA_PATH, "fake"))
assert(len(true_txts) == len(fake_txts))

data = pd.DataFrame( [{"text": t, "label": 0} for t in true_txts] + [{"text": f, "label": 1} for f in fake_txts] ).sample(frac=1)
%xdel true_txts
%xdel fake_txts

pd.set_option('max_colwidth', 200)
data

Unnamed: 0,text,label
1097,"Temer faz viagem oficial para Rússia e Noruega; Maia assume Presidência. Presidente vai participar de reuniões com autoridades da Rússia e da Noruega. Em entrevista, Joesley Batista afirmou que Te...",0
3098,"STF recebe primeira ação popular contra nomeação de Lula à Casa Civil. Residente de Jaguariúna, em São Paulo, grupo alega que Dilma cometeu abuso de poder ao nomear o ex-presidente para ministro-c...",0
4959,"Lula, Dilma e FHC aparecem na lista de Janot e serão investigados. Detalhe: eles não têm foro!. FHC, Lula e Dilma foram citados na lista do PGR, Rodrigo Janot. Os casos serão investigados por in...",1
2813,"Quarta-feira, 11 de outubro de 2017. Boa noite! Aqui estão as principais notícias para você terminar o dia bem-informado. Em uma sessão bastante tumultuada e marcada por bastante discordância sobr...",0
6782,"Em 1 ano, número de desempregados aumenta 41, 5%. Na comparação com o mesmo período de 2014, o aumento foi de 41, 5%, com 2, 7 milhões de desempregados a mais, segundo o IBGE (Instituto Brasi...",1
...,...,...
454,"Quem foi Giordano Bruno, o místico visionário queimado na fogueira há 418 anos. De temperamento rebelde e contestador, ele não seria considerado um cientista nos moldes atuais - mas defendeu teori...",0
2322,"Formalmente longe do Senado, desde 1º de fevereiro de 2015, após não ser reeleito para outro mandato na Casa em 2014, Gim Argello (PTB-DF) acompanhava dos bastidores as movimentações do Congresso....",0
6997,"O ANTAGONISTA: Dilma usa agência estatal (sustentada com dinheiro público) para espionar o vice Michel Temer. . A mando do ministro da Secretaria de Governo, o governo federal está espionando, ...",1
3806,"Líder do PT, Gleise Homann promete ""detonar"" Sergio Moro. E como sempre o PT nos surpreendendo mais uma vez, a bancada escolheu nesta\nquarta-feira (8), Gleisi Hoffmann para assumir o posto de líd...",1


# BAG-OF-WORDS

In [None]:
import string
from sklearn.feature_extraction.text import CountVectorizer

STOPWORDS = nltk.corpus.stopwords.words('portuguese')
STEMMER = nltk.stem.RSLPStemmer()

In [132]:
def normalize_texts(corpus, stem):
    processed_texts = []
    counter = 0
    for i, row in corpus.iterrows():
        clear_output(wait=True)
        print("[%.2f%%] Processing text %d of %d." % (100*(counter+1)/len(corpus), counter+1, len(corpus)))
        counter += 1
        
        text = " ".join( [   
                (w if not stem else STEMMER.stem(w)) 
                    for w in nltk.tokenize.word_tokenize(row["text"]) if w not in STOPWORDS and w not in string.punctuation
        ] )
        processed_texts.append({"text": text, "label": row["label"]})
    return pd.DataFrame(processed_texts)

norm_data = normalize_texts(data, stem=True)
norm_data

[99.99%] Processing text 7200 of 7200.


Unnamed: 0,text,label
0,após denúnc vej irmã aéci publ víde chor `` vam prov ment '' após ser denunci matér revist vej nest fim seman sen aéci nev mostr indign acus receb de acord revist benedict juni execu odebrecht afi...,1
1,quatr pesso fic fer acid carr caminh br-376 motor automó sofr fer grav resgat helicópter samu nest terça-f 23 em sarand carr peg fog mandaguaçu ônibu escol bat contr mot quatr pesso fic fer acid c...,0
2,mor aceit denúnc contr lul outr 12 cas envolv síti atiba segund força-taref lav jat empreit odebrecht oa compr pag melh síti form propin ex-presid o juiz sérgi mor aceit nest terça-f 1º denúnc con...,0
3,reun urgent for sp dilm cheg capit paul encontr lul a presid futur ex-presid dilm rousseff ness tard dest são paul encontr lul dilm vai hosped hotel renaissanc ond far encontr `` secret '' ... tão...,1
4,no prim dia 2017 antig próx espírit sant orelh michel miguel eli tem lul deu-lh conselh chef ent parec promis com alç massacr result 56 mort complex penitenci anísi jobim manau am dev faz part pre...,0
...,...,...
7195,"um corr seguranç arm divid gabinet juiz sérgi mor sal audi segund and edifíci justiç feder curitib uma câm acopl comput registr prim depo açã penal luiz ináci lul silv réu acus receb r 3,7 milhã o...",0
7196,os rela propost pod alter sistem eleitor brasil próx ele admit óbvi ont dur “ fórum estad – reform polít debat ” realiz parc centr lideranç públic discuss send feit congress pan fund inter parlame...,0
7197,evandr mesquit protest cham sen verm `` quer cal sérgi mor '' ao ver problem polít econôm brasil mesquit mostr artist poli mantém anten lig real o at can declar ser contr trup sen petist formaliz ...,1
7198,investig mostr real caus alt preç pedági paran diz procur lav jat 48ª fas deflagr nest quinta-f 22 seil pesso pres o procur minist públic feder mpf carl fern sant lim afirm manhã dest quinta-f 22 ...,0


In [179]:
# k-fold cross-validation
k = 10
folds = np.split(norm_data.sample(frac=1), k)

accuracies = []
for i in range(len(folds)):
    # separating data
    test_data, test_labels = folds[i]["text"].values, folds[i]["label"].values
    training_data = np.concatenate( [folds[j]["text"].values for j in range(len(folds)) if j != i] )
    training_labels = np.concatenate( [folds[j]["label"].values for j in range(len(folds)) if j != i] )

    # extracting features
    vectorizer = CountVectorizer(max_features=1000)
    training_data = vectorizer.fit_transform(training_data).toarray()  # fit the vectorizer to the training corpus
    test_data = vectorizer.transform(test_data).toarray()  # words of the test corpus that don't appear in the training corpus will be ignored!

    # preparing model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(1e-3)),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])
    model.compile(loss="binary_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(1e-3),
                  metrics=["accuracy"])
    
    # training
    model.fit(training_data, training_labels, epochs=50, 
              validation_data=(test_data, test_labels), 
              callbacks=[ClearCallback(i + 1, k)])
    
    # evaluating
    loss, acc = model.evaluate(test_data, test_labels)
    accuracies.append(acc)

clear_output(wait=True)
print("\n Cross-validation finished! Results:")
print(" . Mean accuracy: %.2f%%" % (100*np.mean(accuracies)))
print(" . Accuracies std: %.2f%%" % (100*np.std(accuracies)))

Running 10-folds cross-validation. Current fold: 10.


 Cross-validation finished! Results:
 . Mean accuracy: 88.65
 . Accuracies std: 1.61


# WORD EMBEDDINGS #

In [5]:
WVECTORS_LEN = 100  # dimenson of the word embeddings
MAX_TEXT_TOKENS = 200

OPTION 1: download vectors

In [None]:
# downloading vectors
if ("glove_s%d.zip" % WVECTORS_LEN) not in os.listdir():
    !wget -O {"glove_s%d.zip" % WVECTORS_LEN} {"http://143.107.183.175:22980/download.php?file=embeddings/glove/glove_s%d.zip" % WVECTORS_LEN}

if ("glove_s%d.txt" % WVECTORS_LEN) not in os.listdir():
    with zipfile.ZipFile("glove_s%d.zip" % WVECTORS_LEN, 'r') as zip_ref:
        zip_ref.extractall()

wv_pathname = "glove_s%d.txt" % WVECTORS_LEN

OPTION 2: load vectors from drive

In [6]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

wv_pathname = "/content/gdrive/My Drive/Colab Notebooks//ml_data/glove_s%d.txt" % WVECTORS_LEN

Mounted at /content/gdrive


Loading glove model (this might take a while)

In [7]:
word_vectors = KeyedVectors.load_word2vec_format(wv_pathname)

Auxiliary functions:

In [8]:
def vec_to_word(wv):
    """
    Returns the closest word (string) to the given word vector. This is an 
    expensive operation.
    """
    return word_vectors.most_similar(positive=[wv], topn=1)[0][0]


def vecs_to_txt(wv_list):
    """
    Receives a list of word vectors and returns a list of words corresponding to 
    each vector (one word per vector). This is an expensive operation.
    """
    txt = []
    for v in wv_list:
        txt.append(vec_to_word(v))
    return txt


def txt_to_vecs(txt):
    """
    Receives a list of tokens (words) and returns a numpy array with word vectors 
    corresponding to those tokens. If some word isn't found in the vocabulary, 
    it will be ignored.
    """
    vecs, ignored = [], []
    for word in txt:
        try:
            v = word_vectors[word]
            vecs.append(v)
        except KeyError:
            ignored.append(word)
    return np.array(vecs), set(ignored)


def pad(txts, mask_value):
    """
    Pad sequences shorter than the max length seuquence using the given mask value.
    """
    # find max len
    max = 0
    for t in txts:
        max = len(t) if len(t) > max else max

    # pad
    for i, t in enumerate(txts):
        if len(t) < max:
            z = np.full(shape=(max - len(t) , WVECTORS_LEN), fill_value=mask_value)
            txts[i] = np.concatenate((t, z))
    
    return np.array(txts)


def build_wv_data(corpus, mask_value):
    features, labels, ignored_tokens = [], [], []
    count = 0

    for i, row in corpus.iterrows():
        print("[%.1f%%] Processing text %d of %d." % ( 100 * (count)/len(corpus), count+1, len(corpus) ))
        count += 1

        tokens = nltk.tokenize.word_tokenize(row["text"].lower())
        if len(tokens) > MAX_TEXT_TOKENS:
            tokens = tokens[:MAX_TEXT_TOKENS]

        vecs, ign = txt_to_vecs( tokens )
        features.append(vecs)
        labels.append(row["label"])

        ignored_tokens += ign
        clear_output(wait=True)

    print("Padding texts...")
    return pad(features, mask_value), np.array(labels), \
           set(ignored_tokens)

In [9]:
MASK_VALUE = -0.123  # value to be used for the masking procedure (ignore padding)

wv_data, wv_labels, ignored_tokens = build_wv_data(corpus=data, mask_value=MASK_VALUE)
print("All texts processed! \nIgnored tokens (unique): %d\n" % len(ignored_tokens))
print(ignored_tokens)

# freeing memory
%xdel data
%xdel word_vectors

Padding texts...
All texts processed! 
Ignored tokens (unique): 5083

{'2015..', '5h', '23,6', 'flynn..', 'empreiteira..', '12,4', '9h30', 'gordofóbico', '499', "'erotização", '21/6', 'instituição..', 'almeida..', "'quadrilhão", 'larissa*', 'lamounir', '46,7', '21/03/16', '16,6', 'sáb.', '9ª', '2012/2013', "'intolerantes", 'baeta-', '256', '24/11', 'serveli', 'marcarenhas', '242', '12h03', '–o', 'buzfeed', '9h21', 'moro..', 'kevinho', '6h15', 'anticorrupção..', '1000', '587', 'partido-to', '163', '802', '440', 'fauso', '51.041.155', "'irmãos", '54', '2003/2010', "''o", 'f-15', 'hospital..', 'veiazinha', 'ereições', '166', 'digitaldub', 'prostar', 'adicionais..', 'f-35', 'oirunda', 'delegada..', '10°', '220', 'cravas-me', '761', '6ª', 'justiç', 'klintsevich', 'empresários..', '19.946', '28ª', 'andú', 'storyful', '971', '11/10', 'cag**', '233', '8,732', 'iniciou.', 'atibaia.', 'nome..', 'adequado..', 'bem-informado..', '1,59', "'bandido", 'trf4', '409/2016', 'www.livemocha.com', 'expuser

In [10]:
# k-fold cross-validation
k = 10

folds, folds_labels = shuffle(wv_data, wv_labels)
folds = np.array_split(folds, k)
folds_labels = np.array_split(folds_labels, k)

accuracies = []
for i in range(len(folds)):
    # separating data
    test_data, test_labels = folds[i], folds_labels[i]
    training_data = np.concatenate( [folds[j] for j in range(len(folds)) if j != i] )
    training_labels = np.concatenate( [folds_labels[j] for j in range(len(folds)) if j != i] )

    # preparing model
    model = tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=MASK_VALUE),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)), #, kernel_regularizer=tf.keras.regularizers.l2(3))),
        #tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),  #, kernel_regularizer=tf.keras.regularizers.l2(3))),
        #tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(loss="binary_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(1e-3),
                  metrics=["accuracy"])
    
    # training
    model.fit(training_data, training_labels, epochs=10, 
              validation_data=(test_data, test_labels), 
              callbacks=[ClearCallback(i + 1, k)])
    
    # evaluating
    loss, acc = model.evaluate(test_data, test_labels)
    accuracies.append(acc)

clear_output(wait=True)
print("\n Cross-validation finished! Results:")
print(" . Mean accuracy: %.2f%%" % (100*np.mean(accuracies)))
print(" . Accuracies std: %.2f%%" % (100*np.std(accuracies)))


 Cross-validation finished! Results:
 . Mean accuracy: 93.56%
 . Accuracies std: 0.85%
