#Instalação das bibliotecas necessárias

In [1]:
pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1


In [2]:
import gdown

#Realizando download dos dados do Google Drive
!gdown --id '1u0kq8pWWgUO30XCxhWKsaLtBVwCUA1Ef' #id do link de compartilhamento (Notícias)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1u0kq8pWWgUO30XCxhWKsaLtBVwCUA1Ef
From (redirected): https://drive.google.com/uc?id=1u0kq8pWWgUO30XCxhWKsaLtBVwCUA1Ef&confirm=t&uuid=1cfdaed5-6928-4308-8e24-040008a5c68e
To: /content/fake-news.zip
100% 48.7M/48.7M [00:01<00:00, 36.3MB/s]


In [3]:
#Descompactando dataset para a raiz do notebook
!unzip /content/fake-news.zip

Archive:  /content/fake-news.zip
  inflating: submit.csv              
  inflating: test.csv                
  inflating: train.csv               


#Leitura da base de dados

In [1]:
import pandas as pd

Descrição da base de dados

1.   submit.csv -> armazena os 'rótulos' de "Verdadeiro ou Falso"
2.   train.csv -> contém as informações para o treinamento do algoritmo.
3.   test.csv -> contém as informações para o teste do algoritmo.



In [2]:
# Load the training split and show its first five rows.
data = pd.read_csv("train.csv")
data.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
# Load the test split and attach its labels, which are stored separately in submit.csv.
# The assignment aligns both frames on their row index.
test = pd.read_csv("test.csv")
rotulos_teste = pd.read_csv("submit.csv")
test["label"] = rotulos_teste["label"]
test.head(5)

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1


In [4]:
# Stack train and test into a single frame with a fresh 0..n-1 index,
# then show the rows around the point where one ends and the other begins.
all_data = pd.concat([data, test], ignore_index=True)
all_data.iloc[20798:20803]

Unnamed: 0,id,title,author,text,label
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1
20799,20799,What Keeps the F-35 Alive,David Swanson,"David Swanson is an author, activist, journa...",1
20800,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
20801,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
20802,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0


In [5]:
#Temos 26000 notícias
all_data.shape

(26000, 5)

#Pré-processamento de texto

##Definindo Lista de palavras a serem removidas

In [7]:
#Carregamento de uma lista de StopWords
!gdown --id '1vETI4GzaYU0UbTV7VwV1WBk4ZH3AwEQV' #id do link de compartilhamento (Lista de Palavras a serem removidas)

Downloading...
From: https://drive.google.com/uc?id=1vETI4GzaYU0UbTV7VwV1WBk4ZH3AwEQV
To: /content/stop-word-list.txt
100% 4.86k/4.86k [00:00<00:00, 7.70MB/s]


In [6]:
# Read the stop-word list: one word per row, in column 0.
# keep_default_na=False stops pandas from converting words that look like missing-value
# markers (e.g. "null", "nan") into NaN, which would keep them from ever matching a token.
stop_words = pd.read_csv("stop-word-list.txt", header=None, keep_default_na=False)
stop_words[0:5], len(stop_words)

(       0
 0      a
 1   able
 2  about
 3  above
 4   abst,
 670)

In [7]:
# Flatten the single-column stop-word frame into a plain Python list.
remove_words = stop_words[0].tolist()

In [8]:
def verifica_area_da_saude(noticia):
  """Tell whether a news item mentions any health-related keyword.

  Each keyword is checked with a plain ``in`` test, so the meaning follows the
  argument's type: element equality for a list of tokens, substring search for
  a string.

  Args:
    noticia: Container to search (e.g. a list of tokens or a text string).

  Returns:
    True if at least one keyword is found; False otherwise, in which case the
    caller is expected to discard the item.
  """
  palavras_chave = ("health", "covid", "vaccine", "hospital", "medical", "insurance")
  return any(termo in noticia for termo in palavras_chave)

##Processando texto

In [9]:
import re
import string

dataset = []      # Processed news items (one list of tokens per kept item)
vocabulario = {}  # Corpus-wide vocabulary: token -> total number of occurrences
target = []       # Class of each kept item (1: Fake, 0: Not-Fake)

# Set membership is a hash lookup; testing against the list scans every stop word
# for every token.
stopwords_set = set(remove_words)

# Build the punctuation table and the patterns once instead of once per item.
tabela_pontuacao = str.maketrans('', '', string.punctuation)
# Removes leftover special characters such as typographic double quotes. The hyphen is
# escaped so it is a literal; unescaped between ' and ? it would define a character range.
padrao_especiais = re.compile(r'[.,"\'\-?:!;“”]')
padrao_numeros = re.compile('[0-9]')

# Pull the two columns out once; indexing all_data.iloc[row] builds a new row Series
# on every access.
n_noticias = len(all_data)
textos = all_data["text"].to_numpy()
rotulos = all_data["label"].to_numpy()

# Walk over ALL news items.
for noticia in range(n_noticias):
  texto_bruto = textos[noticia]
  rotulo = rotulos[noticia]

  # Skip rows with a missing text or label. pd.isna also catches NaN stored as a
  # NumPy float, which an exact `type(...) != float` comparison lets through.
  if not isinstance(texto_bruto, str) or pd.isna(rotulo):
    continue

  # Normalise: lower-case, then strip punctuation, special characters and digits.
  texto = texto_bruto.lower()
  texto = texto.translate(tabela_pontuacao)
  texto = padrao_especiais.sub('', texto)
  texto = padrao_numeros.sub('', texto)

  # Tokenise: split the text on whitespace.
  noticia_atual = texto.split()

  # Keep only items whose tokens include a health-related keyword.
  if verifica_area_da_saude(noticia_atual):
    noticia_processada = []
    for palavra_atual in noticia_atual:
      # Drop stop words: tokens that carry little information.
      if palavra_atual in stopwords_set:
        continue
      noticia_processada.append(palavra_atual)

      # Update the corpus-wide count; key order is the order of first appearance.
      vocabulario[palavra_atual] = vocabulario.get(palavra_atual, 0) + 1

    # Store the processed item together with its label.
    dataset.append(noticia_processada)
    target.append(rotulo)

  if noticia % 1000 == 0:
    print(noticia, "noticias processadas...")

0 noticias processadas...
1000 noticias processadas...
2000 noticias processadas...
3000 noticias processadas...
4000 noticias processadas...
5000 noticias processadas...
6000 noticias processadas...
7000 noticias processadas...
8000 noticias processadas...
9000 noticias processadas...
10000 noticias processadas...
11000 noticias processadas...
12000 noticias processadas...
13000 noticias processadas...
14000 noticias processadas...
15000 noticias processadas...
16000 noticias processadas...
17000 noticias processadas...
18000 noticias processadas...
19000 noticias processadas...
20000 noticias processadas...
21000 noticias processadas...
22000 noticias processadas...
23000 noticias processadas...
24000 noticias processadas...
25000 noticias processadas...


In [10]:
#Quantidade de notícias
len(dataset)

4481

In [11]:
#Resultado esperado para cada notícia
import numpy as np
np.array(target)

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [12]:
#Tamanho do Vocabulário
len(vocabulario)

95666

In [13]:
vocabulario

{'videos': 353,
 'civilians': 632,
 'killed': 1460,
 'single': 850,
 'airstrike': 96,
 'identified': 414,
 'rate': 912,
 'american': 4999,
 'airstrikes': 209,
 'afghanistan': 436,
 'higher': 943,
 'engaged': 210,
 'active': 446,
 'combat': 342,
 'operations': 477,
 'photo': 351,
 'hellfire': 40,
 'missiles': 155,
 'loaded': 104,
 'military': 2300,
 'reaper': 9,
 'drone': 163,
 'staff': 946,
 'sgt': 62,
 'brian': 155,
 'fergusonus': 1,
 'air': 1132,
 'force': 1222,
 'bureau': 263,
 'identify': 300,
 'strike': 365,
 'month': 1626,
 '–': 3214,
 'biggest': 648,
 'loss': 693,
 'civilian': 303,
 'life': 3431,
 'attack': 1729,
 'medecins': 1,
 'sans': 10,
 'frontieres': 1,
 'hospital': 1862,
 'msf': 35,
 'october': 1018,
 'claimed': 508,
 'conducted': 335,
 'counterterrorism': 56,
 'islamic': 826,
 'fighters': 350,
 'hit': 873,
 'nangarhar': 17,
 'province': 237,
 'september': 543,
 'day': 3853,
 'united': 4639,
 'nations': 1022,
 'issued': 513,
 'unusually': 63,
 'rapid': 121,
 'strong': 790

In [47]:
import json

# Persist the vocabulary counts as pretty-printed JSON (4-space indent).
conteudo_json = json.dumps(vocabulario, indent=4)
with open("vocabulario.json", "w") as arquivo_vocabulario:
    arquivo_vocabulario.write(conteudo_json)

#Extrair as informações úteis (Características)

In [None]:
""" Exemplo de como os dados devem estar organizados
1 ola
1 meninas
1 como
1 estao

1 ola
1 meninos
1 como
1 estao

#VOCABULÁRIO
VOCAB ola meninas meninos como estao
NOT_1  1     1       0      1    1
NOT_2  1     0       1      1    1
"""
#

In [14]:
# Zero-count template holding every vocabulary word, in vocabulary order.
bagOfWords_aux = dict.fromkeys(vocabulario, 0)

features = []
for idx, noticia in enumerate(dataset):
  # Start this item's counts from the template. The copy is already all zeros, so
  # there is no need to walk the whole vocabulary again to reset it.
  bagOfWords = bagOfWords_aux.copy()

  # Count the words of this item (bag of words).
  for palavra in noticia:
    bagOfWords[palavra] += 1

  # Store the count vector of the current item.
  features.append(bagOfWords)

  if idx % 500 == 0:
    print(idx, "noticias processadas...")

0 noticias processadas...
500 noticias processadas...
1000 noticias processadas...
1500 noticias processadas...
2000 noticias processadas...
2500 noticias processadas...
3000 noticias processadas...
3500 noticias processadas...
4000 noticias processadas...


##Usando TFIDF para reduzir a dimensionalidade dos dados

In [15]:
features[1].values()

dict_values([0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 7, 1, 1, 1, 3, 2, 3, 1, 3, 16, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 6, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 5, 1, 2, 5, 1, 1, 2, 3, 1, 1, 1, 11, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3, 5, 5, 2, 1, 1, 1, 1, 3, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 3, 8, 4, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 3, 2, 3, 1, 2, 3, 1, 2, 1, 1, 2, 1, 1, 2, 1, 6, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 2, 1, 1, 

#Treinar o modelo de IA [(Referência)](https://scikit-learn.org/stable/supervised_learning.html#supervised-learning)

In [16]:
# One NumPy count vector per news item, taken from that item's word-count dict.
X = []
for contagens in features:
  X.append(np.array(list(contagens.values())))
X[0:5]

[array([ 1,  8, 12, ...,  0,  0,  0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0])]

In [17]:
# Labels as a NumPy array, in the same order as the feature vectors.
y = np.asarray(target)
y[:5]

array([1, 0, 0, 1, 0], dtype=int64)

In [34]:
# Positional (unshuffled) hold-out split: the first 90% of the items train the model and
# the remaining 10% test it. The cut point is derived from the data instead of being
# hard-coded, so it stays correct if the number of kept items changes.
TRAIN_FRACTION = 0.9
n_treino = round(len(X) * TRAIN_FRACTION)

# 90% of the data for training
X_train = X[:n_treino]
y_train = y[:n_treino]
# 10% of the data for testing
X_test = X[n_treino:]
y_test = y[n_treino:]

In [42]:
from sklearn.neural_network import MLPClassifier

# Multilayer perceptron with hidden layers of 8, 4 and 2 units, fitted with the
# 'lbfgs' solver; random_state is fixed so the fit is repeatable.
parametros_mlp = {
  "solver": 'lbfgs',
  "alpha": 1e-5,
  "hidden_layer_sizes": (8, 4, 2),
  "random_state": 42,
}
model = MLPClassifier(**parametros_mlp)
model.fit(X_train, y_train)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(8, 4, 2), random_state=42,
              solver='lbfgs')

In [43]:
# Classify the first test item and report the predicted class.
resposta = model.predict(X_test[0:1])
print("É FAKE!" if resposta == 1 else "OK")

# Compare the prediction with the expected label of that same item.
gabarito = y_test[0:1]
print("ACERTOU" if resposta == gabarito else "ERROU")

É FAKE!
ACERTOU


#Testar esse modelo de IA (Dados de Treinamento)

In [44]:
# Accuracy on the training data itself, expressed as a percentage.
resultado = 100 * model.score(X_train, y_train)
print(f"Acerto: {resultado:.2f} %")

Acerto: 99.28 %


#Próxima reunião

#Exportar o modelo de IA treinado para a Extensão do Navegador

In [45]:
import pickle

# Save the trained model to disk. The context manager closes the file, so the pickle
# is fully written before anything reads it back.
filename = 'trained_model_MLP.sav'
with open(filename, 'wb') as arquivo_modelo:
  pickle.dump(model, arquivo_modelo)

In [46]:
# Load the model back from disk and score it on the held-out test items.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open(filename, 'rb') as arquivo_modelo:
  loaded_model = pickle.load(arquivo_modelo)
result = loaded_model.score(X_test, y_test) * 100
print(f"Acerto: {result:.2f} %")

Acerto: 68.08 %


#Anotações interessantes:


1.   Erros ortográficos (sintáticos e semânticos) podem ser característicos de notícias falsas.

