# Instalação das bibliotecas necessárias

In [1]:
pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1


In [2]:
import gdown

#Realizando download dos dados do Google Drive
!gdown --id '1u0kq8pWWgUO30XCxhWKsaLtBVwCUA1Ef' #id do link de compartilhamento (Notícias)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1u0kq8pWWgUO30XCxhWKsaLtBVwCUA1Ef
From (redirected): https://drive.google.com/uc?id=1u0kq8pWWgUO30XCxhWKsaLtBVwCUA1Ef&confirm=t&uuid=1cfdaed5-6928-4308-8e24-040008a5c68e
To: /content/fake-news.zip
100% 48.7M/48.7M [00:01<00:00, 36.3MB/s]


In [3]:
#Descompactando dataset para a raiz do notebook
!unzip /content/fake-news.zip

Archive:  /content/fake-news.zip
  inflating: submit.csv              
  inflating: test.csv                
  inflating: train.csv               


# Leitura da base de dados

In [2]:
import pandas as pd

Descrição da base de dados

1.   submit.csv -> armazena os 'rótulos' de "Verdadeiro ou Falso"
2.   train.csv -> contém as informações para o treinamento do algoritmo.
3.   test.csv -> contém as informações para o teste do algoritmo.



In [5]:
# Pick one input news item (the first row of train.csv) to be checked with the already-trained model.
# The CSV is parsed once and the row reused, instead of re-reading the whole file for each field.
primeira_noticia = pd.read_csv("train.csv").iloc[0]
curr_news_text = primeira_noticia["text"]
curr_news_label = primeira_noticia["label"]
curr_news_text, curr_news_label

('House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) \nWith apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. \nAs we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing ema

# Pré-processamento de texto

## Definindo Lista de palavras a serem removidas

In [7]:
#Carregamento de uma lista de StopWords
!gdown --id '1vETI4GzaYU0UbTV7VwV1WBk4ZH3AwEQV' #id do link de compartilhamento (Lista de Palavras a serem removidas)

Downloading...
From: https://drive.google.com/uc?id=1vETI4GzaYU0UbTV7VwV1WBk4ZH3AwEQV
To: /content/stop-word-list.txt
100% 4.86k/4.86k [00:00<00:00, 7.70MB/s]


In [6]:
# Read the stop-word file without a header row, so its single column is labelled 0.
# NOTE(review): read_csv applies its default NA parsing here; entries such as "null" or "nan"
# (if the list contains any) would be loaded as NaN instead of text — confirm against the file.
stop_words = pd.read_csv("stop-word-list.txt", header=None)
# Preview: the first five rows together with the total number of rows
stop_words.iloc[:5], stop_words.shape[0]

(       0
 0      a
 1   able
 2  about
 3  above
 4   abst,
 670)

In [7]:
# Flatten the single column (label 0) of the stop-word table into a plain Python list
remove_words = stop_words[0].tolist()

In [8]:
import json

# Load the vocabulary of ALL the news items: a dict mapping each word to an integer
# (presumably its frequency in the corpus — confirm against the code that wrote the file).
# The encoding is explicit because the vocabulary holds non-ASCII tokens (e.g. '–');
# relying on the platform default encoding could mis-decode the file on some systems.
with open("vocabulario.json", "r", encoding="utf-8") as json_file:
    vocabulario = json.load(json_file)

vocabulario

{'videos': 353,
 'civilians': 632,
 'killed': 1460,
 'single': 850,
 'airstrike': 96,
 'identified': 414,
 'rate': 912,
 'american': 4999,
 'airstrikes': 209,
 'afghanistan': 436,
 'higher': 943,
 'engaged': 210,
 'active': 446,
 'combat': 342,
 'operations': 477,
 'photo': 351,
 'hellfire': 40,
 'missiles': 155,
 'loaded': 104,
 'military': 2300,
 'reaper': 9,
 'drone': 163,
 'staff': 946,
 'sgt': 62,
 'brian': 155,
 'fergusonus': 1,
 'air': 1132,
 'force': 1222,
 'bureau': 263,
 'identify': 300,
 'strike': 365,
 'month': 1626,
 '–': 3214,
 'biggest': 648,
 'loss': 693,
 'civilian': 303,
 'life': 3431,
 'attack': 1729,
 'medecins': 1,
 'sans': 10,
 'frontieres': 1,
 'hospital': 1862,
 'msf': 35,
 'october': 1018,
 'claimed': 508,
 'conducted': 335,
 'counterterrorism': 56,
 'islamic': 826,
 'fighters': 350,
 'hit': 873,
 'nangarhar': 17,
 'province': 237,
 'september': 543,
 'day': 3853,
 'united': 4639,
 'nations': 1022,
 'issued': 513,
 'unusually': 63,
 'rapid': 121,
 'strong': 790

## Processando texto

In [9]:
import re
import string
dataset = [] # Stores the processed news items (one token list per item)
target = [] # Stores the class of each news item (1: Fake, 0: Not-Fake)

# Set of stop words: constant-time membership tests instead of scanning the whole list per token
palavras_removidas = set(remove_words)

# Normalisation: convert the whole text to lower case
texto = curr_news_text.lower()

# Remove punctuation and special characters
texto = texto.translate(str.maketrans('', '', string.punctuation))
# The hyphen is escaped so it is a literal character. Unescaped, `'-?` formed a character
# range (from ' through ?) that silently matched digits and other symbols as well. The final
# tokens are unchanged: ASCII punctuation is already stripped above and digits are stripped below.
texto = re.sub(r'[.,"\'\-?:!;“”]', '', texto)

# Remove digits
texto = re.sub('[0-9]', '', texto)

# Tokenisation: split the text into whitespace-separated words
noticia_atual = texto.split()

noticia_processada = []
for palavra_atual in noticia_atual:
  # Skip stop words: words that carry little information
  if palavra_atual not in palavras_removidas:
    noticia_processada.append(palavra_atual)

# Store the processed news item together with its label
dataset.append(noticia_processada)
target.append(curr_news_label)

In [11]:
# Print the token lists of the processed news items followed by their stored labels
print(dataset, target)

[['house', 'dem', 'aide', 'didn’t', 'comey’s', 'letter', 'jason', 'chaffetz', 'tweeted', 'darrell', 'lucus', 'october', 'subscribe', 'jason', 'chaffetz', 'stump', 'american', 'fork', 'utah', 'image', 'courtesy', 'michael', 'jolley', 'creative', 'commonsby', 'license', 'apologies', 'keith', 'olbermann', 'doubt', 'worst', 'person', 'week–fbi', 'director', 'james', 'comey', 'house', 'democratic', 'aide', 'secondworst', 'person', 'well', 'turns', 'comey', 'nowinfamous', 'letter', 'announcing', 'fbi', 'emails', 'hillary', 'clinton’s', 'email', 'server', 'ranking', 'democrats', 'relevant', 'committees', 'didn’t', 'hear', 'comey', 'tweet', 'republican', 'committee', 'chairmen', 'comey', 'notified', 'republican', 'chairmen', 'democratic', 'ranking', 'members', 'house', 'intelligence', 'judiciary', 'oversight', 'committees', 'agency', 'reviewing', 'emails', 'discovered', 'order', 'contained', 'classified', 'information', 'long', 'letter', 'oversight', 'committee', 'chairman', 'jason', 'chaffetz

In [12]:
# Expected result (label) for each news item
import numpy as np
np.array(target)

array([1], dtype=int64)

# Extrair as informações úteis (Características)

In [15]:
# All-zero template: one counter per vocabulary word, in the vocabulary's key order
bagOfWords_aux = dict.fromkeys(vocabulario, 0)

features = []
for noticia in dataset:
  # Fresh counter dictionary for this news item. The template is already zeroed,
  # so copying it is enough; no second pass over the whole vocabulary is needed.
  bagOfWords = bagOfWords_aux.copy()

  # Word counting (Bag of Words); words that are not in the vocabulary are ignored
  for palavra in noticia:
    if palavra in bagOfWords:
      bagOfWords[palavra] += 1

  # Store the count dictionary of the current news item
  features.append(bagOfWords)

In [16]:
# Display the word-count dictionaries built for the processed news items
features

[{'videos': 0,
  'civilians': 0,
  'killed': 0,
  'single': 0,
  'airstrike': 0,
  'identified': 0,
  'rate': 0,
  'american': 1,
  'airstrikes': 0,
  'afghanistan': 0,
  'higher': 0,
  'engaged': 0,
  'active': 0,
  'combat': 0,
  'operations': 0,
  'photo': 0,
  'hellfire': 0,
  'missiles': 0,
  'loaded': 0,
  'military': 0,
  'reaper': 0,
  'drone': 0,
  'staff': 0,
  'sgt': 0,
  'brian': 0,
  'fergusonus': 0,
  'air': 0,
  'force': 0,
  'bureau': 0,
  'identify': 0,
  'strike': 0,
  'month': 0,
  '–': 0,
  'biggest': 0,
  'loss': 0,
  'civilian': 0,
  'life': 0,
  'attack': 0,
  'medecins': 0,
  'sans': 0,
  'frontieres': 0,
  'hospital': 0,
  'msf': 0,
  'october': 2,
  'claimed': 0,
  'conducted': 0,
  'counterterrorism': 0,
  'islamic': 0,
  'fighters': 0,
  'hit': 0,
  'nangarhar': 0,
  'province': 0,
  'september': 0,
  'day': 0,
  'united': 0,
  'nations': 0,
  'issued': 0,
  'unusually': 0,
  'rapid': 0,
  'strong': 0,
  'statement': 0,
  'injured': 0,
  'gathered': 0,
  'ho

## Usando TFIDF para reduzir a dimensionalidade dos dados

In [17]:
# Display only the count values of the first news item's dictionary
features[0].values()

dict_values([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 

# Treinar o modelo de IA [(Referência)](https://scikit-learn.org/stable/supervised_learning.html#supervised-learning)

In [18]:
# Turn each news item's count dictionary into a NumPy vector holding its values
X = [np.array([*contagens.values()]) for contagens in features]
X

[array([0, 0, 0, ..., 0, 0, 0])]

In [19]:
# Labels of the processed news items as a NumPy array
y = np.array(target)
y

array([1], dtype=int64)

In [21]:
import pickle

# Load the trained model from disk.
# NOTE: unpickling can execute arbitrary code — only load model files that come from a trusted source.
filename = "trained_model_MLP.sav"
# The context manager closes the file handle as soon as the model has been read
with open(filename, 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [25]:
resposta = loaded_model.predict(X)
gabarito = y

# Only one news item is classified here, so its prediction and label are read by index.
# Testing the truth value of the whole array (`if resposta == 1`) raises ValueError as soon
# as the array has more than one element.
# Assumes predict() returns one entry per row of X (scikit-learn convention) — confirm for this model.
predicao = resposta[0]
if predicao == 1:
  print("É FAKE!")
else:
  print("OK")

# Compare the prediction with the expected label of the same news item
if predicao == gabarito[0]:
  print("ACERTOU")
else:
  print("ERROU")

É FAKE!
ACERTOU
