## Part 1: Setup & Preprocessing

In [47]:
# Standard Libraries
import pandas as pd
import numpy as np
import json

# Data Preprocessing & NLP
import nltk
import re
import string
import gensim
from textblob import Word

import xgboost as xgb
from xgboost import XGBClassifier

# Importações necessárias para BERT
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, BaggingRegressor, GradientBoostingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics.pairwise import linear_kernel

# Performance metrics
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
sns.set()
%matplotlib inline

import argparse

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
df = pd.read_csv('db/FakeReal.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2544 entries, 0 to 2543
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Index   2544 non-null   int64 
 1   Título  2544 non-null   object
 2   Textos  2544 non-null   object
 3   check   2544 non-null   object
dtypes: int64(1), object(3)
memory usage: 79.6+ KB


In [4]:
df['check'].value_counts()

Unnamed: 0_level_0,count
check,Unnamed: 1_level_1
fato,1418
fake,1126


In [5]:
# Associate Category names with numerical index and save it in new column category_id
df['check_id'] = df['check'].factorize()[0]
df.head()

Unnamed: 0,Index,Título,Textos,check,check_id
0,0,é fake que vídeo mostre criminosos deixando de...,justiça determina devolução de armas de grosso...,fake,0
1,1,é fake que mamografia aumenta o risco de cânce...,quanto mais mamografia você fizer mais risco d...,fake,0
2,2,é fake vídeo em que famosos usam camiseta com ...,adicione nosso número de whatsapp após adicion...,fake,0
3,3,é fake foto que mostra alexandre de moraes usa...,o juiz fake alexandre de moraes é um dos benef...,fake,0
4,4,é fake que vinagre de álcool é recomendado par...,eu sou o presidente do comitê da dengue aqui d...,fake,0


In [6]:
# Create a new pandas dataframe "category_id_df", which only has unique Categories, also sorting this list in order of category_id values
category_id_df = df[['check', 'check_id']].drop_duplicates().sort_values('check_id')

In [7]:
# Create a dictionary ( python datastructure - like a lookup table) that
# can easily convert category names into category_ids and vice-versa
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['check_id', 'check']].values)

In [8]:
id_to_category

{0: 'fake', 1: 'fato'}

In [10]:
category_id_df

Unnamed: 0,check,check_id
0,fake,0
1126,fato,1


## Part 2. Cleaning

In [12]:
# Drop duplicate data
df.drop_duplicates(subset=['check', 'Título'], inplace=True)

In [13]:
# Data Cleaning
def clean_text(text):
    # remove everything except alphabets
    text = re.sub("[^a-zA-Z]", " ", text)
    # remove whitespaces
    text = ' '.join(text.split())
    text = text.lower()

    return text

In [14]:
df.dropna(how='any',axis=0,inplace=True)
df.head()

Unnamed: 0,Index,Título,Textos,check,check_id
0,0,é fake que vídeo mostre criminosos deixando de...,justiça determina devolução de armas de grosso...,fake,0
1,1,é fake que mamografia aumenta o risco de cânce...,quanto mais mamografia você fizer mais risco d...,fake,0
2,2,é fake vídeo em que famosos usam camiseta com ...,adicione nosso número de whatsapp após adicion...,fake,0
3,3,é fake foto que mostra alexandre de moraes usa...,o juiz fake alexandre de moraes é um dos benef...,fake,0
4,4,é fake que vinagre de álcool é recomendado par...,eu sou o presidente do comitê da dengue aqui d...,fake,0


In [15]:
df.columns

Index(['Index', 'Título', 'Textos', 'check', 'check_id'], dtype='object')

In [17]:
# creating clean text feature
df['clean_text'] = df['Textos'].apply(clean_text).str.replace('bn bn ', '')

# creating clean text feature
df['clean_text'] = df['Textos'].apply(clean_text).str.replace(' bn ', '')

In [18]:
for i in df['Textos']:
    print(i, type(i))

justiça determina devolução de armas de grosso calibre a membros de organização criminosa que haviam sido apreendidas pela polícia esse é o brasil governado por bandidos dentro de todos os poderes aceitem o brasil acabou nós estamos saindo aqui da delegacia da polícia civil viemos pegar nossos armamentos que haviam sido apreendidos todos foram periciados né tá aqui ó graças a deus estamos com nossos equipamentos novamente graças a deus pessoal saindo aqui agora da delegacia da polícia civil todas as armas passaram por perícia estamos de volta com nossos equipamentos viu muito obrigado a todos aí pessoal já tem quanto tempo foi apreendida mano tem um mês foi dia de dezembro hoje é de fevereiro praticamente dois meses quase três meses dois meses né estamos aqui hoje graças a deus nossos equipamentos foram liberados tá passando por perícia né para poder periciado agora tudo documentado tudo de boa tem males que que vem por bem pô foi até melhor pra gente delegacia de polícia civil aqui el

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2528 entries, 0 to 2543
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Index       2528 non-null   int64 
 1   Título      2528 non-null   object
 2   Textos      2528 non-null   object
 3   check       2528 non-null   object
 4   check_id    2528 non-null   int64 
 5   clean_text  2528 non-null   object
dtypes: int64(2), object(4)
memory usage: 138.2+ KB


## Part 3: Heading to Machine Learning

In [20]:
# Lemmatization process
'''
Words in the third person are changed to first person and verbs in past and future tenses are changed into the present by the
lemmatization process.
'''
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    # tokenization to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []

    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    lem = [lemmatizer.lemmatize(t) for t in filtered_tokens]
    return lem

In [21]:
nltk.download('stopwords')
stop_words_pt = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Defining a Count Vectorizer object
count_vec = CountVectorizer(stop_words=stop_words_pt, max_features=10000)
# Defining a TF-IDF Vectorizer
tfidf_vec = TfidfVectorizer(stop_words=stop_words_pt, ngram_range=(1, 2), tokenizer=tokenize_and_lemmatize, max_features=10000, use_idf=True)

features = tfidf_vec.fit_transform(df.clean_text).toarray() # Remaps the words in the 2225 articles in the text column of
                                                  # data frame into features (superset of words) with an importance assigned
                                                  # based on each words frequency in the document and across documents

labels = df.check_id                           # represents the category of each of the 2225 articles


In [23]:
#Get a feel of the features identified by tfidf
features.shape # How many features are there ?

(2528, 10000)

In [24]:
# Remember the dictionary created to map category names to a number ?
category_to_id.items()

dict_items([('fake', 0), ('fato', 1)])

In [25]:
# The sorted function Converts dictionary items into a (sorted) list.
# In subsequent steps - We will use this list to iterate over the categories
sorted(category_to_id.items())

[('fake', 0), ('fato', 1)]

In [26]:
df.to_csv('db/out_df.csv', index=False)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2528 entries, 0 to 2543
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Index       2528 non-null   int64 
 1   Título      2528 non-null   object
 2   Textos      2528 non-null   object
 3   check       2528 non-null   object
 4   check_id    2528 non-null   int64 
 5   clean_text  2528 non-null   object
dtypes: int64(2), object(4)
memory usage: 138.2+ KB


In [28]:
X = df.loc[:,'clean_text']
y = df.loc[:,'check_id']

In [29]:
# Basic validation: splitting the data 80-20-20 train/test
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, df.index, test_size=0.2, random_state=55)
#X_train, X_cv, y_train, y_cv = train_test_split(X,y,test_size=.25, random_state=10)


In [30]:
# Tf-Idf transformation
xtrain_tfidf = tfidf_vec.fit_transform(X_train)
xtest_tfidf = tfidf_vec.transform(X_test)
#xcv_tfidf = tfidf_vec.fit_transform(X_cv)
xtrain_tfidf.shape

(2022, 10000)

In [31]:
# Count Vectorizer transformation
xtrain_cv = count_vec.fit_transform(X_train)
xtest_cv = count_vec.transform(X_test)
xtrain_cv.shape

(2022, 10000)

## Part 4: Model Training and Evaluation

In [32]:
#create list of model and accuracy dicts
perform_list = []

In [34]:
def run_model(model_name, est_c, est_pnlty):
    mdl=''
    if model_name == 'Logistic Regression':
        mdl = LogisticRegression()
    elif model_name == 'Random Forest':
        mdl = RandomForestClassifier(n_estimators=100)
    elif model_name == 'Multinomial Naive Bayes':
        mdl = MultinomialNB()
    elif model_name == 'Linear SVC':
        mdl = LinearSVC()
    elif model_name == 'BUM_MODEL_IMPLEMENTATION':
        mdl = LogisticRegression()
    elif model_name == 'KNeighborsClassifier':
        mdl = KNeighborsClassifier(n_neighbors=3)
    elif model_name == 'Logistic Regression GridSearchCV':
        mdl = LogisticRegression(C=est_c, penalty=est_pnlty)

    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(xtrain_tfidf, y_train)
    y_pred = oneVsRest.predict(xtest_tfidf)

    # Performance metrics
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    # Get precision, recall, f1 scores
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall    : {recall}')
    print(f'F1-score   : {f1score}')

    # Add performance parameters to list
    perform_list.append(dict([
        ('Model', model_name),
        ('Test Accuracy', round(accuracy, 2)),
        ('Precision', round(precision, 2)),
        ('Recall', round(recall, 2)),
        ('F1', round(f1score, 2))
         ]))

In [35]:
run_model('Random Forest', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Random Forest: % 99.41
Precision : 0.9940711462450593
Recall    : 0.9940711462450593
F1-score   : 0.9940711462450593


In [36]:
run_model('Logistic Regression', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Logistic Regression: % 96.64
Precision : 0.9664031620553359
Recall    : 0.9664031620553359
F1-score   : 0.9664031620553359


In [37]:
run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Multinomial Naive Bayes: % 97.63
Precision : 0.9762845849802372
Recall    : 0.9762845849802372
F1-score   : 0.9762845849802372


In [38]:
run_model('BUM_MODEL_IMPLEMENTATION', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic BUM_MODEL_IMPLEMENTATION: % 96.64
Precision : 0.9664031620553359
Recall    : 0.9664031620553359
F1-score   : 0.9664031620553359


In [39]:
run_model('Linear SVC', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Linear SVC: % 97.83
Precision : 0.9782608695652174
Recall    : 0.9782608695652174
F1-score   : 0.9782608695652174


In [40]:
run_model('KNeighborsClassifier', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic KNeighborsClassifier: % 97.63
Precision : 0.9762845849802372
Recall    : 0.9762845849802372
F1-score   : 0.9762845849802372


### Performance metrics of models

In [41]:
model_performance = pd.DataFrame(data=perform_list)
model_performance = model_performance[['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']]
model_performance

Unnamed: 0,Model,Test Accuracy,Precision,Recall,F1
0,Random Forest,99.41,0.99,0.99,0.99
1,Logistic Regression,96.64,0.97,0.97,0.97
2,Multinomial Naive Bayes,97.63,0.98,0.98,0.98
3,BUM_MODEL_IMPLEMENTATION,96.64,0.97,0.97,0.97
4,Linear SVC,97.83,0.98,0.98,0.98
5,KNeighborsClassifier,97.63,0.98,0.98,0.98


In [50]:
# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['check_id'], test_size=0.2, random_state=42)

# Vetorização dos dados de treino e teste
xtrain_tfidf = tfidf_vec.fit_transform(X_train)
xtest_tfidf = tfidf_vec.transform(X_test)

# Treinar um modelo (ex: Logistic Regression)
model = LogisticRegression()
model.fit(xtrain_tfidf, y_train)

# Função para prever a probabilidade de ser Fake News
def predict_fake_news_probability(text, model, vectorizer):
    """
    Função para prever a probabilidade de um texto ser Fake News.

    :param text: Texto a ser analisado.
    :param model: Modelo treinado (ex: Logistic Regression, Random Forest, etc.).
    :param vectorizer: Vetorizador usado para transformar o texto em features (ex: TfidfVectorizer).
    :return: Probabilidade de ser Fake News.
    """
    # Limpeza do texto
    cleaned_text = clean_text(text)

    # Transformação do texto em features usando o vetorizador
    text_features = vectorizer.transform([cleaned_text])

    # Previsão da probabilidade
    probability = model.predict_proba(text_features)

    # Retorna a probabilidade de ser Fake News (assumindo que a classe 1 é Fake News)
    return probability[0][1]

In [55]:
# Função principal
def main():
    # Substitua o argparse por entradas interativas no Colab
    model_choice = input("Escolha o modelo (logistic_regression, random_forest, svm): ").strip().lower()
    text_to_analyze = input("Insira o texto a ser analisado: ").strip()

    # Carregar os dados (assumindo que já existe um DataFrame df)
    # Substitua 'data.csv' pelo caminho do seu arquivo de dados
    df = pd.read_csv('db/out_df.csv')

    # Dividir os dados em treino e teste
    X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['check_id'], test_size=0.2, random_state=42)

    # Vetorização dos dados de treino e teste
    tfidf_vec = TfidfVectorizer()
    xtrain_tfidf = tfidf_vec.fit_transform(X_train)
    xtest_tfidf = tfidf_vec.transform(X_test)

    # Selecionar o modelo de treinamento
    if model_choice == 'logistic_regression':
        model = LogisticRegression()
    elif model_choice == 'random_forest':
        model = RandomForestClassifier()
    elif model_choice == 'svm':
        model = SVC(probability=True)
    else:
        raise ValueError("Modelo inválido. Escolha entre 'logistic_regression', 'random_forest' ou 'svm'.")

    # Treinar o modelo
    model.fit(xtrain_tfidf, y_train)

    # Previsão da probabilidade
    probabilidade_fake_news = predict_fake_news_probability(text_to_analyze, model, tfidf_vec)
    print(f"Probabilidade de ser Fake News: {probabilidade_fake_news:.2f}")

# Executar a função principal
if __name__ == "__main__":
    main()

Escolha o modelo (logistic_regression, random_forest, svm): logistic_regression
Insira o texto a ser analisado: URGENTE! Em uma decisão que está chocando o Brasil e o mundo, o presidente Lula acaba de anunciar um plano polêmico e assustador: a legalização da COCAÍNA para uso em crianças! Isso mesmo, você não leu errado! O governo federal está preparando um programa que vai distribuir doses controladas de cocaína para pais que queiram tratar problemas de saúde de seus filhos, como hiperatividade e falta de atenção
Probabilidade de ser Fake News: 0.70
