## Part 1: Setup & Preprocessing

In [47]:
# Standard Libraries
import pandas as pd
import numpy as np
import json

# Data Preprocessing & NLP
import nltk
import re
import string
import gensim
from textblob import Word

import xgboost as xgb
from xgboost import XGBClassifier

# Importações necessárias para BERT
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, BaggingRegressor, GradientBoostingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics.pairwise import linear_kernel

# Performance metrics
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to C:\Users\Polga-
[nltk_data]     Fe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Polga-
[nltk_data]     Fe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Polga-
[nltk_data]     Fe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Polga-
[nltk_data]     Fe/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [48]:
df = pd.read_csv('db/FakeReal.csv')

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4563 entries, 0 to 4562
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   index    4563 non-null   int64  
 1   titulo   4563 non-null   object 
 2   URL      4563 non-null   object 
 3   index.1  4563 non-null   float64
 4   resumo   4563 non-null   object 
 5   check    4563 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 214.0+ KB


In [50]:
df['check'].value_counts()

check
fake    2775
fato    1788
Name: count, dtype: int64

In [51]:
# Associate Category names with numerical index and save it in new column category_id
df['check_id'] = df['check'].factorize()[0]
df.head()

Unnamed: 0,index,titulo,URL,index.1,resumo,check,check_id
0,0,É #FAKE vídeo de Paolla Oliveira anunciando de...,https://g1.globo.com/fato-ou-fake/video/e-fake...,0.0,é fake post que usa vídeo de paolla oliveira p...,fake,0
1,1,É #FAKE que vídeo mostre criminosos deixando d...,https://g1.globo.com/fato-ou-fake/noticia/2025...,1.0,polícia civil da bahia onde foi feita a gravaç...,fake,0
2,2,É #FATO: vídeo viral mostra treinamento milita...,https://g1.globo.com/fato-ou-fake/noticia/2025...,2.0,publicada em fevereiro de na conta oficial de ...,fato,1
3,3,É #FAKE que mamografia aumenta o risco de cânc...,https://g1.globo.com/fato-ou-fake/noticia/2025...,3.0,integrante da sociedade brasileira de mastolog...,fake,0
4,4,É #FAKE vídeo em que famosos usam camiseta com...,https://g1.globo.com/fato-ou-fake/noticia/2025...,4.0,autor do registro é criador de conteúdos que u...,fake,0


In [52]:
# Create a new pandas dataframe "category_id_df", which only has unique Categories, also sorting this list in order of category_id values
category_id_df = df[['check', 'check_id']].drop_duplicates().sort_values('check_id')

In [53]:
# Create a dictionary ( python datastructure - like a lookup table) that
# can easily convert category names into category_ids and vice-versa
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['check_id', 'check']].values)

In [54]:
id_to_category

{0: 'fake', 1: 'fato'}

In [55]:
category_id_df

Unnamed: 0,check,check_id
0,fake,0
2,fato,1


## Part 2. Cleaning

In [56]:
# Drop duplicate data
df.drop_duplicates(subset=['check', 'titulo'], inplace=True)

In [57]:
# Data Cleaning
def clean_text(text):
    # remove everything except alphabets
    text = re.sub("[^a-zA-Z]", " ", text)
    # remove whitespaces
    text = ' '.join(text.split())
    text = text.lower()

    return text

In [58]:
df.dropna(how='any',axis=0,inplace=True)
df.head()

Unnamed: 0,index,titulo,URL,index.1,resumo,check,check_id
0,0,É #FAKE vídeo de Paolla Oliveira anunciando de...,https://g1.globo.com/fato-ou-fake/video/e-fake...,0.0,é fake post que usa vídeo de paolla oliveira p...,fake,0
1,1,É #FAKE que vídeo mostre criminosos deixando d...,https://g1.globo.com/fato-ou-fake/noticia/2025...,1.0,polícia civil da bahia onde foi feita a gravaç...,fake,0
2,2,É #FATO: vídeo viral mostra treinamento milita...,https://g1.globo.com/fato-ou-fake/noticia/2025...,2.0,publicada em fevereiro de na conta oficial de ...,fato,1
3,3,É #FAKE que mamografia aumenta o risco de cânc...,https://g1.globo.com/fato-ou-fake/noticia/2025...,3.0,integrante da sociedade brasileira de mastolog...,fake,0
4,4,É #FAKE vídeo em que famosos usam camiseta com...,https://g1.globo.com/fato-ou-fake/noticia/2025...,4.0,autor do registro é criador de conteúdos que u...,fake,0


In [59]:
df.columns

Index(['index', 'titulo', 'URL', 'index.1', 'resumo', 'check', 'check_id'], dtype='object')

In [60]:
# creating clean text feature
df['clean_text'] = df['resumo'].apply(clean_text).str.replace('bn bn ', '')

# creating clean text feature
df['clean_text'] = df['resumo'].apply(clean_text).str.replace(' bn ', '')

In [61]:
for i in df['resumo']:
    print(i, type(i))

é fake post que usa vídeo de paolla oliveira para anunciar desconto de em camarote no carnaval tratase de golpe <class 'str'>
polícia civil da bahia onde foi feita a gravação explicou ao g que objetos são do tipo airsoft e simulacros os dois homens que aparecem no registro são criadores de conteúdo e não membros de organização criminosas como dizem publicações falsas <class 'str'>
publicada em fevereiro de na conta oficial de uma companhia de treinamento de fuzileiros navais gravação voltou a circular no início deste ano no registro homem com pés amarrados usa boca para resgatar máscara no fundo de uma piscina <class 'str'>
integrante da sociedade brasileira de mastologia sbm desmentiu afirmações de que o exame de detecção amplia chance de pacientes terem a doença <class 'str'>
autor do registro é criador de conteúdos que usam inteligência artificial e ferramentas de checagem apontaram o uso desse recurso além disso o instagram sinalizou que vídeo é falso e a atriz scarlett johansson p

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4122 entries, 0 to 4562
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       4122 non-null   int64  
 1   titulo      4122 non-null   object 
 2   URL         4122 non-null   object 
 3   index.1     4122 non-null   float64
 4   resumo      4122 non-null   object 
 5   check       4122 non-null   object 
 6   check_id    4122 non-null   int64  
 7   clean_text  4122 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 289.8+ KB


## Part 3: Heading to Machine Learning

In [63]:
# Lemmatization process
'''
Words in the third person are changed to first person and verbs in past and future tenses are changed into the present by the
lemmatization process.
'''
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    # tokenization to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []

    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    lem = [lemmatizer.lemmatize(t) for t in filtered_tokens]
    return lem

In [64]:
nltk.download('stopwords')
stop_words_pt = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to C:\Users\Polga-
[nltk_data]     Fe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
# Defining a Count Vectorizer object
count_vec = CountVectorizer(stop_words=stop_words_pt, max_features=10000)
# Defining a TF-IDF Vectorizer
tfidf_vec = TfidfVectorizer(stop_words=stop_words_pt, ngram_range=(1, 2), tokenizer=tokenize_and_lemmatize, max_features=10000, use_idf=True)

features = tfidf_vec.fit_transform(df.clean_text).toarray() # Remaps the words in the 2225 articles in the text column of
                                                  # data frame into features (superset of words) with an importance assigned
                                                  # based on each words frequency in the document and across documents

labels = df.check_id                           # represents the category of each of the 2225 articles


In [66]:
#Get a feel of the features identified by tfidf
features.shape # How many features are there ?

(4122, 10000)

In [67]:
# Remember the dictionary created to map category names to a number ?
category_to_id.items()

dict_items([('fake', 0), ('fato', 1)])

In [68]:
# The sorted function Converts dictionary items into a (sorted) list.
# In subsequent steps - We will use this list to iterate over the categories
sorted(category_to_id.items())

[('fake', 0), ('fato', 1)]

In [69]:
df.to_csv('db/out_df.csv', index=False)

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4122 entries, 0 to 4562
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       4122 non-null   int64  
 1   titulo      4122 non-null   object 
 2   URL         4122 non-null   object 
 3   index.1     4122 non-null   float64
 4   resumo      4122 non-null   object 
 5   check       4122 non-null   object 
 6   check_id    4122 non-null   int64  
 7   clean_text  4122 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 289.8+ KB


In [71]:
X = df.loc[:,'clean_text']
y = df.loc[:,'check_id']

In [72]:
# Basic validation: splitting the data 80-20-20 train/test
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, df.index, test_size=0.2, random_state=55)
#X_train, X_cv, y_train, y_cv = train_test_split(X,y,test_size=.25, random_state=10)


In [73]:
# Tf-Idf transformation
xtrain_tfidf = tfidf_vec.fit_transform(X_train)
xtest_tfidf = tfidf_vec.transform(X_test)
#xcv_tfidf = tfidf_vec.fit_transform(X_cv)
xtrain_tfidf.shape

(3297, 10000)

In [74]:
# Count Vectorizer transformation
xtrain_cv = count_vec.fit_transform(X_train)
xtest_cv = count_vec.transform(X_test)
xtrain_cv.shape

(3297, 9936)

## Part 4: Model Training and Evaluation

In [75]:
#create list of model and accuracy dicts
perform_list = []

In [76]:
def run_model(model_name, est_c, est_pnlty):
    mdl=''
    if model_name == 'Logistic Regression':
        mdl = LogisticRegression()
    elif model_name == 'Random Forest':
        mdl = RandomForestClassifier(n_estimators=100)
    elif model_name == 'Multinomial Naive Bayes':
        mdl = MultinomialNB()
    elif model_name == 'Linear SVC':
        mdl = LinearSVC()       
    elif model_name == 'BUM_MODEL_IMPLEMENTATION':
        mdl = LogisticRegression()   
    elif model_name == 'KNeighborsClassifier':
        mdl = KNeighborsClassifier(n_neighbors=3)
    elif model_name == 'Logistic Regression GridSearchCV':
        mdl = LogisticRegression(C=est_c, penalty=est_pnlty)

    oneVsRest = OneVsRestClassifier(mdl)
    oneVsRest.fit(xtrain_tfidf, y_train)
    y_pred = oneVsRest.predict(xtest_tfidf)

    # Performance metrics
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    # Get precision, recall, f1 scores
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')

    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall    : {recall}')
    print(f'F1-score   : {f1score}')

    # Add performance parameters to list
    perform_list.append(dict([
        ('Model', model_name),
        ('Test Accuracy', round(accuracy, 2)),
        ('Precision', round(precision, 2)),
        ('Recall', round(recall, 2)),
        ('F1', round(f1score, 2))
         ]))

In [77]:
run_model('Random Forest', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Random Forest: % 90.67
Precision : 0.9066666666666666
Recall    : 0.9066666666666666
F1-score   : 0.9066666666666666


In [78]:
run_model('Logistic Regression', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Logistic Regression: % 91.64
Precision : 0.9163636363636364
Recall    : 0.9163636363636364
F1-score   : 0.9163636363636364


In [79]:
run_model('Multinomial Naive Bayes', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Multinomial Naive Bayes: % 87.03
Precision : 0.8703030303030304
Recall    : 0.8703030303030304
F1-score   : 0.8703030303030304


In [80]:
run_model('BUM_MODEL_IMPLEMENTATION', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic BUM_MODEL_IMPLEMENTATION: % 91.64
Precision : 0.9163636363636364
Recall    : 0.9163636363636364
F1-score   : 0.9163636363636364


In [81]:
run_model('Linear SVC', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic Linear SVC: % 92.73
Precision : 0.9272727272727272
Recall    : 0.9272727272727272
F1-score   : 0.9272727272727272


In [82]:
run_model('KNeighborsClassifier', est_c=None, est_pnlty=None)

Test Accuracy Score of Basic KNeighborsClassifier: % 84.61
Precision : 0.8460606060606061
Recall    : 0.8460606060606061
F1-score   : 0.8460606060606061


### Performance metrics of models

In [83]:
model_performance = pd.DataFrame(data=perform_list)
model_performance = model_performance[['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']]
model_performance

Unnamed: 0,Model,Test Accuracy,Precision,Recall,F1
0,Random Forest,90.67,0.91,0.91,0.91
1,Logistic Regression,91.64,0.92,0.92,0.92
2,Multinomial Naive Bayes,87.03,0.87,0.87,0.87
3,BUM_MODEL_IMPLEMENTATION,91.64,0.92,0.92,0.92
4,Linear SVC,92.73,0.93,0.93,0.93
5,KNeighborsClassifier,84.61,0.85,0.85,0.85
