In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import re
import string
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay, classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
import pickle
import nltk
nltk.download('stopwords')

In [None]:
# Read the test split and drop the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call -- confirm).
df = pd.read_csv('../data/test.csv').drop(columns=['Unnamed: 0'])

# Per-column flag: does the column contain at least one missing value?
df.isnull().any()

In [None]:
# Portuguese stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('portuguese'))


def remover_stop_words(news):
    """Return `news` with every stop word removed.

    The text is split on whitespace; a token is dropped when its lowercase
    form is in `stop_words`. Surviving tokens keep their original casing and
    are re-joined with single spaces.
    """
    kept = []
    for token in news.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def review_cleaning(text):
    """Normalize a news text before it is vectorized.

    The input is converted with ``str()`` and lowercased, then the following
    are removed, in this order: square-bracketed spans, URLs
    (``http(s)://...`` or ``www....``), HTML-like tags, ASCII punctuation,
    newline characters, and every token that contains a digit.

    Whitespace left behind by the removals is not collapsed.

    Args:
        text: Any object; it is passed through ``str()`` first.

    Returns:
        The cleaned, lowercased string.
    """
    text = str(text).lower()
    # Raw strings: '\[', '\S', '\w' and '\d' are invalid escape sequences in a
    # normal string literal and trigger warnings on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Newlines are deleted, not replaced by a space, so words on either side
    # of a line break end up joined.
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Strip stop words first, then run the regex-based cleaning on the result.
df["preprocessed_news"] = (
    df["preprocessed_news"]
    .apply(remover_stop_words)
    .apply(review_cleaning)
)

In [None]:
# Encode the target as integers: 'fake' -> 0, anything else -> 1.
# The guard keeps the cell idempotent: without it a second run would compare
# the already-encoded 0/1 values against 'fake' and turn every label into 1.
# The comparison is vectorized instead of a row-wise df.apply(axis=1).
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = (df['label'] != 'fake').astype(int)

# Features are every remaining column; the target is the encoded label.
X = df.drop(['label'], axis = 1)
Y = df['label']

In [None]:
from sklearn.model_selection import train_test_split

# Keep the raw texts under their own name. The original rebound `X` from
# DataFrame -> Series -> ndarray, so running this cell a second time failed.
texts = df['preprocessed_news'].apply(lambda text: text.lower())

# NOTE(review): both the Keras tokenizer and the CountVectorizer below are
# fitted on the *test* texts. The models loaded later were presumably trained
# with a vocabulary fitted on the training set; if so, the word indices and
# feature columns produced here do not line up with what those models expect.
# Load the fitted training tokenizer/vectorizer instead if they were saved.

# Integer sequences for the RNN, padded/truncated at the end to 256 tokens.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(texts.values)
train_word_index = tokenizer.word_index
vocab_length = len(train_word_index) + 1
test_sequences = tokenizer.texts_to_sequences(texts.values)
test_padded_seqeunces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding='post', maxlen=256, truncating='post')

# Dense bag-of-words matrix (at most 10000 features) for the scikit-learn models.
vectorizer = CountVectorizer(max_features=10000)
vectorizer.fit(texts)
X = vectorizer.transform(texts).toarray()

##### Logistic Regression


In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
with open('../models/logisticRegression.pkl', 'rb') as arquivo:
    lr_classifier = pickle.load(arquivo)

y_test_pred = lr_classifier.predict(X)
# Kept from the original: converts the predictions to booleans (value > 0.75).
y_test_pred = (y_test_pred > 0.75)

# scikit-learn metrics take (y_true, y_pred). The original passed them the
# other way round, which swaps precision and recall in the report.
lr_classifier_acc = round(accuracy_score(Y, y_test_pred) * 100, 2)
cm = confusion_matrix(Y, y_test_pred)
print(classification_report(Y, y_test_pred))

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake','True']).plot()



##### Multilayer perceptron (MLP)

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
with open('../models/MLPClassifier.pkl', 'rb') as arquivo:
    mlp = pickle.load(arquivo)

y_test_pred = mlp.predict(X)
# Kept from the original: converts the predictions to booleans (value > 0.75).
y_test_pred = (y_test_pred > 0.75)

# scikit-learn metrics take (y_true, y_pred). The original passed them the
# other way round, which swaps precision and recall in the report.
mlp_acc = round(accuracy_score(Y, y_test_pred) * 100, 2)
cm = confusion_matrix(Y, y_test_pred)
print(classification_report(Y, y_test_pred))

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake','True']).plot()


##### Multilayer perceptron (MLP) Com GridSearchCV

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
with open('../models/MLPClassifierWithGridSearchCV.pkl', 'rb') as arquivo:
    clf = pickle.load(arquivo)

y_test_pred = clf.predict(X)
# Kept from the original: converts the predictions to booleans (value > 0.75).
y_test_pred = (y_test_pred > 0.75)

# scikit-learn metrics take (y_true, y_pred). The original passed them the
# other way round, which swaps precision and recall in the report.
mlpG_acc = round(accuracy_score(Y, y_test_pred) * 100, 2)
cm = confusion_matrix(Y, y_test_pred)
print(classification_report(Y, y_test_pred))

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake','True']).plot()

    

#### Decision Tree

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
with open('../models/DecisionTreeClassifier.pkl', 'rb') as arquivo:
    decisionTree = pickle.load(arquivo)

y_test_pred = decisionTree.predict(X)
# Kept from the original: converts the predictions to booleans (value > 0.75).
y_test_pred = (y_test_pred > 0.75)

# scikit-learn metrics take (y_true, y_pred). The original passed them the
# other way round, which swaps precision and recall in the report.
decisionTree_acc = round(accuracy_score(Y, y_test_pred) * 100, 2)
cm = confusion_matrix(Y, y_test_pred)
print(classification_report(Y, y_test_pred))

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake','True']).plot()


#### Passive Aggressive

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
with open('../models/PassiveAggressiveClassifier.pkl', 'rb') as arquivo:
    passive = pickle.load(arquivo)

# The original stored this in `x_test_pred`, so every metric below was
# computed on the stale `y_test_pred` left over from the previous model.
y_test_pred = passive.predict(X)
# Kept from the original: converts the predictions to booleans (value > 0.75).
y_test_pred = (y_test_pred > 0.75)

# scikit-learn metrics take (y_true, y_pred). The original passed them the
# other way round, which swaps precision and recall in the report.
passive_acc = round(accuracy_score(Y, y_test_pred) * 100, 2)
cm = confusion_matrix(Y, y_test_pred)
print(classification_report(Y, y_test_pred))

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake','True']).plot()


#### RNN

In [None]:



# Only test data is tokenized in this notebook; the original message printed
# the test sequence count a second time under the label "Train sequences".
print(f'Vocab length {vocab_length}, Test sequences {len(test_sequences)} ')

In [None]:
# `y_train_pred` is never defined in this notebook, so the original line
# raised NameError on a fresh kernel. Show the shape of the padded RNN input.
test_padded_seqeunces.shape

In [None]:
from tensorflow.keras.models import load_model

rnn = load_model('../models/modelo_rnn.h5')
rnn_scores = rnn.predict(test_padded_seqeunces)
# Threshold the RNN's own output. The original thresholded `y_train_pred`,
# a name this notebook never defines, and ignored the prediction made above.
# NOTE(review): ravel() assumes one score per sample, i.e. output shape (n, 1)
# from a single sigmoid unit -- confirm against the saved model.
y_test_pred = (rnn_scores > 0.75).astype(int).ravel()

# There is now one prediction per test row, so `Y` is used in full. The
# original `Y=Y[:5000]` ran after the accuracy was computed and permanently
# truncated the labels for any cell executed afterwards.
# scikit-learn metrics take (y_true, y_pred).
rnn_acc = round(accuracy_score(Y, y_test_pred) * 100, 2)
cm = confusion_matrix(Y, y_test_pred)
print(classification_report(Y, y_test_pred))

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake','True']).plot()


### Comparando Modelos Diferentes

In [None]:

# Accuracy (in percent) of each evaluated model, keyed by its display name.
# Insertion order fixes the row order of the resulting table.
accuracy_by_model = {
    'Logistic Regression': lr_classifier_acc,
    'Decision Tree': decisionTree_acc,
    'MLPClassifier': mlp_acc,
    'MLPClassifier with GridSearchCV': mlpG_acc,
    'RNN': rnn_acc,
    'PassiveAggressiveClassifier': passive_acc,
}

models = pd.DataFrame({
    'Model': list(accuracy_by_model.keys()),
    'Model Accuracy Score': list(accuracy_by_model.values()),
})

In [None]:
# Rank the models from best to worst accuracy and renumber the rows from 0.
models_sorted = (
    models
    .sort_values(by='Model Accuracy Score', ascending=False)
    .reset_index(drop=True)
)

# Colour the table with the 'coolwarm' gradient, then apply the text styling.
styled_df = models_sorted.style.background_gradient(cmap='coolwarm')

cell_css = {
    'font-family': 'Lucida Calligraphy',
    'color': 'LightGreen',
    'font-size': '15px',
}
# Last expression of the cell, so the styled table is rendered as its output.
styled_df.set_properties(**cell_css)