<p style="color:white;
          text-align:center;
          font-size:24px;">
Inicialização de Bibliotecas, Escolha de Modelos e Leitura do Dataset Principal</p>


In [1]:
import os
import ollama
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

# Ollama model tags whose embeddings are generated and compared by the cells below.
models_names = ['qwen2:7b','phi3:14b','llama3:8b','gemma:7b','phi3','mxbai-embed-large','all-minilm','snowflake-arctic-embed2']
# Same tags with ':' swapped for '_' so they can be used as folder names.
folder_models_names = [model.replace(':','_') for model in models_names]

# Working folders. exist_ok=True keeps the cell re-runnable without hiding
# unrelated errors behind a blanket try/except.
for folder in ('Data', 'Results'):
    os.makedirs(folder, exist_ok=True)

# NOTE(review): the paths below use Windows-style '\' separators, as does the
# rest of the notebook; switch every cell together if portability is needed.
dataset_source = r'300-noticias-v2-filtradas.csv'
dataset_path = r'Data\300-noticias-v2-filtradas.csv'

# Move the dataset into the Data folder when it is still sitting next to the
# notebook. os.replace overwrites an existing destination on every platform,
# whereas os.rename raises FileExistsError on Windows in that situation.
if os.path.isfile(dataset_source):
    os.replace(dataset_source, dataset_path)

# Seed reused by the stochastic steps of the notebook (t-SNE, fold sampling).
seed = 7

data_frame = pd.read_csv(dataset_path)
data_frame

Unnamed: 0.1,Unnamed: 0,Fonte,ID,text,Tematica,Rótulo,Tipo_Dado,Texto_Original
0,0,G1,0,"A publicação é intitulada ""REVIRAVOLTA no FGTS...",Política,falso,Imagem,REVIRAVOLTA no FGTS 2024: Lula autorizou SAQUE...
1,2,G1,2,"Com voto de Dino, Primeira Turma do STF mantém...",Política,real,Texto,"Com voto de Dino, Primeira Turma do STF mantém..."
2,3,G1,3,Bolsonaro presta depoimento na PF e nega ter i...,Política,real,Texto,Bolsonaro presta depoimento na PF e nega ter i...
3,4,Aos Fatos,4,Governo Lula publica aumento ao Auxílio-Reclus...,Política,falso,Imagem,"Governo Lula ""publica aumento ao Auxílio-Reclu..."
4,5,G1,5,É #FAKE que Ministério da Justiça vai indeniza...,Política,falso,Vídeo,Noticia G1: Jovem tenta pular o muro de residê...
...,...,...,...,...,...,...,...,...
295,472,G1,472,Governo pratica congelamento não declarado de ...,Política,real,Texto,Governo pratica congelamento não declarado de ...
296,473,G1,473,"Sem citar Rússia, governo brasileiro condena b...",Política,real,Texto,"Sem citar Rússia, governo brasileiro condena b..."
297,474,G1,474,"Ao STF, PF diz que venda ilegal de joias foi p...",Política,real,Texto,"Ao STF, PF diz que venda ilegal de joias foi p..."
298,475,G1,475,Cerca de R$ 10 bilhões do pacote de corte de g...,Política,real,Texto,Cerca de R$ 10 bilhões do pacote de corte de g...


<p style="color:white;
          text-align:center;
          font-size:24px;">
Criação de Data Frames com Rótulos e Valores de Embedding de Cada Modelo</p>


In [None]:
# For every model: embed each news text with Ollama and store one CSV whose
# first column is the label ('Rótulo') and whose remaining columns are the
# embedding components.
for model_name in models_names:
    # Folder name for this model (':' replaced by '_'). Computing it once also
    # avoids reusing the same quote character inside an f-string expression,
    # which Python versions before 3.12 reject.
    model_folder = model_name.replace(':','_')
    os.makedirs(fr'Data\{model_folder}', exist_ok=True)

    # Probe the model with a dummy input to learn the embedding width.
    len_model = len(ollama.embed(model=model_name,input='Test')['embeddings'][0])

    # Size the buffers from the data instead of a hard-coded 300 so that no
    # uninitialised row is ever written to disk.
    n_rows = len(data_frame)
    vector = np.empty((n_rows,len_model),dtype=np.float32)
    # 'U5' keeps at most five characters per label, as the original buffer did.
    label = data_frame['Rótulo'].to_numpy().astype('U5').reshape(n_rows,1)

    for index,news in enumerate(data_frame['Texto_Original']):
        embedding_valor = ollama.embed(model=model_name,input=news)
        vector[index] = embedding_valor['embeddings'][0]

    # Join labels and vectors once, after the loop (it used to be rebuilt on
    # every iteration). Mixing a string array with a float array yields a
    # string array, so the values are written to the CSV as text.
    labelad_vector = np.concatenate((label,vector),axis=1)
    labelad_df = pd.DataFrame(labelad_vector)
    labelad_df = labelad_df.rename(columns={labelad_df.columns[0]: 'Rótulo'})
    # The file name (including its spelling) is what the later cells read back.
    labelad_df.to_csv(path_or_buf= fr'Data\{model_folder}\Embembedding.csv',
                  header=True,
                  index=False,
                  encoding='utf-8')

<p style="color:white;
          text-align:center;
          font-size:24px;">
Criação de Gráficos 2D e 3D dos Embeddings de Cada Modelo</p>


In [None]:
# Element-wise test "label == 'real'"; True marks real news, False fake news.
boolean_map = np.vectorize(lambda z: z == 'real')

# For every model: project its embeddings with t-SNE and save a static 2D
# scatter (PNG) plus an interactive 3D scatter (HTML) under Results\<model>.
for model_name in folder_models_names:

    os.makedirs(fr'Results\{model_name}', exist_ok=True)

    # Parse the embedding table once: column 0 is the label, the rest the vector.
    embedding_df = pd.read_csv(fr'Data\{model_name}\Embembedding.csv')
    vector = np.array(embedding_df.iloc[:,1:])
    label = np.array(embedding_df.iloc[:,0])
    is_real = boolean_map(label)

    tsne_2d = TSNE(n_components=2,random_state=seed).fit_transform(vector)
    tsne_3d = TSNE(n_components=3,random_state=seed).fit_transform(vector)
    tsne_3d_true = tsne_3d[is_real]
    tsne_3d_false = tsne_3d[~is_real]

    # ---- static 2D scatter -------------------------------------------------
    fig, ax = plt.subplots(figsize=(12, 8))

    ax.scatter(tsne_2d[:,0],
               tsne_2d[:,1],
               c=is_real,
               cmap='coolwarm',
               alpha=1,
               edgecolors='w',
               s=100)
    ax.set_title(f'{model_name}',fontsize = 32)

    fig.savefig(fr'Results\{model_name}\2d.png',
                dpi=300,
                bbox_inches='tight',
                facecolor='white')

    plt.show()
    # Release the figure explicitly so figures do not pile up across models.
    plt.close(fig)

    # ---- interactive 3D scatter --------------------------------------------
    interactive_graph = go.Figure()

    for points, colour, legend in ((tsne_3d_true, 'blue', 'Real'),
                                   (tsne_3d_false, 'red', 'Falso')):
        interactive_graph.add_trace(go.Scatter3d(
            x=points[:,0],
            y=points[:,1],
            z=points[:,2],
            mode='markers',
            marker=dict(size=4, color=colour, opacity=1),
            name=legend
        ))

    # Hand-drawn black axis lines through the origin; the built-in scene axes
    # are hidden in the layout below.
    for axis_position, axis_letter in enumerate(('X', 'Y', 'Z')):
        ends = [[0, 0], [0, 0], [0, 0]]
        ends[axis_position] = [-100, 100]
        interactive_graph.add_trace(go.Scatter3d(
            x=ends[0],
            y=ends[1],
            z=ends[2],
            mode='lines',
            line=dict(color='black', width=5),
            name=f'Eixo {axis_letter}'
        ))

    interactive_graph.update_layout(
        title={'text':f'<b>{model_name}</b>',
               'x': 0.5,
               'y': 0.95},
        title_font_size=32,
        scene=dict(
            xaxis=dict(visible=False),
            yaxis=dict(visible=False),
            zaxis=dict(visible=False)),
        paper_bgcolor='white',
        plot_bgcolor='white')

    interactive_graph.write_html(fr'Results\{model_name}\3d.html')

    interactive_graph.show()

<p style="color:white;
          text-align:center;
          font-size:24px;">
Separação de Dados</p>


In [None]:
np.random.seed(seed)


def _pop_random_row(pool, wanted_label):
    """Draw random rows from ``pool`` until one has ``wanted_label`` in column 0.

    Returns the matching row and a copy of ``pool`` without it. Random indices
    are drawn with ``np.random.randint`` one at a time, so the global random
    stream is consumed exactly as a plain retry loop would consume it.

    Raises:
        ValueError: if no row with ``wanted_label`` is left. Without this check
            the retry loop would never terminate.
    """
    if not np.any(pool[:,0] == wanted_label):
        raise ValueError(f'No remaining sample labelled {wanted_label!r} to build a balanced fold')
    while True:
        position = np.random.randint(0,pool.shape[0])
        if pool[position,0] == wanted_label:
            return pool[position], np.delete(pool,position,axis=0)


# For every model: reorder the embedding table into 10 contiguous folds and
# save it as K_fold.csv. The first 9 folds are balanced (alternating draws of
# 'real' and 'falso', then shuffled); the last fold is whatever rows remain.
for model_name in folder_models_names:

    model_df = np.array(pd.read_csv(fr'Data\{model_name}\Embembedding.csv'))
    n_rows, n_columns = model_df.shape

    # Each balanced fold holds `pairs_per_fold` real/fake pairs. Deriving the
    # fold stride from the same number keeps the buffers and the destination
    # slice in agreement (30 rows per fold for the 300-row dataset).
    pairs_per_fold = n_rows // 20
    fold_rows = 2 * pairs_per_fold

    k_vector = np.empty((fold_rows,n_columns-1),dtype=np.float32)
    k_label = np.empty((fold_rows,1),'U5')
    k_fold = np.empty((n_rows,n_columns),dtype=object)

    for j in range(9):

        for i in range(pairs_per_fold):

            # Even slots receive a 'real' sample, odd slots a 'falso' one; each
            # drawn row is removed from the pool so it cannot be picked twice.
            row, model_df = _pop_random_row(model_df,'real')
            k_vector[2*i] = row[1:]
            k_label[2*i] = row[0]

            row, model_df = _pop_random_row(model_df,'falso')
            k_vector[2*i + 1] = row[1:]
            k_label[2*i + 1] = row[0]

        # concatenate copies the buffers, so reusing them for the next fold is safe.
        k_labelad = np.concatenate((k_label,k_vector),axis=1)
        np.random.shuffle(k_labelad)

        k_fold[j*fold_rows:(j+1)*fold_rows,:] = k_labelad

    # Rows never drawn form the final fold (kept in their current order).
    k_fold[n_rows - model_df.shape[0]:n_rows,:] = model_df

    k_fold = pd.DataFrame(k_fold)

    k_fold.to_csv(path_or_buf= fr'Data\{model_name}\K_fold.csv',
                header=True,
                index=False,
                encoding='utf-8')

<p style="color:white;
          text-align:center;
          font-size:24px;">
          Treinamento e Classificação</p>


In [None]:
# One Random Forest seed per fold. They come from a local generator seeded with
# `seed`, so the values no longer depend on how much of the global NumPy random
# stream earlier cells happened to consume before this cell was run.
forest_seed = np.random.RandomState(seed).randint(0,100,size=10)

# For every model: 10-fold cross-validation over the contiguous folds stored in
# K_fold.csv; the out-of-fold predictions are saved, in file order, as
# Prediction.csv.
for model_name in folder_models_names:

    # Read the fold table once per model (column 0 = label, rest = features).
    k_fold = np.array(pd.read_csv(fr'Data\{model_name}\K_fold.csv'))
    n_rows = k_fold.shape[0]
    # 30 rows per fold for the 300-row dataset; any remainder joins the last fold.
    fold_size = n_rows // 10

    k_fold_labels = np.empty((n_rows,1),dtype='U5')

    for i in range(10):
        start = fold_size*i
        stop = n_rows if i == 9 else fold_size*(i+1)

        x_test = k_fold[start:stop,1:]

        # Everything outside the current fold is training data.
        train_rows = np.delete(k_fold,np.arange(start,stop),axis=0)
        x_train = train_rows[:,1:]
        y_train = train_rows[:,0]

        RFC = RandomForestClassifier(max_features=None,random_state=forest_seed[i]).fit(x_train,y_train)

        y_pred = RFC.predict(x_test).reshape(-1,1)
        k_fold_labels[start:stop,:] = y_pred

    k_fold_labels = pd.DataFrame(k_fold_labels)

    k_fold_labels.to_csv(path_or_buf= fr'Data\{model_name}\Prediction.csv',
                header=True,
                index=False,
                encoding='utf-8')

<p style="color:white;
          text-align:center;
          font-size:24px;">
          Matriz de Confusão e Métricas</p>


In [None]:
# Fixed class order for the confusion matrix: index 0 = 'falso', index 1 = 'real'.
# Passing it explicitly means the indexing below does not rely on implicit
# label sorting.
class_labels = ['falso','real']


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


# One row per model: fake p/r/f1, true p/r/f1, macro f1.
metrics_matrix = np.empty(shape=(len(folder_models_names),7))

for i,model_name in enumerate(folder_models_names):

    y_true = np.array(pd.read_csv(fr'Data\{model_name}\K_fold.csv'))[:,0]
    # Prediction.csv has a single column; flatten it to a 1-D label vector.
    y_pred = np.array(pd.read_csv(fr'Data\{model_name}\Prediction.csv')).ravel()

    # Rows are true labels, columns are predicted labels.
    matrix = confusion_matrix(y_true=y_true,y_pred=y_pred,labels=class_labels)

    fig, ax = plt.subplots()
    sns.heatmap(matrix,annot=True,fmt='d',cmap='Blues',ax=ax)
    ax.set_xlabel('Falso           (Previsto)           Real')
    ax.set_ylabel('Real           (Rótulo)           Falso')
    ax.set_title(f'{model_name}',fontsize = 24)
    fig.savefig(fr'Results\{model_name}\Confusion_Matrix.png',
                dpi=300,
                bbox_inches='tight',
                facecolor='white')
    plt.show()
    # Release the figure so heatmaps never stack on a shared axes.
    plt.close(fig)

    falso_as_falso, falso_as_real = matrix[0,0], matrix[0,1]
    real_as_falso, real_as_real = matrix[1,0], matrix[1,1]

    # Precision divides by the column total (everything predicted as the
    # class); recall divides by the row total (everything truly in the class).
    fake_precision = _safe_ratio(falso_as_falso, falso_as_falso + real_as_falso)
    fake_recall = _safe_ratio(falso_as_falso, falso_as_falso + falso_as_real)
    fake_f1 = _safe_ratio(2*fake_precision*fake_recall, fake_precision + fake_recall)

    true_precision = _safe_ratio(real_as_real, real_as_real + falso_as_real)
    true_recall = _safe_ratio(real_as_real, real_as_real + real_as_falso)
    true_f1 = _safe_ratio(2*true_precision*true_recall, true_precision + true_recall)

    f1_macro = (fake_f1 + true_f1)/2

    metrics_matrix[i] = np.round([fake_precision,fake_recall,fake_f1,true_precision,true_recall,true_f1,f1_macro],2)

# Two header rows followed by one row per model; concatenating with the string
# headers turns every cell into text.
metrics_labels = np.array([['Models',' ','Fake','  ','   ','True','    ','f₁ - Macros'],
                        [' ','p','r','f₁','p','r','f₁','  ']])
metrics_matrix = np.concatenate((np.array(models_names).reshape(len(models_names),1),metrics_matrix),axis=1)
metrics_matrix = np.concatenate((metrics_labels,metrics_matrix),axis=0)
metrics_data_frame = pd.DataFrame(metrics_matrix)

metrics_data_frame.to_csv(path_or_buf=r'Results\Metric_Matrix.csv',
                        index=False,
                        header=False,
                        encoding='utf-8')

metrics_data_frame
