In [1]:
%%capture
# imports
!pip install datasets
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
!pip install deslib
from deslib.util.diversity import double_fault
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer, CountVectorizer
!pip install 'umap-learn==0.3.10'
!pip install umap-learn[plot]
from umap.umap_ import UMAP

# Feature-extraction technique: word-level counts, lower-cased, English stop words removed
cv = CountVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
)

# LIAR dataset: one DataFrame per split
dataset_liar = load_dataset('liar')
liar_train = pd.DataFrame(dataset_liar['train'])
liar_test = pd.DataFrame(dataset_liar['test'])
liar_val = pd.DataFrame(dataset_liar['validation'])

# Sentimental LIAR dataset (senti): copies of the LIAR splits with remapped labels
senti_train = liar_train.copy()
senti_test = liar_test.copy()
senti_val = liar_val.copy()

# Label mapping following the Sentimental LIAR dataset proposal in [1].
senti_labels = {
    5: 0,
    4: 0,
    3: 1,
    1: 0,
    2: 1,
    0: 0,
}
senti_train['label'] = senti_train['label'].map(senti_labels)
senti_test['label'] = senti_test['label'].map(senti_labels)
senti_val['label'] = senti_val['label'].map(senti_labels)

# Plot-generation helpers: shared 2-component UMAP projector with a fixed seed
method = UMAP(
    n_neighbors=2,
    metric='euclidean',
    random_state=123456987,
    min_dist=0.7,
    n_components=2,
)

def plot_diversity_6(D_tilde,dataset_name,n_labels,embedding): # 6 labels
    """Scatter-plot a 2-D embedding with up to six label series and save it as a PDF.

    Parameters
    ----------
    D_tilde : array with three axes
        Indexed as ``D_tilde[instance, label_index, coordinate]`` where
        coordinate 0 is x and coordinate 1 is y.
    dataset_name : str
        First part of the output file name.
    n_labels : int
        Kept for backward compatibility; the value actually used is
        ``D_tilde.shape[1]``.
    embedding : str
        Second part of the output file name.

    The figure is written to ``dataset_name + '_' + embedding + '.pdf'``.
    A ``KeyError`` is raised if ``D_tilde`` has more than six label indices,
    since only six colours/markers are defined.
    """
    marker_size = 100
    colors = {
        0: 'gray',
        1: 'lightblue',
        2: 'lightslategray',
        3: 'darkslategray',
        4: 'midnightblue',
        5: 'black',
    }
    markers = {0: 's', 1: 'H', 2: 'd', 3: 'P', 4: '*', 5: 'X'}

    # The shape of the data overrides the n_labels argument.
    _, n_labels, _ = D_tilde.shape
    plt.figure(figsize=(15,10))
    # One scatter call per label index (instead of one per point) keeps the
    # number of artists small; points are therefore drawn label by label.
    for label_idx in range(n_labels):
        plt.scatter(
            D_tilde[:, label_idx, 0],
            D_tilde[:, label_idx, 1],
            color=colors[label_idx],
            s=marker_size,
            lw=0,
            marker=markers[label_idx],
        )

    # The legend always lists all six entries, regardless of D_tilde.shape[1].
    handles = [
        mlines.Line2D([], [], color=colors[key], marker=markers[key],
                      linestyle='None', markersize=10, label=str(key))
        for key in sorted(colors)
    ]
    plt.legend(handles=handles)
    plt.tight_layout()
    plt.savefig(dataset_name + '_' + embedding +'.pdf', dpi=450)

def plot_diversity_2(D_tilde,dataset_name,n_labels,embedding): # 2 labels
    """Scatter-plot a 2-D embedding with up to two label series and save it as a PDF.

    Parameters
    ----------
    D_tilde : array with three axes
        Indexed as ``D_tilde[instance, label_index, coordinate]`` where
        coordinate 0 is x and coordinate 1 is y.
    dataset_name : str
        First part of the output file name.
    n_labels : int
        Kept for backward compatibility; the value actually used is
        ``D_tilde.shape[1]``.
    embedding : str
        Second part of the output file name.

    The figure is written to ``dataset_name + '_' + embedding + '.pdf'``.
    A ``KeyError`` is raised if ``D_tilde`` has more than two label indices,
    since only two colours/markers are defined.
    """
    marker_size = 100
    colors = {0: 'gray', 1: 'darkslategray'}
    markers = {0: 's', 1: 'P'}

    # The shape of the data overrides the n_labels argument.
    _, n_labels, _ = D_tilde.shape
    plt.figure(figsize=(15,10))
    # One scatter call per label index (instead of one per point) keeps the
    # number of artists small; points are therefore drawn label by label.
    for label_idx in range(n_labels):
        plt.scatter(
            D_tilde[:, label_idx, 0],
            D_tilde[:, label_idx, 1],
            color=colors[label_idx],
            s=marker_size,
            lw=0,
            marker=markers[label_idx],
        )

    # The legend always lists both entries, regardless of D_tilde.shape[1].
    handles = [
        mlines.Line2D([], [], color=colors[key], marker=markers[key],
                      linestyle='None', markersize=10, label=key)
        for key in sorted(colors)
    ]
    plt.legend(handles=handles)
    plt.tight_layout()
    plt.savefig(dataset_name + '_' + embedding +'.pdf', dpi=450)

def plot_cv(df_raw_cv, n_labels):
    """Vectorise statements with the shared ``cv`` and project them with ``method``.

    Parameters
    ----------
    df_raw_cv : pandas.DataFrame
        Must provide a ``'statement'`` column (passed to ``cv.fit_transform``)
        and a ``'label'`` column.
    n_labels : int
        Size of the second axis of the returned embedding. It must divide the
        number of rows of ``df_raw_cv``.

    Returns
    -------
    tuple
        ``(df_cv, D_tilde2)``: ``df_cv`` is the dense count matrix with the
        ``'label'`` column appended; ``D_tilde2`` is the output of
        ``method.fit_transform`` reshaped to
        ``(n_rows // n_labels, n_labels, 2)``.

    Raises
    ------
    ValueError
        If ``n_labels`` is not positive or does not divide the number of rows.
    """
    matriz = cv.fit_transform(df_raw_cv['statement'])
    df_cv = pd.DataFrame(matriz.toarray())
    # Assign positionally: df_cv has a fresh 0..n-1 index, so an index-aligned
    # assignment would yield NaN whenever df_raw_cv carries a different index.
    df_cv['label'] = df_raw_cv['label'].to_numpy()

    # Replace infinities before projecting.
    df_cv[df_cv == np.inf] = 0

    # NOTE(review): the 'label' column is part of the matrix handed to the
    # projector below - confirm that this is intended.
    D_tilde = method.fit_transform(df_cv)

    n_points = D_tilde.shape[0]
    if n_labels <= 0 or n_points % n_labels != 0:
        raise ValueError(
            'n_labels=%r must be a positive divisor of the number of rows (%d)'
            % (n_labels, n_points)
        )
    # NOTE(review): this reshape groups consecutive rows, so the second axis is
    # the row position modulo n_labels, not the value stored in 'label' -
    # confirm before reading the plot colours as classes.
    D_tilde2 = D_tilde.reshape(n_points // n_labels, n_labels, 2)
    return df_cv,D_tilde2

In [9]:
%%capture
# Generate the plots for each dataset.
# plot_cv fits `cv` on the full 'statement' column itself, so no separate
# pre-fit on the first statement is needed here.
df_raw_liar = liar_val[['statement','label']].copy() # LIAR dataset
D_tilde2_liar = plot_cv(df_raw_liar,6)[1]
plot_diversity_6(D_tilde2_liar,'liar',6,'cv')

df_raw_senti = senti_val[['statement','label']].copy() # senti dataset
D_tilde2_senti = plot_cv(df_raw_senti,2)[1]
plot_diversity_2(D_tilde2_senti,'senti',2,'cv')

Referência:

[1] Upadhayay, B., & Behzadan, V. (2020, November). Sentimental liar: Extended corpus and deep learning models for fake claim classification. In 2020 IEEE International Conference on Intelligence and Security Informatics (ISI) (pp. 1-6). IEEE.