# Lista con métricas por texto (clustering, eficiencia global, centralidad, etc.)

In [None]:
!pip install text2graphapi #Instalar la API

In [None]:
from text2graphapi.src.Cooccurrence import Cooccurrence #Se importa el grafo tipo co-ocurrencia

In [None]:
import numpy as np
import pandas as pd
import networkx as nx
from PIL import Image
import matplotlib.pyplot as plt

## Subtask_1

In [None]:
# Placeholder: replace this string with the path to the original .jsonl dataset.
ruta_1 = 'se agrega la ruta del dataset original .jsonl'

# Load the JSON Lines file (lines=True) into a DataFrame.
df = pd.read_json(ruta_1, lines=True)

#df = df.iloc[:50000] - Uncomment this line to work with only part of the dataset

# Dump the loaded data to CSV without the index column, then preview the first rows.
df.to_csv('data-2.csv', index=False)
df.head()

In [None]:
# Split the data by label: 'human' and 'generated'.
df_human = df[df['label'] == 'human']
df_generated = df[df['label'] == 'generated']

In [None]:
# Keep only the 'id' and 'text' columns, rename 'text' to 'doc', and convert
# each table to a list of per-row dicts.
df_human = df_human[['id', 'text']].rename(columns={'text': 'doc'})
doc_h = df_human.to_dict(orient='records')

df_generated = df_generated[['id', 'text']].rename(columns={'text': 'doc'})
doc_g = df_generated.to_dict(orient='records')

In [None]:
# Report how many human and generated documents were collected.
print('documentos-humano: ',len(doc_h), '\n','documentos-máquina: ',len(doc_g))

In [None]:
# Define the co-occurrence graph builder: 'DiGraph' graph type, window size 3,
# apply_prep disabled, and 'networkx' as the output format.
# NOTE(review): language='sp' presumably selects Spanish — confirm against the text2graphapi docs.
coocc_graph = Cooccurrence(graph_type = 'DiGraph',
                                   language = 'sp',
                                   apply_prep = False,
                                   window_size= 3,
                                   output_format = 'networkx')

In [None]:
# Run the graph builder separately on the human and on the generated documents.
coocc_graph_human = coocc_graph.transform(doc_h)
coocc_graph_generated = coocc_graph.transform(doc_g)

Se hacen 2 listas para guardar los grafos

In [None]:
# Collect the 'graph' entry of every transform result for the human documents.
list_graph_human = [resultado['graph'] for resultado in coocc_graph_human]

In [None]:
# Collect the 'graph' entry of every transform result for the generated documents.
list_graph_generated = [resultado['graph'] for resultado in coocc_graph_generated]

## Comienzan las métricas de la paquetería Networkx

In [None]:
def clustering(list_graph):
    """Return the average clustering coefficient of every graph.

    Args:
        list_graph: Iterable of graphs accepted by ``nx.average_clustering``.

    Returns:
        A list with one ``nx.average_clustering`` value per graph, in
        input order.
    """
    return [nx.average_clustering(current) for current in list_graph]

In [None]:
# Clustering values for the human and the generated graph lists.
clh = clustering(list_graph_human)
clg = clustering(list_graph_generated)

In [None]:
# Preview the first five clustering values of the human set.
print(clh[:5])

In [None]:
def efficiency(list_graph):
    """Return the global efficiency of every graph.

    Each graph is converted with ``to_undirected()`` before being passed to
    ``nx.global_efficiency``.

    Args:
        list_graph: Iterable of graphs exposing ``to_undirected()``.

    Returns:
        A list with one global-efficiency value per graph, in input order.
    """
    return [
        nx.global_efficiency(current.to_undirected())
        for current in list_graph
    ]

In [None]:
# Global-efficiency values for the human and the generated graph lists.
eh = efficiency(list_graph_human)
eg = efficiency(list_graph_generated)

In [None]:
def closeness(list_graph):
    """Return the mean closeness centrality of every graph.

    Args:
        list_graph: Iterable of graphs accepted by
            ``nx.closeness_centrality``.

    Returns:
        A list with, for each graph, the arithmetic mean of its per-node
        closeness centrality. A graph that yields no centrality values
        (no nodes) contributes ``np.nan`` instead of raising
        ``ZeroDivisionError``.
    """
    list_closeness = []
    for grafo in list_graph:
        scores = nx.closeness_centrality(grafo)
        if scores:
            mean_closeness = sum(scores.values()) / len(scores)
        else:
            # Nothing to average over: report a missing value rather than
            # dividing by zero and aborting the whole run.
            mean_closeness = np.nan
        list_closeness.append(mean_closeness)
    return list_closeness

In [None]:
# Mean closeness values for the human and the generated graph lists.
ch = closeness(list_graph_human)
cg = closeness(list_graph_generated)

In [None]:
# Preview the first five closeness values of the human set.
print(ch[:5])

In [None]:
def degree(list_graph):
    """Return the mean node degree of every graph.

    Args:
        list_graph: Iterable of objects whose ``degree()`` result can be
            turned into a dict mapping node to degree.

    Returns:
        A list with one ``np.mean`` of the node degrees per graph, in
        input order.
    """
    return [
        np.mean(list(dict(current.degree()).values()))
        for current in list_graph
    ]

In [None]:
# Mean degree values for the human and the generated graph lists.
dh = degree(list_graph_human)
dg = degree(list_graph_generated)

In [None]:
def degree_cent(list_graph):
    """Return the mean degree centrality of every graph.

    Args:
        list_graph: Iterable of graphs accepted by ``nx.degree_centrality``.

    Returns:
        A list with one ``np.mean`` of the per-node degree centralities per
        graph, in input order.
    """
    return [
        np.mean(list(nx.degree_centrality(current).values()))
        for current in list_graph
    ]

In [None]:
# Mean degree-centrality values for the human and the generated graph lists.
dc_h = degree_cent(list_graph_human)
dc_g = degree_cent(list_graph_generated)

In [None]:
def betweenness(list_graph):
    """Return the mean betweenness centrality of every graph.

    Args:
        list_graph: Iterable of graphs accepted by
            ``nx.betweenness_centrality``.

    Returns:
        A list with one ``np.mean`` of the per-node betweenness
        centralities per graph, in input order.
    """
    return [
        np.mean(list(nx.betweenness_centrality(current).values()))
        for current in list_graph
    ]

In [None]:
# Mean betweenness values for the human and the generated graph lists.
bh = betweenness(list_graph_human)
bg = betweenness(list_graph_generated)

In [None]:
def neighbor_degree(list_graph):
    """Return the mean of the average neighbor degree of every graph.

    Args:
        list_graph: Iterable of graphs accepted by
            ``nx.average_neighbor_degree``.

    Returns:
        A list with one ``np.mean`` of the per-node average neighbor
        degrees per graph, in input order.
    """
    return [
        np.mean(list(nx.average_neighbor_degree(current).values()))
        for current in list_graph
    ]

In [None]:
# Mean neighbor-degree values for the human and the generated graph lists.
nh = neighbor_degree(list_graph_human)
ng = neighbor_degree(list_graph_generated)

In [None]:
def assortativity(list_graph):
    """Return the degree assortativity of every graph.

    Args:
        list_graph: Iterable of graphs accepted by
            ``nx.degree_pearson_correlation_coefficient``.

    Returns:
        A list with one coefficient per graph, in input order. A graph for
        which the computation raises an error contributes ``np.nan``.
    """
    list_assortativity = []
    for grafo in list_graph:
        try:
            assort = nx.degree_pearson_correlation_coefficient(grafo)
        except Exception:
            # Best effort: any computation failure becomes a missing value.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate, so a long run can still be aborted.
            assort = np.nan
        list_assortativity.append(assort)
    return list_assortativity

In [None]:
# Assortativity values for the human and the generated graph lists.
as_h = assortativity(list_graph_human)
as_g = assortativity(list_graph_generated)

In [None]:
def transitivity(list_graph):
    """Return the transitivity of every graph.

    Args:
        list_graph: Iterable of graphs accepted by ``nx.transitivity``.

    Returns:
        A list with one ``nx.transitivity`` value per graph, in input order.
    """
    return [nx.transitivity(current) for current in list_graph]

In [None]:
# Transitivity values for the human and the generated graph lists.
th = transitivity(list_graph_human)
tg = transitivity(list_graph_generated)

In [None]:
def number_components(list_graph):
    """Return the number of connected components of every graph.

    Each graph is converted with ``to_undirected()`` before counting.

    Args:
        list_graph: Iterable of graphs exposing ``to_undirected()``.

    Returns:
        A list with one ``nx.number_connected_components`` count per graph,
        in input order.
    """
    return [
        nx.number_connected_components(current.to_undirected())
        for current in list_graph
    ]

In [None]:
# Connected-component counts for the human and the generated graph lists.
# NOTE(review): nc_h and nc_g are not used as columns in the df_h / df_g tables built in this file — confirm whether that is intended.
nc_h = number_components(list_graph_human)
nc_g = number_components(list_graph_generated)

In [None]:
def number_strong_comp(list_graph):
    """Return the number of strongly connected components of every graph.

    Args:
        list_graph: Iterable of graphs accepted by
            ``nx.number_strongly_connected_components``.

    Returns:
        A list with one component count per graph, in input order.
    """
    return [
        nx.number_strongly_connected_components(current)
        for current in list_graph
    ]

In [None]:
# Strongly-connected-component counts for the human and the generated graph lists.
nsc_h = number_strong_comp(list_graph_human)
nsc_g = number_strong_comp(list_graph_generated)

In [None]:
def entropy(cent_measure):
    """Return the base-2 Shannon entropy of a vector of measurements.

    The values are divided by their sum to form a distribution; entries
    with zero probability contribute nothing to the result.

    Args:
        cent_measure: Sequence of numbers convertible to float.

    Returns:
        The entropy in bits. ``0.0`` is returned when the input is empty or
        its sum is zero or not finite, i.e. when no distribution can be
        formed.
    """
    values = np.asarray(cent_measure, float)
    total = values.sum()
    if total == 0 or not np.isfinite(total):
        # No valid distribution; avoid the 0/0 division the normalisation
        # would otherwise perform.
        return np.float64(0.0)
    dist = values / total
    # Drop zero-probability entries up front instead of computing
    # 0 * log2(1/0) and relying on nansum to discard the resulting NaN.
    positive = dist[dist > 0]
    return np.sum(positive * np.log2(1.0 / positive))

In [None]:
def entropy_centr(list_graph):
    """Return the entropy of the degree centralities of every graph.

    Args:
        list_graph: Iterable of graphs accepted by ``nx.degree_centrality``.

    Returns:
        A list with, for each graph, ``entropy`` applied to its per-node
        degree-centrality values, in input order.
    """
    return [
        entropy(list(nx.degree_centrality(current).values()))
        for current in list_graph
    ]

In [None]:
# Degree-centrality entropy values for the human and the generated graph lists.
ect_h = entropy_centr(list_graph_human)
ect_g = entropy_centr(list_graph_generated)

In [None]:
def entropy_clos(list_graph):
    """Return the entropy of the closeness centralities of every graph.

    Args:
        list_graph: Iterable of graphs accepted by
            ``nx.closeness_centrality``.

    Returns:
        A list with, for each graph, ``entropy`` applied to its per-node
        closeness-centrality values, in input order.
    """
    return [
        entropy(list(nx.closeness_centrality(current).values()))
        for current in list_graph
    ]

In [None]:
# Closeness-centrality entropy values for the human and the generated graph lists.
ecl_h = entropy_clos(list_graph_human)
ecl_g = entropy_clos(list_graph_generated)

In [None]:
def sparseness(list_graph):
    """Return the fraction of non-zero adjacency-matrix entries per graph.

    Args:
        list_graph: Iterable of graphs accepted by ``nx.adjacency_matrix``.

    Returns:
        A list with, for each graph, the number of non-zero entries of its
        adjacency matrix divided by the total number of entries, in input
        order.
    """
    list_sparseness = []
    for grafo in list_graph:
        adjacency = nx.adjacency_matrix(grafo)
        # Count on the sparse matrix directly; densifying first would
        # allocate a full n-by-n array only to count its non-zeros.
        num_num = adjacency.count_nonzero()
        num_val = np.prod(adjacency.shape)
        list_sparseness.append(float(num_num) / num_val)
    return list_sparseness

In [None]:
# Adjacency-matrix fill ratios for the human and the generated graph lists.
sh = sparseness(list_graph_human)
sg = sparseness(list_graph_generated)

### Humano

In [None]:
# Label column restricted to the rows whose label is 'human'.
df_label_h = df.loc[df['label'] == 'human', 'label']

df_label_h.head()

In [None]:
# Convert the human labels to a plain list and print its length.
label_list_h = df_label_h.tolist()
print(len(label_list_h))

In [None]:
# Convert the human document ids to a plain list and print its length.
id_list_h = df_human['id'].tolist()
print(len(id_list_h))

In [None]:
# Build a second DataFrame holding the metrics of each human text.
# NOTE(review): nc_h (connected-component counts) is computed in this file
# but is not included as a column here — confirm whether that is intended.
df_h = pd.DataFrame(
    {
        'id': id_list_h,
        'label': label_list_h,
        'clustering': clh,
        'global_efficiency': eh,
        'closeness': ch,
        'degree': dh,
        'degree_centrality': dc_h,
        'betweenness': bh,
        'neighbor_degree': nh,
        'assortativity': as_h,
        'transitivity': th,
        'number_strong_comp': nsc_h,
        'entropy_centr': ect_h,
        'entropy_clos': ecl_h,
        'sparseness': sh,
    }
)
df_h.head()

### Generado

In [None]:
# Convert the generated document ids to a plain list and print its length.
id_list_g = df_generated['id'].tolist()
print(len(id_list_g))

In [None]:
# Label column restricted to the rows whose label is 'generated'.
df_label_g = df.loc[df['label'] == 'generated', 'label']

df_label_g.head()

In [None]:
# Convert the generated labels to a plain list and print its length.
label_list_g = df_label_g.tolist()
print(len(label_list_g))

In [None]:
# Build the DataFrame holding the metrics of each generated text.
# NOTE(review): nc_g (connected-component counts) is computed in this file
# but is not included as a column here — confirm whether that is intended.
df_g = pd.DataFrame(
    {
        'id': id_list_g,
        'label': label_list_g,
        'clustering': clg,
        'global_efficiency': eg,
        'closeness': cg,
        'degree': dg,
        'degree_centrality': dc_g,
        'betweenness': bg,
        'neighbor_degree': ng,
        'assortativity': as_g,
        'transitivity': tg,
        'number_strong_comp': nsc_g,
        'entropy_centr': ect_g,
        'entropy_clos': ecl_g,
        'sparseness': sg,
    }
)
df_g.head()

In [None]:
# Concatenate the human and generated metric tables (human rows first) and display the result.
df_subtask_1_50k = pd.concat([df_h, df_g])

df_subtask_1_50k

In [None]:
# Write the combined Subtask 1 metrics to CSV without the index column.
# NOTE(review): the file name looks like a placeholder ("name of the resulting dataset") — replace before running.
df_subtask_1_50k.to_csv('nombre del dataset obtenido.csv', index=False)

## Subtask_2

Para la subtask 2 se hace lo mismo, pero los textos se distinguen por las etiquetas A, B, C, D, E y F, que corresponden a distintas máquinas generadoras de texto.

In [None]:
# Placeholder: replace this string with the path to the original .jsonl dataset.
ruta_2 = 'se agrega la ruta del dataset original .jsonl'

# Load the JSON Lines file (lines=True) into a DataFrame.
df_2 = pd.read_json(ruta_2, lines=True)

#df_2 = df_2.iloc[29378:]

# Dump the loaded data to CSV without the index column, then preview the first rows.
# NOTE(review): 'data-2.csv' is the same file name the Subtask 1 dump uses, so this overwrites it — confirm intended.
df_2.to_csv('data-2.csv', index=False)
df_2.head()

In [None]:
# Keep only 'id' and 'text', rename 'text' to 'doc', and convert the table
# to a list of per-row dicts.
df_ABCDEF = df_2[['id', 'text']].rename(columns={'text': 'doc'})
doc_ABCDEF = df_ABCDEF.to_dict(orient='records')

In [None]:
# Report how many Subtask 2 documents were collected.
print('documentos-generados (A, B, C, D, E, F): ',len(doc_ABCDEF))

In [None]:
# Run the coocc_graph builder on the Subtask 2 documents.
coocc_graph_ABCDEF = coocc_graph.transform(doc_ABCDEF)

In [None]:
# Collect the 'graph' entry of every transform result for the Subtask 2 documents.
list_graph_ABCDEF = [resultado['graph'] for resultado in coocc_graph_ABCDEF]

In [None]:
# Clustering values for the Subtask 2 graphs.
cl_ABCDEF = clustering(list_graph_ABCDEF)

In [None]:
# Global-efficiency values for the Subtask 2 graphs.
e_ABCDEF = efficiency(list_graph_ABCDEF)

In [None]:
# Mean closeness values for the Subtask 2 graphs.
c_ABCDEF = closeness(list_graph_ABCDEF)

In [None]:
# Mean degree values for the Subtask 2 graphs.
d_ABCDEF = degree(list_graph_ABCDEF)

In [None]:
# Mean degree-centrality values for the Subtask 2 graphs.
dc_ABCDEF = degree_cent(list_graph_ABCDEF)

In [None]:
# Mean betweenness values for the Subtask 2 graphs.
b_ABCDEF = betweenness(list_graph_ABCDEF)

In [None]:
# Mean neighbor-degree values for the Subtask 2 graphs.
n_ABCDEF = neighbor_degree(list_graph_ABCDEF)

In [None]:
# Assortativity values for the Subtask 2 graphs.
as_ABCDEF = assortativity(list_graph_ABCDEF)

In [None]:
# Transitivity values for the Subtask 2 graphs.
t_ABCDEF = transitivity(list_graph_ABCDEF)

In [None]:
# Connected-component counts for the Subtask 2 graphs.
# NOTE(review): nc_ABCDEF is not used as a column in the df_subtask_2 table built in this file — confirm whether that is intended.
nc_ABCDEF = number_components(list_graph_ABCDEF)

In [None]:
# Strongly-connected-component counts for the Subtask 2 graphs.
nsc_ABCDEF = number_strong_comp(list_graph_ABCDEF)

In [None]:
# Degree-centrality entropy values for the Subtask 2 graphs.
ect_ABCDEF = entropy_centr(list_graph_ABCDEF)

In [None]:
# Closeness-centrality entropy values for the Subtask 2 graphs.
# NOTE(review): the name is spelled ecl_ABDCEF (D and C swapped relative to the other *_ABCDEF names); the df_subtask_2 cell uses the same spelling, so any rename must change both places together.
ecl_ABDCEF = entropy_clos(list_graph_ABCDEF)

In [None]:
# Adjacency-matrix fill ratios for the Subtask 2 graphs.
s_ABCDEF = sparseness(list_graph_ABCDEF)

In [None]:
# Convert the Subtask 2 document ids to a plain list and print its length.
id_list_ABCDEF = df_ABCDEF['id'].tolist()
print(len(id_list_ABCDEF))

In [None]:
# Convert the Subtask 2 labels to a plain list and print its length.
label_list_ABCDEF = df_2['label'].tolist()
print(len(label_list_ABCDEF))

In [None]:
# Build the DataFrame holding the metrics of each Subtask 2 text.
# NOTE(review): nc_ABCDEF (connected-component counts) is computed in this
# file but is not included as a column here — confirm whether that is intended.
df_subtask_2 = pd.DataFrame(
    {
        'id': id_list_ABCDEF,
        'label': label_list_ABCDEF,
        'clustering': cl_ABCDEF,
        'global_efficiency': e_ABCDEF,
        'closeness': c_ABCDEF,
        'degree': d_ABCDEF,
        'degree_centrality': dc_ABCDEF,
        'betweenness': b_ABCDEF,
        'neighbor_degree': n_ABCDEF,
        'assortativity': as_ABCDEF,
        'transitivity': t_ABCDEF,
        'number_strong_comp': nsc_ABCDEF,
        'entropy_centr': ect_ABCDEF,
        'entropy_clos': ecl_ABDCEF,
        'sparseness': s_ABCDEF,
    }
)
df_subtask_2.head()

In [None]:
# Write the Subtask 2 metrics table to CSV without the index column.
df_subtask_2.to_csv('data-subtask_2.csv', index=False)