# Lista con métricas por texto (clustering, eficiencia global, centralidad, etc.)

In [1]:
!pip install text2graphapi #Instalar la API

Note: you may need to restart the kernel to use updated packages.




In [2]:
from text2graphapi.src.Cooccurrence import Cooccurrence #Se importa el grafo tipo co-ocurrencia

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


2024-06-05 19:34:47,750; - INFO; - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-06-05 19:34:47,751; - INFO; - NumExpr defaulting to 8 threads.
2024-06-05 19:34:48,300; - DEBUG; - Import libraries/modules from :PROD


In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from PIL import Image
import matplotlib.pyplot as plt

## Subtask_1

In [4]:
# Path to the original Subtask 1 dataset in JSON Lines format
# (placeholder text: replace with the real path before running).
ruta_1 = 'se agrega la ruta del dataset original .jsonl'

# lines=True reads the file as one JSON object per line.
df = pd.read_json(ruta_1, lines=True)

# Uncomment to work with only the first 50,000 rows of the dataset.
#df = df.iloc[:50000]

# Keep a CSV copy of the raw Subtask 1 data. It gets its own file name because the
# Subtask 2 loading cell exports to 'data-2.csv' and would overwrite this copy.
df.to_csv('data-1.csv', index=False)
df.head()

Unnamed: 0,id,text,label
0,15442,Es un tribunal de suprema instanza que forma p...,generated
1,26409,Aixó de tindrer la platgeta tant prou era un p...,human
2,12777,Doug mcadam es un científico estadista estadou...,generated
3,63467,"El president de la generalitat, carles puigdem...",generated
4,41050,"Traídas de francia, tan estrafalarias que una ...",generated


In [5]:
# Split the rows by the value of the 'label' column: 'human' vs 'generated'.
df_human = df[df['label'] == 'human']
df_generated = df[df['label'] == 'generated']

In [6]:
# Keep only the 'id' and 'text' columns, rename 'text' to 'doc', and turn each
# frame into a list of per-row dicts (orient='records').
df_human = df_human[['id', 'text']].rename(columns={'text': 'doc'})
doc_h = df_human.to_dict(orient='records')

df_generated = df_generated[['id', 'text']].rename(columns={'text': 'doc'})
doc_g = df_generated.to_dict(orient='records')

In [7]:
# Report how many document records each class contains.
print('documentos-humano: ',len(doc_h), '\n','documentos-máquina: ',len(doc_g))

documentos-humano:  23239 
 documentos-máquina:  26761


In [8]:
# Define the co-occurrence graph builder: graph type 'DiGraph', language 'sp',
# apply_prep turned off, a window size of 3 and 'networkx' as output format.
coocc_graph = Cooccurrence(graph_type = 'DiGraph',
                                   language = 'sp',
                                   apply_prep = False,
                                   window_size= 3,
                                   output_format = 'networkx')

2024-06-05 19:36:17,555; - INFO; - Has already installed spacy model es_core_news_sm


In [9]:
# Transform the human and the generated document records separately.
coocc_graph_human = coocc_graph.transform(doc_h)
coocc_graph_generated = coocc_graph.transform(doc_g)

2024-06-05 19:36:24,358; - INFO; - Init transformations: Text to Co-Ocurrence Graph
2024-06-05 19:36:24,358; - INFO; - Transforming 23239 text documents...
2024-06-05 19:38:24,047; - INFO; - Done transformations
2024-06-05 19:38:24,758; - INFO; - Init transformations: Text to Co-Ocurrence Graph
2024-06-05 19:38:24,759; - INFO; - Transforming 26761 text documents...
2024-06-05 19:40:44,084; - INFO; - Done transformations


Se hacen 2 listas para guardar los grafos

In [10]:
# Keep only the object stored under the 'graph' key of each transform result.
# A comprehension avoids leaving loop variables behind in the notebook namespace.
list_graph_human = [resultado['graph'] for resultado in coocc_graph_human]

In [11]:
# Keep only the object stored under the 'graph' key of each transform result.
# A comprehension avoids leaving loop variables behind in the notebook namespace.
list_graph_generated = [resultado['graph'] for resultado in coocc_graph_generated]

## Comienzan las métricas de la paquetería Networkx

In [12]:
def clustering(list_graph):
    """Compute the average clustering coefficient of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.average_clustering``.

    Returns
    -------
    list
        One ``nx.average_clustering`` result per graph, in input order.
    """
    return [nx.average_clustering(g) for g in list_graph]

In [13]:
# Apply clustering() to the human and the generated graph lists.
clh = clustering(list_graph_human)
clg = clustering(list_graph_generated)

In [14]:
# Peek at the first five values computed for the human graphs.
print(clh[:5])

[0.34888888888888886, 0.3397842897842898, 0.3611272678878667, 0.33643752427015106, 0.3145035427270385]


In [15]:
def efficiency(list_graph):
    """Compute the global efficiency of every graph.

    Each graph is converted with ``to_undirected()`` before being handed to
    ``nx.global_efficiency``.

    Parameters
    ----------
    list_graph : iterable
        Graphs exposing ``to_undirected()``.

    Returns
    -------
    list
        One global-efficiency value per graph, in input order.
    """
    return [nx.global_efficiency(g.to_undirected()) for g in list_graph]

In [16]:
# Apply efficiency() to the human and the generated graph lists.
eh = efficiency(list_graph_human)
eg = efficiency(list_graph_generated)

In [17]:
def closeness(list_graph):
    """Compute the mean closeness centrality of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.closeness_centrality``.

    Returns
    -------
    list
        For each graph, the arithmetic mean of its per-node closeness
        centralities; ``np.nan`` when the graph yields no centrality values.
    """
    list_closeness = []
    for grafo in list_graph:
        valores = list(nx.closeness_centrality(grafo).values())
        # No nodes means nothing to average: record NaN instead of letting the
        # division raise ZeroDivisionError and abort the whole batch.
        media = sum(valores) / len(valores) if valores else np.nan
        list_closeness.append(media)
    return list_closeness

In [18]:
# Apply closeness() to the human and the generated graph lists.
ch = closeness(list_graph_human)
cg = closeness(list_graph_generated)

In [19]:
# Peek at the first five values computed for the human graphs.
print(ch[:5])

[0.25404428904428905, 0.31705533547298237, 0.35152603855421094, 0.36422087769920386, 0.3277932349479995]


In [20]:
def degree(list_graph):
    """Compute the mean node degree of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs whose ``degree()`` yields ``(node, degree)`` pairs.

    Returns
    -------
    list
        One ``np.mean`` of the node degrees per graph, in input order.
    """
    return [
        np.mean([grado for _nodo, grado in g.degree()])
        for g in list_graph
    ]

In [21]:
# Apply degree() to the human and the generated graph lists.
dh = degree(list_graph_human)
dg = degree(list_graph_generated)

In [22]:
def degree_cent(list_graph):
    """Compute the mean degree centrality of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.degree_centrality``.

    Returns
    -------
    list
        One ``np.mean`` of the per-node degree centralities per graph.
    """
    return [
        np.mean(list(nx.degree_centrality(g).values()))
        for g in list_graph
    ]

In [23]:
# Apply degree_cent() to the human and the generated graph lists.
dc_h = degree_cent(list_graph_human)
dc_g = degree_cent(list_graph_generated)

In [24]:
def betweenness(list_graph):
    """Compute the mean betweenness centrality of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.betweenness_centrality``.

    Returns
    -------
    list
        One ``np.mean`` of the per-node betweenness centralities per graph.
    """
    return [
        np.mean(list(nx.betweenness_centrality(g).values()))
        for g in list_graph
    ]

In [25]:
# Apply betweenness() to the human and the generated graph lists.
bh = betweenness(list_graph_human)
bg = betweenness(list_graph_generated)

In [26]:
def neighbor_degree(list_graph):
    """Compute the mean of the average neighbor degree over each graph's nodes.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.average_neighbor_degree``.

    Returns
    -------
    list
        One ``np.mean`` of the per-node average neighbor degrees per graph.
    """
    resultados = []
    for g in list_graph:
        por_nodo = nx.average_neighbor_degree(g)
        resultados.append(np.mean(list(por_nodo.values())))
    return resultados

In [27]:
# Apply neighbor_degree() to the human and the generated graph lists.
nh = neighbor_degree(list_graph_human)
ng = neighbor_degree(list_graph_generated)

In [28]:
def assortativity(list_graph):
    """Compute the degree assortativity (Pearson correlation) of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.degree_pearson_correlation_coefficient``.

    Returns
    -------
    list
        One coefficient per graph; ``np.nan`` for graphs on which the
        computation raises an error.
    """
    list_assortativity = []
    for grafo in list_graph:
        try:
            assort = nx.degree_pearson_correlation_coefficient(grafo)
        except Exception:
            # Best effort: a graph that cannot be scored is recorded as NaN.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt and
            # SystemExit working, so a long run can still be interrupted.
            assort = np.nan
        list_assortativity.append(assort)
    return list_assortativity

In [29]:
# Apply assortativity() to the human and the generated graph lists.
as_h = assortativity(list_graph_human)
as_g = assortativity(list_graph_generated)

In [30]:
def transitivity(list_graph):
    """Compute the transitivity of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.transitivity``.

    Returns
    -------
    list
        One ``nx.transitivity`` result per graph, in input order.
    """
    return [nx.transitivity(g) for g in list_graph]

In [31]:
# Apply transitivity() to the human and the generated graph lists.
th = transitivity(list_graph_human)
tg = transitivity(list_graph_generated)

In [32]:
def number_components(list_graph):
    """Count the connected components of every graph.

    Each graph is converted with ``to_undirected()`` before being handed to
    ``nx.number_connected_components``.

    Parameters
    ----------
    list_graph : iterable
        Graphs exposing ``to_undirected()``.

    Returns
    -------
    list
        One component count per graph, in input order.
    """
    return [
        nx.number_connected_components(g.to_undirected())
        for g in list_graph
    ]

In [33]:
# Apply number_components() to the human and the generated graph lists.
nc_h = number_components(list_graph_human)
nc_g = number_components(list_graph_generated)

In [34]:
def number_strong_comp(list_graph):
    """Count the strongly connected components of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.number_strongly_connected_components``.

    Returns
    -------
    list
        One strongly-connected-component count per graph, in input order.
    """
    return [nx.number_strongly_connected_components(g) for g in list_graph]

In [35]:
# Apply number_strong_comp() to the human and the generated graph lists.
nsc_h = number_strong_comp(list_graph_human)
nsc_g = number_strong_comp(list_graph_generated)

In [36]:
def entropy(cent_measure):
    """Shannon entropy (base 2) of a sequence of values normalised to sum to 1.

    Parameters
    ----------
    cent_measure : array-like of numbers
        Values to normalise into a distribution (each value divided by the total).

    Returns
    -------
    numpy.float64
        ``sum(p * log2(1 / p))`` over the normalised values. Terms that evaluate
        to NaN (zero entries, or every entry when the total is zero) are skipped
        by ``np.nansum``, so such inputs contribute 0.
    """
    m = np.asarray(cent_measure, float)
    # Zero entries give 1/0 and 0*inf, and a zero total gives 0/0. Those NaN/inf
    # intermediates are expected and dropped by nansum, so silence the
    # floating-point warnings they would otherwise emit on every such call.
    with np.errstate(divide="ignore", invalid="ignore"):
        dist = m / m.sum()
        ent = np.nansum(dist * np.log2(1.0 / dist))
    return ent

In [37]:
def entropy_centr(list_graph):
    """Compute the entropy of the degree-centrality values of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.degree_centrality``.

    Returns
    -------
    list
        For each graph, ``entropy`` applied to the list of its per-node
        degree centralities.
    """
    return [
        entropy(list(nx.degree_centrality(g).values()))
        for g in list_graph
    ]

In [38]:
# Apply entropy_centr() to the human and the generated graph lists.
ect_h = entropy_centr(list_graph_human)
ect_g = entropy_centr(list_graph_generated)

In [39]:
def entropy_clos(list_graph):
    """Compute the entropy of the closeness-centrality values of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.closeness_centrality``.

    Returns
    -------
    list
        For each graph, ``entropy`` applied to the list of its per-node
        closeness centralities.
    """
    return [
        entropy(list(nx.closeness_centrality(g).values()))
        for g in list_graph
    ]

In [40]:
# Apply entropy_clos() to the human and the generated graph lists.
ecl_h = entropy_clos(list_graph_human)
ecl_g = entropy_clos(list_graph_generated)

In [41]:
def sparseness(list_graph):
    """Compute the fraction of non-zero adjacency-matrix entries of every graph.

    Parameters
    ----------
    list_graph : iterable
        Graphs accepted by ``nx.adjacency_matrix``.

    Returns
    -------
    list
        For each graph, (number of non-zero entries) / (rows * columns) of its
        adjacency matrix.
    """
    list_sparseness = []
    for grafo in list_graph:
        # Count directly on the sparse adjacency matrix: densifying it would
        # allocate an n x n array per graph only to count its non-zeros.
        mat = nx.adjacency_matrix(grafo)
        n_filas, n_columnas = mat.shape
        list_sparseness.append(mat.count_nonzero() / (n_filas * n_columnas))
    return list_sparseness

In [42]:
# Apply sparseness() to the human and the generated graph lists.
sh = sparseness(list_graph_human)
sg = sparseness(list_graph_generated)

### Humano

In [44]:
# 'label' values of the rows whose label is 'human', selected in a single .loc call.
df_label_h = df.loc[df['label'] == 'human', 'label']

df_label_h.head()

1     human
7     human
8     human
9     human
11    human
Name: label, dtype: object

In [45]:
# Plain Python list of the human labels; its length is printed as a sanity check.
label_list_h = df_label_h.tolist()
print(len(label_list_h))

23239


In [46]:
# Plain Python list of the human document ids; its length is printed as a sanity check.
id_list_h = df_human['id'].tolist()
print(len(id_list_h))

23239


In [48]:
# Build a new DataFrame holding the per-text metrics of the human documents.
# 'number_components' (nc_h) is included so that every metric computed above
# ends up in the table.
df_h = pd.DataFrame({
    'id': id_list_h,
    'label': label_list_h,
    'clustering': clh,
    'global_efficiency': eh,
    'closeness': ch,
    'degree': dh,
    'degree_centrality': dc_h,
    'betweenness': bh,
    'neighbor_degree': nh,
    'assortativity': as_h,
    'transitivity': th,
    'number_components': nc_h,
    'number_strong_comp': nsc_h,
    'entropy_centr': ect_h,
    'entropy_clos': ecl_h,
    'sparseness': sh,
})
df_h.head()

Unnamed: 0,id,label,clustering,global_efficiency,closeness,degree,degree_centrality,betweenness,neighbor_degree,assortativity,transitivity,number_components,number_strong_comp,entropy_centr,entropy_clos,sparseness
0,26409,human,0.348889,0.615238,0.254044,5.2,0.371429,0.043956,2.211111,-0.073394,0.5,1,15,3.871081,3.732006,0.173333
1,100408,human,0.339784,0.624728,0.317055,6.0,0.352941,0.062296,3.175926,-0.113348,0.459016,1,7,4.10553,3.950981,0.166667
2,55864,human,0.361127,0.465045,0.351526,10.198953,0.053679,0.010018,15.982268,-0.174552,0.138687,1,1,6.985849,7.564889,0.026699
3,68184,human,0.336438,0.536043,0.364221,9.056604,0.174165,0.03531,6.459811,-0.129706,0.256019,1,1,5.498768,5.713314,0.08544
4,123641,human,0.314504,0.555026,0.327793,7.055556,0.201587,0.050747,4.654233,-0.133673,0.3,1,4,5.026535,5.059361,0.097994


### Generado

In [49]:
# Plain Python list of the generated document ids; its length is printed as a sanity check.
id_list_g = df_generated['id'].tolist()
print(len(id_list_g))

26761


In [50]:
# 'label' values of the rows whose label is 'generated', selected in a single .loc call.
df_label_g = df.loc[df['label'] == 'generated', 'label']

df_label_g.head()

0    generated
2    generated
3    generated
4    generated
5    generated
Name: label, dtype: object

In [51]:
# Plain Python list of the generated labels; its length is printed as a sanity check.
label_list_g = df_label_g.tolist()
print(len(label_list_g))

26761


In [52]:
# Build the DataFrame holding the per-text metrics of the generated documents.
# 'number_components' (nc_g) is included so that every metric computed above
# ends up in the table.
df_g = pd.DataFrame({
    'id': id_list_g,
    'label': label_list_g,
    'clustering': clg,
    'global_efficiency': eg,
    'closeness': cg,
    'degree': dg,
    'degree_centrality': dc_g,
    'betweenness': bg,
    'neighbor_degree': ng,
    'assortativity': as_g,
    'transitivity': tg,
    'number_components': nc_g,
    'number_strong_comp': nsc_g,
    'entropy_centr': ect_g,
    'entropy_clos': ecl_g,
    'sparseness': sg,
})
df_g.head()

Unnamed: 0,id,label,clustering,global_efficiency,closeness,degree,degree_centrality,betweenness,neighbor_degree,assortativity,transitivity,number_components,number_strong_comp,entropy_centr,entropy_clos,sparseness
0,15442,generated,0.391811,0.475953,0.345198,10.0,0.077519,0.015013,13.370564,-0.178036,0.174899,1,2,6.484538,7.006989,0.038462
1,12777,generated,0.339836,0.459357,0.314912,9.902913,0.097087,0.020272,8.167137,-0.027979,0.223178,1,5,6.348321,6.669307,0.048072
2,63467,generated,0.352597,0.476843,0.353801,10.255034,0.069291,0.012751,13.198701,-0.172325,0.159559,1,1,6.69761,7.206485,0.034413
3,41050,generated,0.335001,0.513095,0.3315,8.607143,0.156494,0.03388,6.169488,-0.029979,0.25832,1,4,5.558803,5.763531,0.076849
4,91814,generated,0.297962,0.495914,0.297067,7.06383,0.153562,0.05461,4.203014,-0.11493,0.318352,1,1,5.448504,5.535447,0.075147


In [53]:
# Stack the human and the generated metric tables into one Subtask 1 frame.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating
# the row labels that df_h and df_g each start from 0.
df_subtask_1_50k = pd.concat([df_h, df_g], ignore_index=True)

df_subtask_1_50k

Unnamed: 0,id,label,clustering,global_efficiency,closeness,degree,degree_centrality,betweenness,neighbor_degree,assortativity,transitivity,number_components,number_strong_comp,entropy_centr,entropy_clos,sparseness
0,26409,human,0.348889,0.615238,0.254044,5.200000,0.371429,0.043956,2.211111,-0.073394,0.500000,1,15,3.871081,3.732006,0.173333
1,100408,human,0.339784,0.624728,0.317055,6.000000,0.352941,0.062296,3.175926,-0.113348,0.459016,1,7,4.105530,3.950981,0.166667
2,55864,human,0.361127,0.465045,0.351526,10.198953,0.053679,0.010018,15.982268,-0.174552,0.138687,1,1,6.985849,7.564889,0.026699
3,68184,human,0.336438,0.536043,0.364221,9.056604,0.174165,0.035310,6.459811,-0.129706,0.256019,1,1,5.498768,5.713314,0.085440
4,123641,human,0.314504,0.555026,0.327793,7.055556,0.201587,0.050747,4.654233,-0.133673,0.300000,1,4,5.026535,5.059361,0.097994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26756,30240,generated,0.408820,0.548174,0.329263,8.368421,0.226174,0.034890,6.218241,-0.104354,0.307592,1,6,4.984726,5.099313,0.110111
26757,144408,generated,0.329081,0.453863,0.337580,10.486772,0.055781,0.010797,13.094265,-0.141179,0.161450,1,1,7.043000,7.548235,0.027743
26758,60641,generated,0.404915,0.501605,0.388964,12.792453,0.080965,0.010262,19.268053,-0.245768,0.174887,1,1,6.649867,7.300717,0.040228
26759,108938,generated,0.360972,0.563333,0.275443,6.538462,0.261538,0.059615,3.555128,-0.148245,0.409091,1,5,4.623933,4.627867,0.125740


In [54]:
# Export the Subtask 1 metrics without the index column.
# The file name is a placeholder: replace it with the desired output name.
df_subtask_1_50k.to_csv('nombre del dataset obtenido.csv', index=False)

## Subtask_2

Para la subtask 2 se hace lo mismo, pero los textos se distinguen por las etiquetas A, B, C, D, E y F, que corresponden a distintas máquinas generadoras de texto.

In [57]:
# Path to the original Subtask 2 dataset in JSON Lines format
# (placeholder text: replace with the real path before running).
ruta_2 = 'se agrega la ruta del dataset original .jsonl'

# lines=True reads the file as one JSON object per line.
df_2 = pd.read_json(ruta_2, lines=True)

# Optional slice keeping the rows from position 29378 onward.
# NOTE(review): the saved output below starts at index 29378, so it looks like it
# was produced with this slice active — confirm before re-running.
#df_2 = df_2.iloc[29378:]

# Keep a CSV copy of the loaded data, without the index column.
df_2.to_csv('data-2.csv', index=False)
df_2.head()

Unnamed: 0,id,text,label
29378,69729,Colville (washington) euskal herriko ameriketa...,D
29379,3509,Título: asentamiento munroe. el asentamiento m...,F
29380,3756,O capitalismo é um sistema econômico que se ca...,C
29381,30644,¡hola @juanes! los millennials somos una gener...,E
29382,145660,"T""estimo, oh bella flor, amb tota la meva ànim...",D


In [58]:
# Keep only the 'id' and 'text' columns, rename 'text' to 'doc', and turn the
# frame into a list of per-row dicts (orient='records').
df_ABCDEF = df_2[['id', 'text']].rename(columns={'text': 'doc'})
doc_ABCDEF = df_ABCDEF.to_dict(orient='records')

In [59]:
# Report how many Subtask 2 document records there are.
print('documentos-generados (A, B, C, D, E, F): ',len(doc_ABCDEF))

documentos-generados (A, B, C, D, E, F):  29376


In [60]:
# Transform the Subtask 2 records with the same coocc_graph builder used for Subtask 1.
coocc_graph_ABCDEF = coocc_graph.transform(doc_ABCDEF)

2024-05-15 22:02:24,943; - INFO; - Init transformations: Text to Co-Ocurrence Graph
2024-05-15 22:02:24,948; - INFO; - Transforming 29376 text documents...
2024-05-15 22:10:18,136; - INFO; - Done transformations


In [61]:
# Keep only the object stored under the 'graph' key of each transform result.
# A comprehension avoids leaving loop variables behind in the notebook namespace.
list_graph_ABCDEF = [resultado['graph'] for resultado in coocc_graph_ABCDEF]

In [62]:
# Apply clustering() to the Subtask 2 graphs.
cl_ABCDEF = clustering(list_graph_ABCDEF)

In [63]:
# Apply efficiency() to the Subtask 2 graphs.
e_ABCDEF = efficiency(list_graph_ABCDEF)

In [64]:
# Apply closeness() to the Subtask 2 graphs.
c_ABCDEF = closeness(list_graph_ABCDEF)

In [65]:
# Apply degree() to the Subtask 2 graphs.
d_ABCDEF = degree(list_graph_ABCDEF)

In [66]:
# Apply degree_cent() to the Subtask 2 graphs.
dc_ABCDEF = degree_cent(list_graph_ABCDEF)

In [67]:
# Apply betweenness() to the Subtask 2 graphs.
b_ABCDEF = betweenness(list_graph_ABCDEF)

In [68]:
# Apply neighbor_degree() to the Subtask 2 graphs.
n_ABCDEF = neighbor_degree(list_graph_ABCDEF)

In [69]:
# Apply assortativity() to the Subtask 2 graphs.
as_ABCDEF = assortativity(list_graph_ABCDEF)

In [70]:
# Apply transitivity() to the Subtask 2 graphs.
t_ABCDEF = transitivity(list_graph_ABCDEF)

In [71]:
# Apply number_components() to the Subtask 2 graphs.
nc_ABCDEF = number_components(list_graph_ABCDEF)

In [72]:
# Apply number_strong_comp() to the Subtask 2 graphs.
nsc_ABCDEF = number_strong_comp(list_graph_ABCDEF)

In [73]:
# Apply entropy_centr() to the Subtask 2 graphs.
ect_ABCDEF = entropy_centr(list_graph_ABCDEF)

In [75]:
# Apply entropy_clos() to the Subtask 2 graphs.
# NOTE(review): the variable is spelled 'ABDCEF' (D and C transposed), unlike its
# siblings; the cell that builds df_subtask_2 reads it under this same spelling,
# so any rename has to change both places together.
ecl_ABDCEF = entropy_clos(list_graph_ABCDEF)

In [76]:
# Apply sparseness() to the Subtask 2 graphs.
s_ABCDEF = sparseness(list_graph_ABCDEF)

In [77]:
# Plain Python list of the Subtask 2 document ids; its length is printed as a sanity check.
id_list_ABCDEF = df_ABCDEF['id'].tolist()
print(len(id_list_ABCDEF))

29376


In [78]:
# Plain Python list of the Subtask 2 labels; its length is printed as a sanity check.
label_list_ABCDEF = df_2['label'].tolist()
print(len(label_list_ABCDEF))

29376


In [81]:
# Build the DataFrame holding the per-text metrics of the Subtask 2 documents.
# 'number_components' (nc_ABCDEF) is included so that every metric computed
# above ends up in the table.
df_subtask_2 = pd.DataFrame({
    'id': id_list_ABCDEF,
    'label': label_list_ABCDEF,
    'clustering': cl_ABCDEF,
    'global_efficiency': e_ABCDEF,
    'closeness': c_ABCDEF,
    'degree': d_ABCDEF,
    'degree_centrality': dc_ABCDEF,
    'betweenness': b_ABCDEF,
    'neighbor_degree': n_ABCDEF,
    'assortativity': as_ABCDEF,
    'transitivity': t_ABCDEF,
    'number_components': nc_ABCDEF,
    'number_strong_comp': nsc_ABCDEF,
    'entropy_centr': ect_ABCDEF,
    # Spelled 'ABDCEF' (sic) to match the name bound by the entropy_clos cell.
    'entropy_clos': ecl_ABDCEF,
    'sparseness': s_ABCDEF,
})
df_subtask_2.head()

Unnamed: 0,id,label,clustering,global_efficiency,closeness,degree,degree_centrality,betweenness,neighbor_degree,assortativity,transitivity,number_components,number_strong_comp,entropy_centr,entropy_clos,sparseness
0,69729,D,0.339789,0.470458,0.283324,7.462687,0.113071,0.035442,5.74213,-0.118597,0.245791,1,5,5.825955,6.041621,0.055692
1,3509,F,0.376099,0.463288,0.344715,10.473373,0.062342,0.011197,14.296475,-0.145602,0.178945,1,4,6.810658,7.370263,0.030986
2,3756,C,0.384025,0.594862,0.285418,7.217391,0.328063,0.047525,3.736232,0.038849,0.381295,1,5,4.426953,4.40937,0.1569
3,30644,E,0.35641,0.660256,0.27962,5.076923,0.423077,0.041958,2.089744,-0.087912,0.5,1,13,3.661099,3.503301,0.195266
4,145660,D,0.359096,0.49774,0.368632,10.25641,0.088417,0.015355,12.302588,-0.182033,0.181038,1,1,6.408863,6.855572,0.043831


In [82]:
# Export the Subtask 2 metrics to CSV without the index column.
df_subtask_2.to_csv('data-subtask_2.csv', index=False)