In [1]:
import os
import io
import re
import multiprocessing

In [2]:
import neuralcoref
import spacy

print('spacy version:', spacy.__version__)
print('neuralcoref version:', neuralcoref.__version__)

spacy version: 2.1.3
neuralcoref version: 4.0.0


# Reconhecimento de entidades

In [3]:
def read_file(path):
    try:
        with io.open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        try:
            with io.open(path, 'r', encoding='latin-1') as f:
                return f.read()
        except UnicodeDecodeError:
            return "Could not decode"

In [4]:
def extract_paragraphs(folder, file):
    path = os.path.join(folder, file) # name file

    if os.path.isfile(path):
        content = read_file(path)
        print(f"Content of book ... {file}")

        paragraphs = content.split('\n\n')

        return paragraphs
    return []

In [5]:
def new_paragraphs(paragraphs):
    paragraphs_new = []
    contents = []
    chapters = []
    old_idx = 0

    for paragraph in paragraphs:
        lines = paragraph.split('\n')

        if re.match('[IVXLCDM\d]+[\.]*', lines[0].upper().strip()):
            if not contents:
                contents = [" ".join(line.lower().split()) for line in lines]
                
                if not contents:
                    print("Not found content", id_file)
                    exit()
            continue

        if paragraph.lower().strip() in contents:
            if old_idx == 0:
                old_idx = len(paragraphs_new)
            else:
                current_idx = len(paragraphs_new)
                chapter = "\n\n".join(paragraphs_new[old_idx:current_idx])
                chapters.append(chapter)
                old_idx = current_idx
            continue

        if not re.match('.*[\w]+.*', paragraph):
            continue

        paragraphs_new.append(paragraph)

        if old_idx != 0:
            chapter = "\n\n".join(paragraphs_new[old_idx:])
            chapters.append(chapter)
        
    return paragraphs_new

In [6]:
def process_paragraph(paragraph, nlp):
    paragraph = paragraph.replace('\n', ' ')
    paragraph = " ".join(paragraph.split())
    doc = nlp(paragraph)
    doc_coref = doc._.coref_resolved
    doc_coref = nlp(doc_coref)

    entities = [ent.text for ent in doc_coref.ents if ent.label_ == "PERSON"]

    return entities

def extract_entities_from_paragraphs(paragraphs_new, nlp):
    data = []
    geral_entities = []

    paragraphs_to_process = paragraphs_new  # Adjust as needed

    with multiprocessing.Pool() as pool:
        results = pool.starmap(process_paragraph, [(p, nlp) for p in paragraphs_to_process])

    for i, entities in enumerate(results):
        if entities:
            len_words = len(paragraphs_to_process[i].split())
            geral_entities.extend(entities)
            data.append((i, len_words, entities))
            # print(i, len_words, entities)

    return data, geral_entities

In [7]:
def all_entities(folder, file):     
    paragraphs = extract_paragraphs(folder, file)
    paragraphs_new = new_paragraphs(paragraphs)
    
    print('Número de parágrafos:', len(paragraphs))
    print('Número de parágrafos com personagens: ', len(paragraphs_new))     
    
    nlp = spacy.load("en_core_web_sm")
    neuralcoref.add_to_pipe(nlp)
    
    data, geral_entities = extract_entities_from_paragraphs(paragraphs_new, nlp)
    
    return data, geral_entities

# Criação de rede de co-ocorrência

In [8]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
def cooccurrence_matrix(data, entities_list):
    # Inicializa a matriz de coocorrência
    cooccurrence_matrix = np.zeros((len(entities_list), len(entities_list)), dtype=int)

    # Preenche a matriz de coocorrência
    for _, _, paragraph_entities in data:
        for i in range(len(entities_list)):
            if entities_list[i] in paragraph_entities:
                for j in range(i, len(entities_list)):
                    if entities_list[j] in paragraph_entities:
                        cooccurrence_matrix[i][j] += 1
                        if i != j:
                            cooccurrence_matrix[j][i] += 1
    return np.array(cooccurrence_matrix)

In [19]:
def edge_list(entities_list, co_matrix):
    edge_list = []

    for i in range(len(entities_list)):
        for j in range(i + 1, len(entities_list)):
            weight = co_matrix[i][j]
            if weight > 0:
                edge_list.append((entities_list[i], entities_list[j], {'weight': weight}))
    
    return edge_list

In [20]:
def plot_network_degree(G, name_list, folder):
    
    # Atribuir uma cor única a cada nó
    node_colors = range(len(name_list))

    # Identifica os 15 nós com maior grau
    degrees = dict(G.degree())
    top_degree_nodes = sorted(degrees, key=degrees.get, reverse=True)[:15]

    # Determina as arestas que conectam os nós de maior grau
    edges_of_top_degree_nodes = [(u, v) for u, v in G.edges() if u in top_degree_nodes and v in top_degree_nodes]

    # Posicionamento dos nós aleatório
    pos = nx.random_layout(G)

    plt.figure(figsize=(12, 12))

    # Desenha a rede
    nx.draw(G,
            pos=pos,
            node_color=node_colors,
            cmap=plt.get_cmap('OrRd_r'),  # mapa de cores
            node_size=35,  # Tamanho dos nós
            edge_color="#E1E5EA", # "#DBDFEA"  # Cor das arestas
            with_labels=False,  # Não mostra rótulos dos nós
            alpha= 0.7,
           ) 

    # Adiciona as arestas dos nós de maior grau em uma cor diferente
    nx.draw(G,
            pos=pos,
            node_color=node_colors,
            cmap=plt.get_cmap('OrRd_r'),  # mapa de cores
            node_size=35,  # Tamanho dos nós
            edgelist=edges_of_top_degree_nodes,
            edge_color='red',  # Cor das arestas dos nós de maior grau
            width=1.0,  # Largura das arestas
            alpha=0.2)  # Transparência das arestas


    for node in top_degree_nodes:
        x, y = pos[node]
        plt.annotate(node, (x, y), ha='center', va='center', fontsize=10,
                     # fontweight='heavy',
                     color='#461959', #B80D57 #99235C
                     backgroundcolor='white',
                     bbox=dict(boxstyle='round,pad=0.2', edgecolor='#D3CEDF', facecolor='#F9F5F6'))

    plt.title("Rede de co-ocorrência livro: " +  str(i+1), fontsize=16)
    plt.axis('off')
    plt.savefig("graphs/Graph_" + folder + "_" +  str(i+1) + '.png', dpi=300, bbox_inches='tight')
    plt.show()

# Medidas de centralidade

In [12]:
import pandas as pd

def medidas(G):
    
    K = dict(nx.degree_centrality(G)) # Centralidade de grau
    CC = dict(nx.clustering(G)) # Clustering coeficient
    CLC = dict(nx.closeness_centrality(G)) # Closeness centrality
    B = dict(nx.betweenness_centrality(G, endpoints=True)) # Betweenness centrality
    EC = dict(nx.eigenvector_centrality(G)) # Eigenvetor centrality
    PR = dict(nx.pagerank(G, alpha=0.85)) # PageRank
    
    
    df = pd.DataFrame({'K':list(K.values()),
                       'CC':list(CC.values()),
                       'CLC':list(CLC.values()),
                       'B':list(B.values()),
                       'EC':list(EC.values()),
                       'PR':list(PR.values())})
    display(df)
    
    # Vetor com valores médios das medidas de rede de co-ocorrência
    medidas = list(df.mean())
    
    return medidas

In [21]:
def Medidas(folder):
    
    files_ordered = sorted(os.listdir(folder), key=lambda name:int(name.split('_')[0]))
    
    M = list()
    for i in range(len(files_ordered)):

        file = files_ordered[i]
        print('\n')

        data, geral_entities = all_entities(folder, file)

        # Lista das entidades (personagens)
        entities_list = list(set(geral_entities))

        # Matriz de coocorrência
        co_matrix = cooccurrence_matrix(data, entities_list)
        print("Matriz de Co-ocorrência", co_matrix.shape)

        # Lista de arestas
        edgelist = edge_list(entities_list, co_matrix)

        # Rede
        G = nx.Graph()
        G.add_nodes_from(entities_list)
        G.add_edges_from(edgelist)

        print('Personagens (nodes)', G.number_of_nodes())
        print('Ocorrência (edges)', G.number_of_edges())
        
        # Visualização de rede - 15 personagens com maior grau
        # plot_network_degree(G, entities_list, folder, file)

        medidas_rede = medidas(G)
        print('Valores médios das medidas de rede:\n', medidas_rede)

        M.append(medidas_rede) 
        
    return M

In [22]:
medidas_other = Medidas(folder="other")



Content of book ... 1_InTheLandOfCaveAndCliffDwellers_FrederickSchwatka_cleared.txt
Número de parágrafos: 301
Número de parágrafos com personagens:  191
Matriz de Co-ocorrência (74, 74)
Personagens (nodes) 74
Ocorrência (edges) 78


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.041096,1.000000,0.041096,0.001111,1.914859e-11,0.017036
1,0.000000,0.000000,0.000000,0.000000,2.416892e-40,0.002555
2,0.082192,0.666667,0.110241,0.009256,3.488031e-01,0.022500
3,0.013699,0.000000,0.082681,0.004813,8.465884e-02,0.005644
4,0.013699,0.000000,0.091532,0.007775,6.514187e-06,0.006083
...,...,...,...,...,...,...
69,0.027397,1.000000,0.027397,0.000740,1.927869e-17,0.017036
70,0.082192,0.333333,0.131328,0.017216,2.823667e-05,0.037351
71,0.054795,1.000000,0.096461,0.004813,1.873479e-01,0.016054
72,0.027397,1.000000,0.027397,0.000740,1.927869e-17,0.017036


Valores médios das medidas de rede:
 [0.028878193261754906, 0.40067567567567564, 0.059625055724921355, 0.00557351131212664, 0.0443319050197565, 0.013513513513513514]


Content of book ... 2_JudeTheObscure_ThomasHardy_cleared.txt
Número de parágrafos: 3689
Número de parágrafos com personagens:  3375
Matriz de Co-ocorrência (240, 240)
Personagens (nodes) 240
Ocorrência (edges) 326


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.117155,0.132275,0.272209,0.042035,2.504565e-01,0.050249
1,0.016736,1.000000,0.132973,0.004672,1.207040e-03,0.004569
2,0.008368,1.000000,0.188295,0.004672,2.575550e-02,0.001755
3,0.000000,0.000000,0.000000,0.000000,7.907440e-23,0.000790
4,0.000000,0.000000,0.000000,0.000000,7.907440e-23,0.000790
...,...,...,...,...,...,...
235,0.008368,1.000000,0.224939,0.004672,5.691187e-02,0.001629
236,0.004184,0.000000,0.004184,0.000035,4.145776e-17,0.005267
237,0.000000,0.000000,0.000000,0.000000,7.907440e-23,0.000790
238,0.025105,0.600000,0.171922,0.015830,6.076626e-03,0.007078


Valores médios das medidas de rede:
 [0.011366806136680611, 0.38194760572684516, 0.11725293584819303, 0.005031090190609018, 0.03201330925314385, 0.0041666666666666675]


Content of book ... 3_TheGoldenAge_KennethGrahame_cleared.txt
Número de parágrafos: 522
Número de parágrafos com personagens:  425
Matriz de Co-ocorrência (121, 121)
Personagens (nodes) 121
Ocorrência (edges) 213


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.041667,1.000000,0.349091,0.013223,1.553708e-01,0.007811
1,0.100000,0.242424,0.374634,0.123423,1.095934e-01,0.020949
2,0.008333,0.000000,0.279273,0.013223,4.342271e-02,0.002653
3,0.025000,1.000000,0.242271,0.013223,1.438830e-02,0.007147
4,0.033333,1.000000,0.263918,0.013223,1.794878e-02,0.007531
...,...,...,...,...,...,...
116,0.025000,0.333333,0.277256,0.039118,2.704389e-02,0.008064
117,0.033333,1.000000,0.204255,0.013223,4.004824e-03,0.008171
118,0.016667,1.000000,0.276259,0.013223,3.034758e-02,0.003783
119,0.000000,0.000000,0.000000,0.000000,4.819758e-22,0.001408


Valores médios das medidas de rede:
 [0.029338842975206614, 0.6008758236577877, 0.23850200101522245, 0.02002026273250916, 0.056296203958336115, 0.008264462809917357]


Content of book ... 4_TheLostStradivarius_JohnMeadeFalkner_cleared.txt
Número de parágrafos: 361
Número de parágrafos com personagens:  208
Matriz de Co-ocorrência (79, 79)
Personagens (nodes) 79
Ocorrência (edges) 232


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.076923,0.733333,0.423171,0.024180,7.163544e-02,0.009957
1,0.000000,0.000000,0.000000,0.000000,3.601074e-24,0.002053
2,0.128205,0.600000,0.423171,0.024667,9.338895e-02,0.016320
3,0.038462,1.000000,0.330754,0.021746,3.665029e-02,0.006089
4,0.012821,0.000000,0.396905,0.021746,3.612312e-02,0.003110
...,...,...,...,...,...,...
74,0.051282,1.000000,0.411081,0.021746,7.965086e-02,0.006481
75,0.025641,1.000000,0.399662,0.021746,3.925077e-02,0.005409
76,0.000000,0.000000,0.000000,0.000000,3.601074e-24,0.002053
77,0.038462,0.333333,0.405291,0.043168,5.162931e-02,0.007804


Valores médios das medidas de rede:
 [0.07530022719896137, 0.6381295271397522, 0.35277166523467224, 0.029659119388329446, 0.08078913275670614, 0.012658227848101269]


Content of book ... 5_TheBritishBarbarians_GrantAllen_cleared.txt
Número de parágrafos: 412
Número de parágrafos com personagens:  361
Matriz de Co-ocorrência (68, 68)
Personagens (nodes) 68
Ocorrência (edges) 144


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.283582,0.257310,0.481901,0.126722,3.253707e-01,0.058237
1,0.014925,0.000000,0.303807,0.023266,3.102692e-02,0.004156
2,0.014925,0.000000,0.327542,0.023266,4.219375e-02,0.003882
3,0.014925,0.000000,0.026866,0.001317,1.992841e-09,0.010886
4,0.029851,1.000000,0.268752,0.023266,2.057668e-02,0.006957
...,...,...,...,...,...,...
63,0.194030,0.294872,0.427810,0.082822,2.133678e-01,0.037288
64,0.014925,0.000000,0.279502,0.023266,2.137064e-02,0.004114
65,0.074627,1.000000,0.384636,0.023266,1.412822e-01,0.012223
66,0.059701,1.000000,0.352314,0.023266,1.068555e-01,0.009193


Valores médios das medidas de rede:
 [0.06321334503950835, 0.5063739977294617, 0.27908263576344566, 0.03098047823167897, 0.08115640735166518, 0.014705882352941176]


Content of book ... 6_TheSorrowsOfSatan_MarieCorelli_cleared.txt
Número de parágrafos: 2412
Número de parágrafos com personagens:  1990
Matriz de Co-ocorrência (318, 318)
Personagens (nodes) 318
Ocorrência (edges) 643


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.009464,1.0,0.222811,0.003988,3.240295e-02,0.001938
1,0.000000,0.0,0.000000,0.000000,2.640778e-25,0.000594
2,0.000000,0.0,0.000000,0.000000,2.640778e-25,0.000594
3,0.006309,1.0,0.151724,0.003988,5.568476e-04,0.002343
4,0.000000,0.0,0.000000,0.000000,2.640778e-25,0.000594
...,...,...,...,...,...,...
313,0.000000,0.0,0.000000,0.000000,2.640778e-25,0.000594
314,0.000000,0.0,0.000000,0.000000,2.640778e-25,0.000594
315,0.003155,0.0,0.003155,0.000020,1.384528e-19,0.003960
316,0.031546,1.0,0.233421,0.003988,7.233189e-02,0.004252


Valores médios das medidas de rede:
 [0.01275717715215364, 0.4142617949425244, 0.13784827787134105, 0.005102833426731487, 0.02736277280158943, 0.003144654088050314]


Content of book ... 7_RoseOfDutchersCoolly_HamlinGarland_cleared.txt
Número de parágrafos: 2561
Número de parágrafos com personagens:  2263
Matriz de Co-ocorrência (175, 175)
Personagens (nodes) 175
Ocorrência (edges) 247


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.000000,0.000000,0.000000,0.000000,4.763471e-24,0.001154
1,0.000000,0.000000,0.000000,0.000000,4.763471e-24,0.001154
2,0.011494,1.000000,0.224413,0.006502,7.643511e-02,0.002840
3,0.000000,0.000000,0.000000,0.000000,4.763471e-24,0.001154
4,0.005747,0.000000,0.157340,0.006502,5.743534e-03,0.003121
...,...,...,...,...,...,...
170,0.040230,0.809524,0.240716,0.012051,1.493077e-01,0.010064
171,0.011494,1.000000,0.116620,0.006502,4.064710e-04,0.005270
172,0.034483,1.000000,0.194233,0.006502,6.737298e-02,0.006577
173,0.028736,0.400000,0.028736,0.000722,4.398773e-10,0.012342


Valores médios das medidas de rede:
 [0.016223316912972085, 0.40069080171869376, 0.11624990388988954, 0.007364578935022286, 0.04034160033627053, 0.005714285714285714]


Content of book ... 8_YeklATaleOfTheNewYorkGhetto_AbrahamCahan_cleared.txt
Número de parágrafos: 652
Número de parágrafos com personagens:  570
Matriz de Co-ocorrência (90, 90)
Personagens (nodes) 90
Ocorrência (edges) 193


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.033708,1.0,0.309304,0.017478,6.746735e-02,0.004715
1,0.022472,1.0,0.291303,0.017478,3.506410e-02,0.003909
2,0.000000,0.0,0.000000,0.000000,6.814946e-17,0.001942
3,0.022472,1.0,0.289769,0.017478,3.308517e-02,0.004092
4,0.000000,0.0,0.000000,0.000000,6.814946e-17,0.001942
...,...,...,...,...,...,...
85,0.022472,1.0,0.278062,0.017478,1.989104e-02,0.005128
86,0.022472,1.0,0.335708,0.017478,5.459654e-02,0.003858
87,0.033708,1.0,0.346265,0.017478,7.361238e-02,0.004998
88,0.056180,0.7,0.357508,0.019562,8.506997e-02,0.008856


Valores médios das medidas de rede:
 [0.04818976279650437, 0.49552070137729054, 0.26663837341641855, 0.023406852545429322, 0.06786067687136291, 0.011111111111111113]


Content of book ... 9_Madelon_MaryEWilkinsFreeman_cleared.txt
Número de parágrafos: 1988
Número de parágrafos com personagens:  1588
Matriz de Co-ocorrência (135, 135)
Personagens (nodes) 135
Ocorrência (edges) 391


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.014925,1.000000,0.309856,0.012604,1.928269e-02,0.002020
1,0.014925,1.000000,0.351395,0.012604,3.038636e-02,0.001934
2,0.029851,1.000000,0.378848,0.012604,5.715524e-02,0.002672
3,0.014925,1.000000,0.321143,0.012604,1.948175e-02,0.002145
4,0.298507,0.196154,0.507775,0.125560,2.636092e-01,0.061206
...,...,...,...,...,...,...
130,0.014925,1.000000,0.320083,0.012604,2.095393e-02,0.001973
131,0.007463,0.000000,0.007463,0.000111,4.252654e-12,0.008237
132,0.022388,1.000000,0.322210,0.012604,1.809594e-02,0.003368
133,0.044776,0.933333,0.380334,0.012624,8.529076e-02,0.003314


Valores médios das medidas de rede:
 [0.04322830292979546, 0.5565284773989969, 0.30613137010127284, 0.01837888745572549, 0.05352377289883295, 0.007407407407407408]


Content of book ... 10_TheCountryOfThePointedFirs_SarahOrneJewett_cleared.txt
Número de parágrafos: 521
Número de parágrafos com personagens:  391
Matriz de Co-ocorrência (105, 105)
Personagens (nodes) 105
Ocorrência (edges) 191


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.028846,0.666667,0.240741,0.014652,6.459787e-03,0.011852
1,0.009615,0.000000,0.183962,0.014286,6.000157e-04,0.004413
2,0.019231,1.000000,0.317935,0.014286,6.670938e-02,0.003701
3,0.038462,1.000000,0.325000,0.014286,8.686092e-02,0.005982
4,0.048077,0.200000,0.342105,0.106777,5.368971e-02,0.016583
...,...,...,...,...,...,...
100,0.019231,1.000000,0.317935,0.014286,5.695364e-02,0.004570
101,0.019231,1.000000,0.317935,0.014286,6.335663e-02,0.003814
102,0.009615,0.000000,0.195000,0.014286,3.120894e-03,0.003824
103,0.009615,0.000000,0.009615,0.000183,3.049308e-14,0.010455


Valores médios das medidas de rede:
 [0.03498168498168498, 0.5024651070280822, 0.23250836041849193, 0.019066806209663348, 0.06223486306783895, 0.009523809523809526]


Content of book ... 11_TheWellAtTheWorldsEnd_WilliamMorris_cleared.txt
Número de parágrafos: 2782
Número de parágrafos com personagens:  2363
Matriz de Co-ocorrência (373, 373)
Personagens (nodes) 373
Ocorrência (edges) 1154


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.002688,0.000000,0.258967,0.004483,1.488991e-03,0.000733
1,0.016129,1.000000,0.366201,0.004483,2.642815e-02,0.001528
2,0.005376,1.000000,0.363640,0.004483,1.945020e-02,0.000739
3,0.005376,1.000000,0.369322,0.004483,2.517612e-02,0.000731
4,0.005376,1.000000,0.362121,0.004483,1.695022e-02,0.001032
...,...,...,...,...,...,...
368,0.000000,0.000000,0.000000,0.000000,8.284502e-26,0.000460
369,0.013441,1.000000,0.370903,0.004483,3.337877e-02,0.001198
370,0.008065,0.666667,0.367755,0.004537,2.440025e-02,0.001107
371,0.005376,1.000000,0.274844,0.004483,4.145854e-03,0.001067


Valores médios das medidas de rede:
 [0.01663351494710139, 0.5724707233882484, 0.29236434674624184, 0.006464913779638406, 0.03034141535813634, 0.002680965147453083]


Content of book ... 12_AChildOfTheJago_ArthurMorrison_cleared.txt
Número de parágrafos: 830
Número de parágrafos com personagens:  592
Matriz de Co-ocorrência (174, 174)
Personagens (nodes) 174
Ocorrência (edges) 647


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.028902,1.000000,0.308194,0.009966,1.866578e-02,0.003326
1,0.005780,0.000000,0.290957,0.009966,9.049692e-03,0.001311
2,0.005780,0.000000,0.336067,0.009966,1.562575e-02,0.001313
3,0.011561,1.000000,0.294248,0.009966,7.412462e-03,0.002437
4,0.005780,0.000000,0.336067,0.009966,1.562575e-02,0.001313
...,...,...,...,...,...,...
169,0.000000,0.000000,0.000000,0.000000,3.005292e-17,0.000940
170,0.011561,1.000000,0.292923,0.009966,8.128568e-03,0.001821
171,0.017341,1.000000,0.340465,0.009966,2.981340e-02,0.002145
172,0.057803,0.733333,0.401413,0.010449,8.755749e-02,0.005106


Valores médios das medidas de rede:
 [0.042987176931765333, 0.5629957740417804, 0.3083188238604817, 0.015135130594293576, 0.049490766838867314, 0.005747126436781608]


Content of book ... 13_TomSawyerDetective_MarkTwain_cleared.txt
Número de parágrafos: 517
Número de parágrafos com personagens:  457
Matriz de Co-ocorrência (71, 71)
Personagens (nodes) 71
Ocorrência (edges) 183


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.000000,0.000000,0.000000,0.000000,1.067177e-16,0.002575
1,0.300000,0.300000,0.411565,0.074821,3.139012e-01,0.053585
2,0.000000,0.000000,0.000000,0.000000,1.067177e-16,0.002575
3,0.000000,0.000000,0.000000,0.000000,1.067177e-16,0.002575
4,0.014286,0.000000,0.310894,0.022133,3.537963e-02,0.003829
...,...,...,...,...,...,...
66,0.128571,0.500000,0.313147,0.033732,1.276762e-01,0.018155
67,0.028571,1.000000,0.317752,0.022133,5.602805e-02,0.005107
68,0.214286,0.342857,0.389318,0.048023,2.315537e-01,0.030059
69,0.285714,0.310526,0.396461,0.064380,2.855108e-01,0.057211


Valores médios das medidas de rede:
 [0.07364185110663984, 0.4618088782562466, 0.24872178349439916, 0.031569699889477706, 0.0802438798403297, 0.014084507042253521]


Content of book ... 14_InHisSteps_CharlesMSheldon_cleared.txt
Número de parágrafos: 1259
Número de parágrafos com personagens:  1046
Matriz de Co-ocorrência (159, 159)
Personagens (nodes) 159
Ocorrência (edges) 456


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.006329,0.000000,0.248595,0.010748,0.008315,0.001525
1,0.012658,1.000000,0.337275,0.010748,0.025801,0.002075
2,0.259494,0.214634,0.480617,0.117908,0.282486,0.053587
3,0.006329,0.000000,0.238323,0.010748,0.002077,0.001572
4,0.094937,0.552381,0.410491,0.017776,0.162731,0.011510
...,...,...,...,...,...,...
154,0.018987,0.333333,0.293507,0.012273,0.015947,0.003261
155,0.056962,0.694444,0.379435,0.011312,0.104063,0.005130
156,0.025316,0.333333,0.310911,0.011971,0.019811,0.003544
157,0.012658,1.000000,0.263352,0.010748,0.004678,0.003275


Valores médios das medidas de rede:
 [0.036302842130403634, 0.5265451407635013, 0.28193615119943627, 0.016823561397737528, 0.047069975531978, 0.006289308176100628]


Content of book ... 15_HistoryOfTheWarfareOfScienceWithTheologyInChristendom_AndrewDicksonWhite_cleared.txt
Número de parágrafos: 3436
Número de parágrafos com personagens:  1755
Matriz de Co-ocorrência (1366, 1366)
Personagens (nodes) 1366
Ocorrência (edges) 3271


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.002198,1.000000,0.128861,0.000995,7.080909e-05,0.000600
1,0.000733,0.000000,0.000733,0.000001,1.241467e-18,0.000831
2,0.001465,1.000000,0.001465,0.000002,8.154463e-16,0.000831
3,0.005861,0.357143,0.209463,0.001398,3.504569e-02,0.000922
4,0.001465,1.000000,0.122625,0.000995,5.235913e-05,0.000490
...,...,...,...,...,...,...
1361,0.002198,1.000000,0.002344,0.000004,1.179828e-13,0.000867
1362,0.002930,1.000000,0.002930,0.000004,2.890516e-12,0.000831
1363,0.000000,0.000000,0.000000,0.000000,1.894329e-23,0.000125
1364,0.001465,1.000000,0.184637,0.000995,1.021286e-02,0.000370


Valores médios das medidas de rede:
 [0.003508546114695456, 0.5789244285390232, 0.11833264718533619, 0.0017047856139453153, 0.010189588753023178, 0.0007320644216691069]


Content of book ... 16_TheWhirlpool_GeorgeGissing_cleared.txt
Número de parágrafos: 3900
Número de parágrafos com personagens:  3597
Matriz de Co-ocorrência (322, 322)
Personagens (nodes) 322
Ocorrência (edges) 636


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.018692,0.666667,0.269894,0.005175,7.823283e-02,0.003386
1,0.000000,0.000000,0.000000,0.000000,6.181690e-20,0.000595
2,0.006231,1.000000,0.207111,0.004315,1.370461e-02,0.001373
3,0.000000,0.000000,0.000000,0.000000,6.181690e-20,0.000595
4,0.024922,0.892857,0.285302,0.004565,1.151029e-01,0.003661
...,...,...,...,...,...,...
317,0.000000,0.000000,0.000000,0.000000,6.181690e-20,0.000595
318,0.000000,0.000000,0.000000,0.000000,6.181690e-20,0.000595
319,0.003115,0.000000,0.003115,0.000019,2.025616e-15,0.003964
320,0.012461,1.000000,0.226159,0.004315,2.115226e-02,0.002200


Valores médios das medidas de rede:
 [0.012306263423695361, 0.3907688320766662, 0.1605813824248553, 0.006140752857862752, 0.028294404545159024, 0.003105590062111801]


Content of book ... 17_TheBethBook_SarahGrand_cleared.txt
Número de parágrafos: 4706
Número de parágrafos com personagens:  4239
Matriz de Co-ocorrência (433, 433)
Personagens (nodes) 433
Ocorrência (edges) 1144


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.006944,0.666667,0.382347,0.005567,2.577840e-02,0.001078
1,0.013889,1.000000,0.390533,0.003913,5.260564e-02,0.001150
2,0.004630,1.000000,0.384242,0.003913,3.170775e-02,0.001033
3,0.002315,0.000000,0.380938,0.003913,2.381429e-02,0.000515
4,0.011574,1.000000,0.383767,0.003913,3.164836e-02,0.001204
...,...,...,...,...,...,...
428,0.000000,0.000000,0.000000,0.000000,4.625598e-36,0.000379
429,0.018519,1.000000,0.396020,0.003913,6.769365e-02,0.001423
430,0.016204,0.619048,0.396020,0.004710,5.681169e-02,0.002527
431,0.006944,0.666667,0.381876,0.003919,2.732529e-02,0.001373


Valores médios das medidas de rede:
 [0.01223163116927551, 0.5802103382170358, 0.30905449676393726, 0.005602205205915289, 0.030213100751533772, 0.0023094688221709002]


Content of book ... 18_TheSpoilsOfPoynton_HenryJames_cleared__.txt
Número de parágrafos: 973
Número de parágrafos com personagens:  840
Matriz de Co-ocorrência (52, 52)
Personagens (nodes) 52
Ocorrência (edges) 153


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.098039,1.0,0.500834,0.036953,0.1250611,0.007007
1,0.039216,1.0,0.439985,0.036953,0.03503581,0.006602
2,0.058824,1.0,0.480392,0.036953,0.09076964,0.005356
3,0.098039,1.0,0.4904,0.036953,0.1218969,0.006936
4,0.039216,1.0,0.444136,0.036953,0.04435725,0.004676
5,0.411765,0.319048,0.611408,0.107746,0.2920806,0.08798
6,0.490196,0.246667,0.64491,0.141288,0.3174841,0.11774
7,0.039216,1.0,0.444136,0.036953,0.05862847,0.004543
8,0.176471,0.722222,0.528971,0.039945,0.1803391,0.023139
9,0.078431,1.0,0.485345,0.036953,0.1068159,0.006141


Valores médios das medidas de rede:
 [0.11538461538461539, 0.7224147823641751, 0.46888116893185133, 0.053544494720965306, 0.10645253912621938, 0.019230769230769235]


Content of book ... 19_TheBeetle_RichardMarsh_cleared.txt
Número de parágrafos: 3272
Número de parágrafos com personagens:  2801
Matriz de Co-ocorrência (196, 196)
Personagens (nodes) 196
Ocorrência (edges) 129


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.000000,0.0,0.000000,0.000000,2.076988e-34,0.001264
1,0.025641,0.3,0.116334,0.008074,1.420050e-01,0.012097
2,0.000000,0.0,0.000000,0.000000,2.076988e-34,0.001264
3,0.000000,0.0,0.000000,0.000000,2.076988e-34,0.001264
4,0.005128,0.0,0.005128,0.000052,2.283673e-22,0.008428
...,...,...,...,...,...,...
191,0.005128,0.0,0.005128,0.000052,2.283673e-22,0.008428
192,0.010256,1.0,0.104701,0.003663,9.126685e-02,0.006091
193,0.000000,0.0,0.000000,0.000000,2.076988e-34,0.001264
194,0.010256,1.0,0.104701,0.003663,9.126685e-02,0.006091


Valores médios das medidas de rede:
 [0.006750392464678179, 0.16913143247729714, 0.040628431744358456, 0.0029304029304029304, 0.03193196304774772, 0.005102040816326529]


Content of book ... 20_AddedUpon_NephiAnderson_cleared.txt
Número de parágrafos: 1405
Número de parágrafos com personagens:  1247
Matriz de Co-ocorrência (185, 185)
Personagens (nodes) 185
Ocorrência (edges) 229


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.010870,1.000000,0.012228,0.000176,9.589875e-09,0.006552
1,0.010870,1.000000,0.196912,0.005875,5.755728e-02,0.002847
2,0.000000,0.000000,0.000000,0.000000,9.140903e-16,0.000999
3,0.000000,0.000000,0.000000,0.000000,9.140903e-16,0.000999
4,0.103261,0.157895,0.255154,0.091771,2.500914e-01,0.031179
...,...,...,...,...,...,...
180,0.032609,0.200000,0.214814,0.030259,9.555167e-02,0.008194
181,0.010870,1.000000,0.175316,0.005875,5.742916e-02,0.002478
182,0.054348,0.333333,0.218264,0.014926,1.822909e-01,0.019185
183,0.005435,0.000000,0.125225,0.005875,4.849289e-03,0.002142


Valores médios das medidas de rede:
 [0.013454759106933018, 0.3820758995472954, 0.10050731630199997, 0.006659256200971829, 0.03542520659054092, 0.005405405405405406]


Content of book ... 21_ADigitOfTheMoon_FWBain_cleared.txt
Número de parágrafos: 246
Número de parágrafos com personagens:  143
Matriz de Co-ocorrência (65, 65)
Personagens (nodes) 65
Ocorrência (edges) 143


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.000000,0.000000,0.000000,0.000000,8.321321e-18,0.002540
1,0.031250,1.000000,0.041667,0.001923,1.352689e-09,0.012423
2,0.046875,0.666667,0.329590,0.027644,3.643609e-02,0.011918
3,0.000000,0.000000,0.000000,0.000000,8.321321e-18,0.002540
4,0.000000,0.000000,0.000000,0.000000,8.321321e-18,0.002540
...,...,...,...,...,...,...
60,0.015625,0.000000,0.015625,0.000481,2.726731e-13,0.016935
61,0.031250,1.000000,0.239702,0.021635,6.599570e-03,0.007368
62,0.046875,0.666667,0.232652,0.021875,6.620760e-03,0.015101
63,0.171875,1.000000,0.343920,0.021635,2.698911e-01,0.020480


Valores médios das medidas de rede:
 [0.06875, 0.6227869881544402, 0.22395063278920208, 0.025924556213017754, 0.06773899193664347, 0.015384615384615387]


Content of book ... 22_AManFromTheNorth_ArnoldBennett_cleared.txt
Número de parágrafos: 1226
Número de parágrafos com personagens:  1069
Matriz de Co-ocorrência (116, 116)
Personagens (nodes) 116
Ocorrência (edges) 214


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.191304,0.203463,0.434732,0.090977,2.678071e-01,0.052641
1,0.000000,0.000000,0.000000,0.000000,5.248278e-20,0.001490
2,0.017391,1.000000,0.363327,0.013943,7.221122e-02,0.003203
3,0.026087,1.000000,0.368670,0.013943,9.623736e-02,0.004032
4,0.017391,1.000000,0.244979,0.013943,4.517908e-03,0.005468
...,...,...,...,...,...,...
111,0.008696,0.000000,0.188967,0.013943,4.083237e-04,0.003956
112,0.017391,1.000000,0.343419,0.013943,5.237789e-02,0.004028
113,0.000000,0.000000,0.000000,0.000000,5.248278e-20,0.001490
114,0.017391,0.000000,0.273486,0.027736,1.684720e-02,0.005648


Valores médios das medidas de rede:
 [0.0320839580209895, 0.4623887287849067, 0.2673201033957423, 0.019973633872718815, 0.06024046370328538, 0.008620689655172414]


Content of book ... 23_TheUncalled_PaulLaurenceDunbar_cleared.txt
Número de parágrafos: 1031
Número de parágrafos com personagens:  909
Matriz de Co-ocorrência (109, 109)
Personagens (nodes) 109
Ocorrência (edges) 169


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.046296,1.000000,0.283261,0.012572,0.138396,0.009995
1,0.018519,0.000000,0.202007,0.016310,0.008818,0.006557
2,0.018519,1.000000,0.199621,0.012572,0.018720,0.005667
3,0.018519,1.000000,0.229429,0.012572,0.038340,0.004632
4,0.027778,0.333333,0.222385,0.024975,0.046429,0.009379
...,...,...,...,...,...,...
104,0.009259,0.000000,0.192059,0.012572,0.006136,0.004121
105,0.046296,0.900000,0.278592,0.013107,0.144914,0.011087
106,0.018519,1.000000,0.247335,0.012572,0.061189,0.004421
107,0.009259,0.000000,0.223364,0.012572,0.027732,0.003157


Valores médios das medidas de rede:
 [0.028712198436969073, 0.2675199727188833, 0.16515077176514692, 0.017179623862563012, 0.05236666337643007, 0.009174311926605505]


Content of book ... 24_TheSecondThoughtsOfAnIdleFellow_JeromeKJerome_cleared.txt
Número de parágrafos: 1071
Número de parágrafos com personagens:  918
Matriz de Co-ocorrência (192, 192)
Personagens (nodes) 192
Ocorrência (edges) 176


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.005236,0.0,0.015956,0.000436,2.627696e-10,0.003673
1,0.020942,1.0,0.036813,0.000818,8.540843e-06,0.007782
2,0.005236,0.0,0.005236,0.000055,3.497088e-17,0.007008
3,0.015707,1.0,0.044677,0.000873,7.789242e-02,0.005084
4,0.005236,0.0,0.005236,0.000055,3.497088e-17,0.007008
...,...,...,...,...,...,...
187,0.005236,0.0,0.018057,0.000709,8.143226e-11,0.003623
188,0.015707,1.0,0.044677,0.000873,7.789242e-02,0.005084
189,0.000000,0.0,0.000000,0.000000,5.211068e-25,0.001051
190,0.010471,1.0,0.010471,0.000109,1.324583e-12,0.007008


Valores médios das medidas de rede:
 [0.009598603839441536, 0.3546502976190476, 0.01479205847390626, 0.0004158486038394415, 0.01735424802836948, 0.005208333333333333]


Content of book ... 25_EvelynInnes_GeorgeMoore_cleared.txt
Número de parágrafos: 2606
Número de parágrafos com personagens:  2393
Matriz de Co-ocorrência (325, 325)
Personagens (nodes) 325
Ocorrência (edges) 924


Unnamed: 0,K,CC,CLC,B,EC,PR
0,0.012346,1.000000,0.340345,0.005147,3.417087e-02,0.001644
1,0.003086,0.000000,0.259645,0.005147,3.248691e-03,0.000850
2,0.006173,1.000000,0.362672,0.005147,3.724889e-02,0.001035
3,0.012346,1.000000,0.280185,0.005147,1.146688e-02,0.001743
4,0.049383,0.233333,0.374661,0.020370,5.921976e-02,0.006988
...,...,...,...,...,...,...
320,0.006173,1.000000,0.345007,0.005147,2.864784e-02,0.001027
321,0.000000,0.000000,0.000000,0.000000,9.442153e-26,0.000522
322,0.006173,1.000000,0.294759,0.005147,1.300717e-02,0.001334
323,0.006173,1.000000,0.255259,0.005147,2.127929e-03,0.001722


Valores médios das medidas de rede:
 [0.01754985754985755, 0.5501494088893214, 0.27818598805350414, 0.007696632332529768, 0.033613928157807706, 0.003076923076923076]


Content of book ... 26_TheGoldenCanyon_GAHenty_cleared.txt
Número de parágrafos: 1288
Número de parágrafos com personagens:  1114
Matriz de Co-ocorrência (64, 64)
Personagens (nodes) 64
Ocorrência (edges) 86


PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')

In [None]:
medidas_success = Medidas(folder="success")

# Classificação

In [None]:
# other - Others (1895 - 1923)
dfo = pd.read_csv('catalog_v0.xlsx - other.csv')
dfo['Classe'] = 0
dfo['Medidas'] = medidas_other
dfo.head()

In [None]:
# success- Publishers Weekly’s Bestselling Novels
dfs = pd.read_csv('catalog_v0.xlsx - success.csv')
dfs['Classe'] = 1
dfs['Medidas'] = medidas_success
dfs.head()

In [None]:
concat = pd.concat([dfs, dfo], axis=0)
concat = concat.sample(frac=1).reset_index(drop=True)
display(concat)

In [None]:
for i in range(len(concat)):
    name = ''.join(concat["Title"][i].split())
    index = concat["Index"][i]
    namesearch = str(index) + '_' + str(name)
    print(namesearch)

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter