In [1]:
import os
import pickle
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
def write_data_to_disk(file, data):
    """Pickle ``data`` and store it at the path ``file``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file, mode='wb') as handle:
        pickle.dump(data, handle)
        
def load_data_from_disk(file):
    """Return the object unpickled from the file at path ``file``.

    Note: unpickling can execute arbitrary code, so only point this at
    files that come from a trusted source.
    """
    with open(file, mode='rb') as handle:
        return pickle.load(handle)

# Criação de rede de co-ocorrência

In [3]:
def cooccurrence_matrix(data, entities_list):
    """Count, for every pair of entities, the records in which both appear.

    Parameters
    ----------
    data : iterable of 3-item tuples
        Only the third item of each tuple is used; every entity is tested
        against it with ``in`` (presumably the entities found in one
        paragraph -- confirm against the code that builds ``data``).
    entities_list : sequence
        Entities labelling the rows and columns of the matrix, in order.

    Returns
    -------
    numpy.ndarray
        Symmetric integer matrix of shape
        ``(len(entities_list), len(entities_list))``. Entry ``[i][j]`` is the
        number of records containing both entity ``i`` and entity ``j``; the
        diagonal entry ``[i][i]`` is the number of records containing
        entity ``i``.
    """
    n_entities = len(entities_list)
    counts = np.zeros((n_entities, n_entities), dtype=int)

    for _, _, paragraph_entities in data:
        # Test each entity once per record instead of re-testing every
        # candidate column for each matching row.
        present = [idx for idx, entity in enumerate(entities_list)
                   if entity in paragraph_entities]
        if present:
            # Every (row, column) combination of the present entities gains
            # one co-occurrence; this covers the diagonal and both triangles.
            counts[np.ix_(present, present)] += 1

    return counts

In [4]:
def edge_list(entities_list, co_matrix):
    """Turn a co-occurrence matrix into a list of weighted edges.

    Only pairs above the diagonal are visited, so each unordered pair of
    entities yields at most one edge and self-pairs are never emitted.
    Pairs whose count is not greater than zero are skipped.

    Returns a list of ``(entity_a, entity_b, {'weight': count})`` tuples, in
    row-major order of the upper triangle.
    """
    size = len(entities_list)
    upper_pairs = ((row, col)
                   for row in range(size)
                   for col in range(row + 1, size))
    return [
        (entities_list[row], entities_list[col], {'weight': co_matrix[row][col]})
        for row, col in upper_pairs
        if co_matrix[row][col] > 0
    ]

# Medidas de centralidade

In [5]:
def avg_medidas(G):
    """Return the mean over all nodes of six network measures of ``G``.

    The returned list is ordered as: degree centrality, clustering
    coefficient, closeness centrality, betweenness centrality (computed with
    ``endpoints=True``), eigenvector centrality (``max_iter=1000``) and
    PageRank (``alpha=0.85``).
    """
    # (column label, per-node mapping) pairs; the list order fixes both the
    # order in which the measures are computed and the order of the result.
    per_node_scores = [
        ('K', nx.degree_centrality(G)),
        ('CC', nx.clustering(G)),
        ('CLC', nx.closeness_centrality(G)),
        ('B', nx.betweenness_centrality(G, endpoints=True)),
        ('EC', nx.eigenvector_centrality(G, max_iter=1000)),
        ('PR', nx.pagerank(G, alpha=0.85)),
    ]

    table = pd.DataFrame({
        label: list(dict(scores).values())
        for label, scores in per_node_scores
    })

    # One mean per column, i.e. per measure.
    return list(table.mean())

## Visualização de rede

In [6]:
def plot_network_degree(G, name_list, folder, name, index, top_n=15, seed=None):
    """Draw ``G``, highlight its highest-degree nodes, then save and show it.

    Parameters
    ----------
    G : networkx graph
        Network to draw.
    name_list : sequence
        Only its length is used, to give every node its own colour index;
        it is expected to have one item per node of ``G``.
    folder, index
        Used to build the output file name
        ``graphs/graph_<folder>_<index>.png``.
    name : str
        Figure title.
    top_n : int, optional
        Number of highest-degree nodes to label and to connect with
        highlighted edges (default 15).
    seed : optional
        Forwarded to ``nx.random_layout``; ``None`` (default) keeps the
        layout random on every call, a fixed value makes it reproducible.

    The figure is saved at 300 dpi, shown, and then closed. The ``graphs``
    directory is created when it does not exist yet.
    """
    # Give each node a distinct colour index.
    node_colors = range(len(name_list))

    # Pick the top_n nodes with the highest degree.
    degrees = dict(G.degree())
    top_degree_nodes = sorted(degrees, key=degrees.get, reverse=True)[:top_n]
    top_degree_set = set(top_degree_nodes)  # constant-time membership tests

    # Edges whose two endpoints are both among the top-degree nodes.
    edges_of_top_degree_nodes = [(u, v) for u, v in G.edges()
                                 if u in top_degree_set and v in top_degree_set]

    # Random node placement.
    pos = nx.random_layout(G, seed=seed)

    fig = plt.figure(figsize=(12, 12))

    # Base pass: the whole network with light grey edges.
    nx.draw(G,
            pos=pos,
            node_color=node_colors,
            cmap=plt.get_cmap('OrRd_r'),  # colour map
            node_size=35,  # node size
            edge_color="#E1E5EA",  # edge colour
            with_labels=False,  # no node labels
            alpha=0.7,
            )

    # Overlay pass: only the edges between top-degree nodes, in red.
    # The node styling is passed again, as in the base pass.
    nx.draw(G,
            pos=pos,
            node_color=node_colors,
            cmap=plt.get_cmap('OrRd_r'),  # colour map
            node_size=35,  # node size
            edgelist=edges_of_top_degree_nodes,
            edge_color='red',  # colour of the highlighted edges
            width=1.0,  # edge width
            alpha=0.2)  # transparency

    # Label each top-degree node at its layout position.
    for node in top_degree_nodes:
        x, y = pos[node]
        plt.annotate(node, (x, y), ha='center', va='center', fontsize=10,
                     color='#461959',
                     backgroundcolor='white',
                     bbox=dict(boxstyle='round,pad=0.2', edgecolor='#D3CEDF', facecolor='#F9F5F6'))

    # Summary box with the node and edge counts.
    legend_text = f"Personagens (nós): {G.number_of_nodes()}\nOcorrência (arestas): {G.number_of_edges()}"
    plt.figtext(0.1, 0.94, legend_text, fontsize=11, color='#461959',
                bbox=dict(facecolor='white', edgecolor='#D3CEDF', boxstyle='round,pad=0.6'))

    plt.title(name, fontsize=18, style='italic', color='#461959', va='baseline')
    plt.axis('off')

    # savefig does not create missing directories, so make sure it exists.
    os.makedirs("graphs", exist_ok=True)
    plt.savefig("graphs/graph_" + folder + "_" + str(index) + '.png', dpi=300, bbox_inches='tight')
    plt.show()

    # This function is called once per book; release the figure explicitly
    # so figures do not pile up in memory.
    plt.close(fig)

In [17]:
def graph_medidas(folder, df):
    """Build, plot and measure the co-occurrence network of every book in ``df``.

    Parameters
    ----------
    folder : str
        Catalogue tag used in the input file names
        (``dados/data_<folder>_<Index>.pk`` and
        ``dados/entities_<folder>_<Index>.pk``) and passed on to the plot.
    df : pandas.DataFrame
        Must have the columns ``Index``, ``Title`` and ``Author``.

    Returns
    -------
    list
        One entry per row of ``df``, in row order: the list of averaged
        network measures returned by ``avg_medidas`` for that book.
    """
    M = []

    # Iterate the columns positionally: looking values up with
    # ``df[col][i]`` would be a label lookup and only works when the frame
    # still has its default 0..n-1 index.
    for index, title, author in zip(df["Index"], df["Title"], df["Author"]):
        name = title + '\n' + author

        data = load_data_from_disk('dados/data_' + folder + '_' + str(index) + '.pk')
        entities_list = load_data_from_disk('dados/entities_' + folder + '_' + str(index) + '.pk')

        # Co-occurrence matrix
        co_matrix = cooccurrence_matrix(data, entities_list)
        print("\nMatriz de Co-ocorrência", co_matrix.shape)

        # Weighted edge list
        edgelist = edge_list(entities_list, co_matrix)

        # Network: every entity is a node, even when it has no edges
        G = nx.Graph()
        G.add_nodes_from(entities_list)
        G.add_edges_from(edgelist)

        # Network plot highlighting the highest-degree characters
        plot_network_degree(G, entities_list, folder, name, index)

        medidas_rede = avg_medidas(G)
        print('Valores médios das medidas de rede:\n', medidas_rede)

        M.append(medidas_rede)

    return M

In [9]:
# "success" catalogue: Publishers Weekly's bestselling novels.
# graph_medidas processes every row of the catalogue and returns one list of
# averaged network measures per book.
dfs = pd.read_csv('catalog_v0.xlsx - success.csv')
medidas_success = graph_medidas(folder="success", df=dfs)


Matriz de Co-ocorrência (304, 304)
Valores médios das medidas de rede:
 [0.016870766023970817, 0.5134629202554766, 0.21447187205472734, 0.008661152212866716, 0.032733946916331996, 0.0032894736842105257]

Matriz de Co-ocorrência (509, 509)
Valores médios das medidas de rede:
 [0.007943628853858884, 0.5080680299438605, 0.15548366072042627, 0.003953282843747495, 0.02121081934896206, 0.0019646365422396855]

Matriz de Co-ocorrência (105, 105)
Valores médios das medidas de rede:
 [0.04249084249084249, 0.5155886357461207, 0.27463776258006106, 0.02413570556427699, 0.06300304742136524, 0.009523809523809526]

Matriz de Co-ocorrência (589, 589)
Valores médios das medidas de rede:
 [0.008818128269983714, 0.4963272963753215, 0.24490835728336366, 0.004243344860002338, 0.02203517937875317, 0.0016977928692699486]

Matriz de Co-ocorrência (54, 54)
Valores médios das medidas de rede:
 [0.07058001397624039, 0.5021140306022375, 0.35857042704233216, 0.04410280301265626, 0.10090652512119298, 0.018518518518

Valores médios das medidas de rede:
 [0.009857564703838802, 0.3573122478260191, 0.13542627284871953, 0.006412459545816079, 0.029901310562325444, 0.003289473684210526]

Matriz de Co-ocorrência (78, 78)
Valores médios das medidas de rede:
 [0.051282051282051294, 0.4771672686860827, 0.26350981242547905, 0.026537564999103466, 0.07740333988953212, 0.012820512820512818]

Matriz de Co-ocorrência (247, 247)
Valores médios das medidas de rede:
 [0.018465488298607682, 0.4834406276632991, 0.24082192269291772, 0.011157490158389263, 0.03654289971082173, 0.004048582995951418]

Matriz de Co-ocorrência (95, 95)
Valores médios das medidas de rede:
 [0.04412094064949607, 0.412953459456892, 0.2063326290262162, 0.019206695349796663, 0.059441161955712335, 0.010526315789473682]

Matriz de Co-ocorrência (160, 160)
Valores médios das medidas de rede:
 [0.04457547169811321, 0.5893374616458715, 0.28686450530921004, 0.01577879323899371, 0.04804181464712669, 0.00625]

Matriz de Co-ocorrência (163, 163)
Valores mé

Valores médios das medidas de rede:
 [0.022627490712597093, 0.45031772834720746, 0.23361269458866962, 0.011677974753512295, 0.04262063878879615, 0.005291005291005291]

Matriz de Co-ocorrência (192, 192)
Valores médios das medidas de rede:
 [0.017397469458987787, 0.4492056445378146, 0.22431665932618627, 0.010931932627981386, 0.04283542390039972, 0.005208333333333333]

Matriz de Co-ocorrência (190, 190)
Valores médios das medidas de rede:
 [0.01035923141186299, 0.3153159395564653, 0.07531062888053074, 0.004128742909906345, 0.03329787178885802, 0.005263157894736842]

Matriz de Co-ocorrência (288, 288)
Valores médios das medidas de rede:
 [0.01887340301974448, 0.5327149488755332, 0.29892512760397194, 0.00923699496171549, 0.03787610260937779, 0.003472222222222222]

Matriz de Co-ocorrência (371, 371)
Valores médios das medidas de rede:
 [0.011218765935747069, 0.5137506609919132, 0.1780918582026321, 0.004735546860349789, 0.028377922107432587, 0.0026954177897574125]

Matriz de Co-ocorrência (3

In [10]:
# "other" catalogue: other books (1895 - 1923).
# graph_medidas processes every row of the catalogue and returns one list of
# averaged network measures per book.
dfo = pd.read_csv('catalog_v0.xlsx - other.csv')
medidas_other = graph_medidas(folder="other", df=dfo)


Matriz de Co-ocorrência (74, 74)
Valores médios das medidas de rede:
 [0.028878193261754902, 0.40067567567567564, 0.05962505572492134, 0.005573511312126641, 0.044331905019756486, 0.013513513513513516]

Matriz de Co-ocorrência (240, 240)
Valores médios das medidas de rede:
 [0.011366806136680613, 0.38194760572684516, 0.11725293584819303, 0.005031090190609018, 0.032013309253143836, 0.004166666666666667]

Matriz de Co-ocorrência (121, 121)
Valores médios das medidas de rede:
 [0.02933884297520661, 0.6008758236577878, 0.23850200101522243, 0.02002026273250916, 0.05629620395833612, 0.008264462809917354]

Matriz de Co-ocorrência (79, 79)
Valores médios das medidas de rede:
 [0.07530022719896137, 0.6381295271397521, 0.3527716652346722, 0.02965911938832945, 0.08078913275670614, 0.012658227848101262]

Matriz de Co-ocorrência (68, 68)
Valores médios das medidas de rede:
 [0.06321334503950835, 0.5063739977294617, 0.27908263576344566, 0.03098047823167897, 0.08115640735166518, 0.01470588235294118]



Matriz de Co-ocorrência (267, 267)
Valores médios das medidas de rede:
 [0.013122694376390413, 0.4138564852785064, 0.14702574478970554, 0.00635462746838902, 0.031587181712077694, 0.003745318352059924]

Matriz de Co-ocorrência (160, 160)
Valores médios das medidas de rede:
 [0.03176100628930818, 0.5240271379330762, 0.2328518240632242, 0.011882370283018867, 0.04704095451847962, 0.00625]

Matriz de Co-ocorrência (62, 62)
Valores médios das medidas de rede:
 [0.08038075092543627, 0.6086352649512909, 0.38827779167607085, 0.04943620886712954, 0.09415035899231365, 0.016129032258064512]

Matriz de Co-ocorrência (191, 191)
Valores médios das medidas de rede:
 [0.022981537613667676, 0.4331414656073926, 0.1809828618629723, 0.0097553304604127, 0.041130019058958654, 0.005235602094240839]

Matriz de Co-ocorrência (433, 433)
Valores médios das medidas de rede:
 [0.010638525361389102, 0.4424000841586607, 0.16222310917543223, 0.004318500265595829, 0.023925439238952, 0.0023094688221709007]

Matriz de C


Matriz de Co-ocorrência (357, 357)
Valores médios das medidas de rede:
 [0.014241651716866524, 0.5098635600127654, 0.2299606232102429, 0.006511615441505972, 0.02814885324886534, 0.0028011204481792717]

Matriz de Co-ocorrência (233, 233)
Valores médios das medidas de rede:
 [0.020053278081989048, 0.5128768972152966, 0.30164687856151495, 0.01033580816841667, 0.04288424249163645, 0.0042918454935622335]

Matriz de Co-ocorrência (225, 225)
Valores médios das medidas de rede:
 [0.022023809523809522, 0.4801527807713972, 0.22198739733797446, 0.009997178130511462, 0.037580998105444616, 0.0044444444444444444]

Matriz de Co-ocorrência (8, 8)
Valores médios das medidas de rede:
 [0.25, 0.2583333333333333, 0.46948051948051944, 0.24999999999999997, 0.30583712288630976, 0.125]

Matriz de Co-ocorrência (203, 203)
Valores médios das medidas de rede:
 [0.02506950202409404, 0.5555258175494203, 0.2579236637987608, 0.011805313123707235, 0.04296780753025107, 0.0049261083743842365]

Matriz de Co-ocorrência 

## Unificação dos livros

In [11]:
# Label every "success" book as class 1 and attach its list of averaged
# network measures (one list per row, in catalogue order).
dfs['Classe'] = 1
dfs['Medidas'] = medidas_success
dfs.head()

Unnamed: 0,Title,Author,Year,Index,Classe,Medidas
0,Beside the Bonnie Brier Bush,Ian Maclaren,1895,1,1,"[0.016870766023970817, 0.5134629202554766, 0.2..."
1,Trilby,George Du Maurier,1895,2,1,"[0.007943628853858884, 0.5080680299438605, 0.1..."
2,The Adventures of Captain Horn,Frank Richard Stockton,1895,3,1,"[0.04249084249084249, 0.5155886357461207, 0.27..."
3,The Manxman,Hall Caine,1895,4,1,"[0.008818128269983714, 0.4963272963753215, 0.2..."
4,The Princess Aline,Richard Harding Davis,1895,5,1,"[0.07058001397624039, 0.5021140306022375, 0.35..."


In [12]:
# Label every "other" book as class 0 and attach its list of averaged
# network measures (one list per row, in catalogue order).
dfo['Classe'] = 0
dfo["Medidas"] = medidas_other
dfo.head()

Unnamed: 0,Title,Author,Year,Index,Classe,Medidas
0,In the Land Of Cave And Cliff Dwellers,Frederick Schwatka,1895,1,0,"[0.028878193261754902, 0.40067567567567564, 0...."
1,Jude the Obscure,Thomas Hardy,1895,2,0,"[0.011366806136680613, 0.38194760572684516, 0...."
2,The Golden Age,Kenneth Grahame,1895,3,0,"[0.02933884297520661, 0.6008758236577878, 0.23..."
3,The Lost Stradivarius,John Meade Falkner,1895,4,0,"[0.07530022719896137, 0.6381295271397521, 0.35..."
4,The British Barbarians,Grant Allen,1895,5,0,"[0.06321334503950835, 0.5063739977294617, 0.27..."


In [15]:
# Single DataFrame holding all the information of the "success" and "other" books.
# Rows are stacked (axis=0); ignore_index is not set, so the rows are not renumbered here.
concat = pd.concat([dfs, dfo], axis=0)

# Store the concatenated DataFrame in a .pk (pickle) file
write_data_to_disk('all_books.pk', concat)

In [16]:
# Shuffle all rows (frac=1 samples every row) and renumber them from 0.
# The fixed random_state makes the shuffled order reproducible under
# Restart Kernel -> Run All; without it the order changed on every run.
concat = concat.sample(frac=1, random_state=42).reset_index(drop=True)
display(concat)

Unnamed: 0,Title,Author,Year,Index,Classe,Medidas
0,Sir Richard Calmady,Lucas Malet,1902,49,1,"[0.014304644999302554, 0.5768316939189352, 0.2..."
1,The Promised Land,Mary Antin,1912,77,0,"[0.014399851356373096, 0.4644561507333247, 0.0..."
2,The River's End,James Oliver Curwood,1920,100,1,"[0.05844376486578322, 0.6602264919703563, 0.35..."
3,The Beasts of Tarzan,Edgar Rice Burroughs,1916,90,0,"[0.04137447405329593, 0.3751048164710404, 0.26..."
4,The Hill of Dreams,Arthur Machen,1907,64,0,"[0.03292836196062003, 0.6018731615223741, 0.15..."
...,...,...,...,...,...,...
214,The Inner Shrine,Basil King,1909,69,1,"[0.027381887554707662, 0.3899323693781564, 0.1..."
215,Father and Son,Edmund Gosse,1907,65,0,"[0.010582010582010581, 0.37054109911252775, 0...."
216,Hugh Wynne,Silas Weir Mitchell,1898,22,1,"[0.010592361094543978, 0.5169836241107117, 0.2..."
217,The Pride of Jennico,Egerton Castle,1898,25,1,"[0.02137384056996908, 0.45606070664686443, 0.1..."
