In [1]:
import os
import io
import re
import multiprocessing
import pickle

In [2]:
import neuralcoref
import spacy

# Log the installed library versions next to the results so the run can be
# reproduced with the same spaCy / neuralcoref pairing.
print('spacy version:', spacy.__version__)
print('neuralcoref version:', neuralcoref.__version__)

spacy version: 2.1.3
neuralcoref version: 4.0.0


In [3]:
def write_data_to_disk(file, data):
    """Pickle ``data`` and store the resulting bytes at the path ``file``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file, mode='wb') as sink:
        sink.write(pickle.dumps(data))

def load_data_from_disk(file):
    """Read the file at ``file`` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so this should only be pointed at
    files produced by a trusted source.
    """
    with open(file, mode='rb') as source:
        return pickle.load(source)

# Reconhecimento de entidades

In [4]:
def read_file(path):
    """Return the full text of the file at ``path``.

    The file is decoded as UTF-8 first. If that raises ``UnicodeDecodeError``
    the file is re-read as Latin-1. Latin-1 assigns a character to every
    possible byte value, so the fallback cannot itself fail with a decode
    error and needs no further handler.
    """
    try:
        with io.open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with io.open(path, 'r', encoding='latin-1') as f:
            return f.read()

In [5]:
def extract_paragraphs(folder, file):
    """Read ``folder/file`` and split its text into paragraphs.

    Paragraphs are the pieces between blank lines (two consecutive newline
    characters). When the joined path is not an existing regular file an
    empty list is returned and nothing is printed.
    """
    path = os.path.join(folder, file)

    # Guard clause: nothing to read.
    if not os.path.isfile(path):
        return []

    content = read_file(path)
    print(f"Content of book ... {file}")
    return content.split('\n\n')

In [6]:
def new_paragraphs(paragraphs):
    """Return the paragraphs that survive heading / table-of-contents filtering.

    A paragraph is dropped when any of the following holds:

    * Its first line (upper-cased and stripped) starts with a Roman-numeral
      letter (I, V, X, L, C, D, M) or a digit. The first paragraph that does
      so is also remembered, line by line (lower-cased, whitespace collapsed),
      as the table of contents.
    * Its lower-cased, stripped text equals one of the remembered contents
      lines (i.e. it repeats a chapter title).
    * Its first line contains no word character.

    All other paragraphs are returned unchanged, in their original order.

    NOTE(review): the heading pattern is only anchored at the start, so prose
    whose first line begins with one of those letters (e.g. "It was ...",
    "Mr. ...") is dropped as well. This is kept as-is to preserve existing
    results - confirm whether a stricter pattern was intended.
    """
    # Raw strings avoid invalid-escape warnings; the patterns are unchanged.
    heading_pattern = re.compile(r'[IVXLCDM\d]+[\.]*')
    has_word_pattern = re.compile(r'.*[\w]+.*')

    contents = set()
    kept = []

    for paragraph in paragraphs:
        lines = paragraph.split('\n')

        if heading_pattern.match(lines[0].upper().strip()):
            # Only the first heading-like paragraph defines the contents.
            if not contents:
                contents = {" ".join(line.lower().split()) for line in lines}
            continue

        # Chapter titles repeated in the body are not narrative text.
        if paragraph.lower().strip() in contents:
            continue

        if not has_word_pattern.match(paragraph):
            continue

        kept.append(paragraph)

    return kept

In [7]:
def process_paragraph(paragraph, nlp):
    """Return the texts of the PERSON entities found in one paragraph.

    The paragraph is collapsed onto a single line with single spaces, passed
    through ``nlp``, and the document's ``_.coref_resolved`` text is then
    passed through ``nlp`` a second time. Entities of that second document
    whose ``label_`` is ``"PERSON"`` are returned in document order
    (duplicates are kept).
    """
    # str.split() with no argument also splits on newlines, so this single
    # step flattens line breaks and squeezes repeated whitespace.
    flattened = " ".join(paragraph.split())

    resolved_text = nlp(flattened)._.coref_resolved
    resolved_doc = nlp(resolved_text)

    return [span.text for span in resolved_doc.ents if span.label_ == "PERSON"]

def extract_entities_from_paragraphs(paragraphs_to_process, nlp):
    """Run ``process_paragraph`` over every paragraph in a process pool.

    Returns a pair ``(data, entities)``:

    data
        One tuple ``(paragraph position, word count, entity list)`` for each
        paragraph in which at least one entity was found, e.g.
        ``[(5, 79, ['Kidd’s', 'Whoso']), ...]``.
    entities
        List of the distinct entity strings seen across all paragraphs
        (built from a set, so its order is not defined).
    """
    # Every task carries its own (paragraph, nlp) argument pair.
    tasks = [(text, nlp) for text in paragraphs_to_process]

    with multiprocessing.Pool() as pool:
        per_paragraph = pool.starmap(process_paragraph, tasks)

    data = []
    unique_entities = set()

    for position, found in enumerate(per_paragraph):
        if not found:
            continue
        word_count = len(paragraphs_to_process[position].split())
        unique_entities.update(found)  # keeps only distinct entity strings
        data.append((position, word_count, found))

    return data, list(unique_entities)

In [8]:
def all_entities(folder, file):
    """Extract the entities of a single book file.

    Reads and filters the paragraphs of ``folder/file``, prints the paragraph
    counts before and after filtering, loads the ``en_core_web_sm`` spaCy
    pipeline with neuralcoref added, and returns the ``(data, entities)`` pair
    produced by ``extract_entities_from_paragraphs`` for the filtered
    paragraphs.
    """
    raw_paragraphs = extract_paragraphs(folder, file)
    kept_paragraphs = new_paragraphs(raw_paragraphs)

    print('Número de parágrafos:', len(raw_paragraphs))
    print('Número de parágrafos com personagens: ', len(kept_paragraphs))

    # A fresh pipeline is loaded on every call.
    pipeline = spacy.load("en_core_web_sm")
    neuralcoref.add_to_pipe(pipeline)

    return extract_entities_from_paragraphs(kept_paragraphs, pipeline)

In [18]:
def pk_data_entities(folder, df):
    """Extract entities for every book in ``folder`` and pickle the results.

    Directory entries are ordered by the integer before the first ``_`` in
    their name. Entries without such a numeric prefix (for example
    ``.DS_Store`` or ``.ipynb_checkpoints``) are skipped instead of aborting
    the whole run with ``ValueError``.

    The i-th book (in that order) is paired with ``df["Index"][i]``; that
    value names the two output files
    ``dados/data_<folder>_<index>.pk`` and ``dados/entities_<folder>_<index>.pk``.
    NOTE(review): this pairing is purely positional - it assumes the catalog
    rows are in the same order as the numbered files; confirm against the CSV.

    Parameters
    ----------
    folder : str
        Directory containing the book files; also used in the output names.
    df : mapping / DataFrame
        Must support ``df["Index"][i]`` for each processed position ``i``.
    """
    numbered = []
    for entry in os.listdir(folder):
        try:
            numbered.append((int(entry.split('_')[0]), entry))
        except ValueError:
            # Not a "<number>_..." book file: ignore it.
            continue
    # Stable sort on the numeric prefix only, like the original key function.
    numbered.sort(key=lambda pair: pair[0])

    # Make sure the output directory exists before the expensive NLP work,
    # so the first write cannot fail on a missing folder.
    os.makedirs('dados', exist_ok=True)

    for i, (_, file) in enumerate(numbered):
        print('\n')

        data, geral_entities = all_entities(folder, file)

        index = df["Index"][i]

        write_data_to_disk('dados/data_' + folder + '_' + str(index) + '.pk', data)
        write_data_to_disk('dados/entities_' + folder + '_' + str(index) + '.pk', geral_entities)

# Criação de rede de co-ocorrência

In [17]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [10]:
def cooccurrence_matrix(data, entities_list):
    """Build the symmetric paragraph co-occurrence matrix of the entities.

    ``data`` is an iterable of 3-tuples whose last item is the list of
    entities found in one paragraph. Cell ``[i][j]`` of the result counts the
    paragraphs in which both ``entities_list[i]`` and ``entities_list[j]``
    appear; the diagonal therefore counts the paragraphs containing the
    entity itself. Presence is what matters: repeated mentions inside one
    paragraph count once.

    Returns an integer NumPy array of shape ``(len(entities_list),) * 2``.
    """
    size = len(entities_list)
    counts = np.zeros((size, size), dtype=int)

    for _, _, paragraph_entities in data:
        # Positions of the known entities that occur in this paragraph.
        present = [
            position
            for position, entity in enumerate(entities_list)
            if entity in paragraph_entities
        ]
        if present:
            # Add one to every (row, column) combination of present entities;
            # this covers the diagonal and both triangles in a single step.
            counts[np.ix_(present, present)] += 1

    return counts

In [11]:
def edge_list(entities_list, co_matrix):
    """Turn the upper triangle of a co-occurrence matrix into weighted edges.

    For every pair of positions ``a < b`` with ``co_matrix[a][b] > 0`` one
    tuple ``(entities_list[a], entities_list[b], {'weight': co_matrix[a][b]})``
    is produced, ordered by ``a`` and then ``b``. The diagonal is ignored, so
    no self-loops are generated.
    """
    total = len(entities_list)
    return [
        (entities_list[a], entities_list[b], {'weight': co_matrix[a][b]})
        for a in range(total)
        for b in range(a + 1, total)
        if co_matrix[a][b] > 0
    ]

In [None]:
def plot_network_degree(G, name_list, folder, name, index):
    """Draw the network, highlight its 15 highest-degree nodes and save a PNG.

    Parameters
    ----------
    G : graph object used with the networkx drawing functions.
    name_list : sized collection; only its length is used, to build the
        per-node colour indices.
    folder, index : used to build the output path
        ``graphs/graph_<folder>_<index>.png``.
    name : text used as the figure title.

    The figure is saved at 300 dpi and then shown. Node positions come from
    ``nx.random_layout`` without a seed argument, so the layout is not fixed
    between calls.
    """
    
    # One colour index per entry of name_list.
    # NOTE(review): assumes len(name_list) equals the number of nodes in G - confirm.
    node_colors = range(len(name_list))

    # Pick the 15 nodes with the highest degree
    degrees = dict(G.degree())
    top_degree_nodes = sorted(degrees, key=degrees.get, reverse=True)[:15]

    # Edges whose two endpoints are both among the top-degree nodes
    edges_of_top_degree_nodes = [(u, v) for u, v in G.edges() if u in top_degree_nodes and v in top_degree_nodes]

    # Random node placement
    pos = nx.random_layout(G)

    plt.figure(figsize=(12, 12))

    # Draw the whole network
    nx.draw(G,
            pos=pos,
            node_color=node_colors,
            cmap=plt.get_cmap('OrRd_r'),  # colour map
            node_size=35,  # node size
            edge_color="#E1E5EA", # edge colour (alternative tried: "#DBDFEA")
            with_labels=False,  # do not draw node labels
            alpha= 0.7,
           ) 

    # Second pass restricted to the edges among the top-degree nodes, in a different colour.
    # NOTE(review): this call also receives node_color/node_size, so it presumably
    # redraws the nodes as well - confirm whether only the edges were meant to be drawn.
    nx.draw(G,
            pos=pos,
            node_color=node_colors,
            cmap=plt.get_cmap('OrRd_r'),  # colour map
            node_size=35,  # node size
            edgelist=edges_of_top_degree_nodes,
            edge_color='red',  # colour of the edges among top-degree nodes
            width=1.0,  # edge width
            alpha=0.2)  # transparency


    # Label each top-degree node at its layout position
    for node in top_degree_nodes:
        x, y = pos[node]
        plt.annotate(node, (x, y), ha='center', va='center', fontsize=10,
                     # fontweight='heavy',
                     color='#461959', # alternatives tried: #B80D57, #99235C
                     backgroundcolor='white',
                     bbox=dict(boxstyle='round,pad=0.2', edgecolor='#D3CEDF', facecolor='#F9F5F6'))
    
    # Text box with the node and edge counts of G
    legend_text = f"Personagens (nós): {G.number_of_nodes()}\nOcorrência (arestas): {G.number_of_edges()}"
    plt.figtext(0.1, 0.94, legend_text, fontsize=11, color='#461959', 
                bbox=dict(facecolor='white', edgecolor='#D3CEDF', boxstyle='round,pad=0.6'))

    plt.title(name , fontsize=18, style='italic', color='#461959', va='baseline')
    plt.axis('off')
    # Save before show so the written file contains the finished figure.
    plt.savefig("graphs/graph_" + folder + "_" + str(index) + '.png', dpi=300, bbox_inches='tight')
    plt.show()

# Medidas de centralidade

In [None]:
def medidas(G):
    """Compute per-node network measures for ``G`` and return their means.

    The measures, in column order, are:

    * ``K``   - degree centrality
    * ``CC``  - clustering coefficient
    * ``CLC`` - closeness centrality
    * ``B``   - betweenness centrality (``endpoints=True``)
    * ``EC``  - eigenvector centrality (``max_iter=1000``)
    * ``PR``  - PageRank (``alpha=0.85``)

    The per-node table is displayed, and the list of the six column means
    (same order as above) is returned.
    """
    per_node = [
        ('K', dict(nx.degree_centrality(G))),
        ('CC', dict(nx.clustering(G))),
        ('CLC', dict(nx.closeness_centrality(G))),
        ('B', dict(nx.betweenness_centrality(G, endpoints=True))),
        ('EC', dict(nx.eigenvector_centrality(G, max_iter=1000))),
        ('PR', dict(nx.pagerank(G, alpha=0.85))),
    ]

    # One column per measure, one row per node.
    df = pd.DataFrame({label: list(scores.values()) for label, scores in per_node})
    display(df)

    # Mean of every measure over all nodes of the co-occurrence network.
    return list(df.mean())

In [None]:
# Load the catalog of the "success" class and run entity extraction over the
# files in the "success" folder; results are pickled by pk_data_entities.
dfs = pd.read_csv('catalog_v0.xlsx - success.csv')

pk_data_entities(folder="success", df=dfs)



Content of book ... 1_BesideTheBonnieBrierBush_IanMaclaren_cleared.txt
Número de parágrafos: 1358
Número de parágrafos com personagens:  1158


Content of book ... 2_Trilby_GeorgeDuMaurier_cleared.txt
Número de parágrafos: 2407
Número de parágrafos com personagens:  1987


Content of book ... 3_TheAdventuresOfCaptainHorn_FrankRichardStockton_cleared.txt
Número de parágrafos: 1945
Número de parágrafos com personagens:  1608


Content of book ... 4_TheManxman_HallCaine_cleared.txt
Número de parágrafos: 5701
Número de parágrafos com personagens:  5227


Content of book ... 5_ThePrincessAline_RichardHardingDavis_cleared.txt
Número de parágrafos: 431
Número de parágrafos com personagens:  357


Content of book ... 6_TheMaster_IsraelZangwill_cleared.txt
Número de parágrafos: 3962
Número de parágrafos com personagens:  3511


Content of book ... 7_ThePrisonerOfZenda_AnthonyHope_cleared.txt
Número de parágrafos: 1762
Número de parágrafos com personagens:  1538


Content of book ... 8_Degener



Content of book ... 60_ThePrincessPasses_CNWilliamson_cleared.txt
Número de parágrafos: 2064
Número de parágrafos com personagens:  1688


Content of book ... 61_TheFightingChance_RobertWChambers_cleared.txt
Número de parágrafos: 4113
Número de parágrafos com personagens:  3865


Content of book ... 62_TheHouseOfAThousandCandles_MeredithNicholson_cleared.txt
Número de parágrafos: 2334
Número de parágrafos com personagens:  1983


Content of book ... 63_TheJungle_UptonSinclair_cleared.txt
Número de parágrafos: 1525
Número de parágrafos com personagens:  1289


Content of book ... 64_TheAwakeningOfHelenaRichie_MargaretDeland_cleared.txt
Número de parágrafos: 2595
Número de parágrafos com personagens:  2278


Content of book ... 65_TheSpoilers_RexBeach_cleared.txt
Número de parágrafos: 1997
Número de parágrafos com personagens:  1827


Content of book ... 66_TheBrassBowl_LouisJosephVance_cleared.txt
Número de parágrafos: 1937
Número de parágrafos com personagens:  1721


Content of book

In [None]:
# Load the catalog of the "other" class and run entity extraction over the
# files in the "other" folder; results are pickled by pk_data_entities.
dfo = pd.read_csv('catalog_v0.xlsx - other.csv')

pk_data_entities(folder="other", df=dfo)

In [None]:
def medidas_folder(folder, df):
    """Build, plot and measure the co-occurrence network of every catalog row.

    For each position ``i`` in ``df`` the pickled paragraph data and entity
    list named after ``folder`` and ``df["Index"][i]`` are loaded from
    ``dados/``. From them a co-occurrence matrix, an edge list and an
    undirected graph are built; the graph is plotted with
    ``plot_network_degree`` (title: title and author on separate lines) and
    summarised with ``medidas``.

    Returns a list with one ``medidas`` result per row, in row order.
    """
    all_measures = []

    for row in range(len(df)):
        index = df["Index"][row]
        name = df["Title"][row] + '\n' + df['Author'][row]

        # Both pickles of a book share the same "<folder>_<index>.pk" suffix.
        suffix = folder + '_' + str(index) + '.pk'
        data = load_data_from_disk('dados/data_' + suffix)
        entities_list = load_data_from_disk('dados/entities_' + suffix)

        # Co-occurrence matrix
        co_matrix = cooccurrence_matrix(data, entities_list)
        print("Matriz de Co-ocorrência", co_matrix.shape)

        # Network: every entity is a node, even if it ends up without edges.
        G = nx.Graph()
        G.add_nodes_from(entities_list)
        G.add_edges_from(edge_list(entities_list, co_matrix))

        print('Personagens (nodes)', G.number_of_nodes())
        print('Ocorrência (edges)', G.number_of_edges())

        # Network picture highlighting the highest-degree characters
        plot_network_degree(G, entities_list, folder, name, index)

        medidas_rede = medidas(G)
        print('Valores médios das medidas de rede:\n', medidas_rede)

        all_measures.append(medidas_rede)

    return all_measures

In [None]:
# Network measures for every book of the "success" catalog (reads the pickles
# under dados/ written by the extraction step).
dfs = pd.read_csv('catalog_v0.xlsx - success.csv')
medidas_success = medidas_folder(folder="success", df=dfs)

In [None]:
# Network measures for every book of the "other" catalog (reads the pickles
# under dados/ written by the extraction step).
dfo = pd.read_csv('catalog_v0.xlsx - other.csv')
medidas_other = medidas_folder(folder="other", df=dfo)

# Classificação

In [None]:
# other - Others (1895 - 1923)
# Class label 0; 'Medidas' holds the list of mean network measures per book.
# Requires medidas_other from the measures cell above to be defined.
dfo = pd.read_csv('catalog_v0.xlsx - other.csv')
dfo['Classe'] = 0
dfo['Medidas'] = medidas_other
dfo.head()

In [None]:
# success- Publishers Weekly’s Bestselling Novels
# Class label 1; 'Medidas' holds the list of mean network measures per book.
# Requires medidas_success from the measures cell above to be defined.
dfs = pd.read_csv('catalog_v0.xlsx - success.csv')
dfs['Classe'] = 1
dfs['Medidas'] = medidas_success
dfs.head()

In [None]:
# Stack both classes and shuffle the rows. The shuffle uses a fixed seed so
# the row order (and anything derived from it later) is the same after every
# kernel restart; change SHUFFLE_SEED to draw a different permutation.
SHUFFLE_SEED = 42
concat = pd.concat([dfs, dfo], axis=0)
concat = concat.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
display(concat)

In [None]:
# Print "<Index>_<Title without whitespace>" for every row of the shuffled
# table. NOTE(review): this looks like the prefix of the book file names
# (e.g. "7_ThePrisonerOfZenda") - confirm before using it for file lookup.
for i in range(len(concat)):
    name = ''.join(concat["Title"][i].split())  # drop all whitespace from the title
    index = concat["Index"][i]
    namesearch = str(index) + '_' + str(name)
    print(namesearch)

In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter