In [1]:
import os
import io
import re

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
import neuralcoref
import spacy 

In [None]:
# Report the installed versions of the two NLP libraries used in this notebook.
for label, library in (('spacy version:', spacy), ('neuralcoref version:', neuralcoref)):
    print(label, library.__version__)

In [None]:
def read_file(path):
    """Return the text content of the file at ``path``.

    The file is decoded as UTF-8 first. If that fails, it is decoded as
    Latin-1 instead. Latin-1 assigns a character to every possible byte, so
    the second attempt cannot raise ``UnicodeDecodeError`` (the text may be
    mis-rendered if the file actually uses some other encoding).

    Args:
        path: Path of the file to read.

    Returns:
        The decoded content of the file as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        with io.open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Fallback for files that are not valid UTF-8.
        with io.open(path, 'r', encoding='latin-1') as f:
            return f.read()
    

In [None]:
folder = "success"


def _numeric_prefix(name):
    """Return the integer before the first '_' in ``name``, or None if that part is not an integer."""
    try:
        return int(name.split('_')[0])
    except ValueError:
        return None


# Files sorted by their numeric prefix. Entries without one (hidden files,
# checkpoint folders, ...) are left out, because int() would raise ValueError
# on them and abort the sort.
files_ordered = sorted(
    (name for name in os.listdir(folder) if _numeric_prefix(name) is not None),
    key=_numeric_prefix,
)

# Example file: index 2, i.e. the third entry in numeric order.
file = files_ordered[2]
# Full path of the selected file.
path = os.path.join(folder, file)

# Fail here with a clear message instead of leaving `content` undefined (or
# stale from a previous run), which would only surface in a later cell.
if not os.path.isfile(path):
    raise FileNotFoundError(f"Not a regular file: {path}")

content = read_file(path)
print(f"Content of book ... {file}\n")
print(content)

In [None]:
# Paragraphs are the chunks of the book text separated by an empty line.
paragraphs = content.split(sep='\n\n')
print(paragraphs)

In [None]:
# Outputs of this cell:
#   paragraphs_new - paragraphs kept as body text
#   contents       - normalized lines of the first numeral-led paragraph
#   chapters       - body text grouped by the headings found in `contents`
paragraphs_new = []
contents = []
chapters = []
# Index in `paragraphs_new` where the current chapter starts; None until the
# first chapter heading is seen. (A 0 sentinel would be ambiguous: 0 is also a
# valid start index when no body paragraph precedes the first heading.)
old_idx = None

# First line starts with Roman-numeral letters and/or digits, optionally
# followed by dots. It is applied to the upper-cased line, so it is
# effectively case-insensitive.
# NOTE(review): this also matches ordinary sentences whose first word begins
# with one of I, V, X, L, C, D, M (e.g. "I ...", "Dear ..."); such paragraphs
# are dropped below. The pattern is kept unchanged - confirm against the
# corpus before tightening it.
numeral_start = re.compile(r'[IVXLCDM\d]+[\.]*')

for paragraph in paragraphs:
    lines = paragraph.split('\n')

    if numeral_start.match(lines[0].upper().strip()):
        # The first such paragraph is taken as the table of contents; every
        # numeral-led paragraph (this one included) is left out of the body.
        if len(contents) == 0:
            contents = [" ".join(line.lower().split()) for line in lines]
        continue

    if paragraph.lower().strip() in contents:
        # Chapter heading: close the chapter in progress (if any) and start a
        # new one at the current position.
        current_idx = len(paragraphs_new)
        if old_idx is not None:
            chapters.append("\n\n".join(paragraphs_new[old_idx:current_idx]))
        old_idx = current_idx
        continue

    # Skip paragraphs without any word character. re.search is used because
    # '.' does not cross newlines, so a match anchored at the start would only
    # ever inspect the first line of the paragraph.
    if not re.search(r'\w', paragraph):
        continue

    paragraphs_new.append(paragraph)

# Flush the last chapter once, after all paragraphs have been seen.
if old_idx is not None:
    chapters.append("\n\n".join(paragraphs_new[old_idx:]))

print(len(paragraphs), len(paragraphs_new))

In [None]:
# Load spaCy's small English pipeline and register NeuralCoref on it.
model_name = 'en_core_web_sm'
nlp = spacy.load(model_name)
neuralcoref.add_to_pipe(nlp)

In [None]:
# PERSON entities collected over all processed paragraphs (duplicates kept).
geral_entities = []

# Only the first 15 body paragraphs are processed.
for i, paragraph in enumerate(paragraphs_new[:15]):
    # Collapse every run of whitespace (newlines included) into one space.
    paragraph = " ".join(paragraph.split())

    # First pass yields the coreference-resolved text; that text is parsed
    # again so that entity recognition runs on the resolved version.
    doc = nlp(paragraph)
    doc_coref = nlp(doc._.coref_resolved)

    entities = [ent.text for ent in doc_coref.ents if ent.label_ == "PERSON"]
    if not entities:
        continue

    geral_entities.extend(entities)
    # Paragraph index, word count of the normalized paragraph, its PERSON entities.
    print(i, len(paragraph.split()), entities)

print("\nGeral Entities:\n", set(geral_entities))

In [None]:
from itertools import combinations

import networkx as nx
import matplotlib.pyplot as plt

# Sample data: each record is a 3-tuple whose last item is a list of character
# names; the first two items are ignored by the loop below.
data = [
    # (0, 91, ['Young Hillocks', 'Hillocks', 'Drumsheugh']),
    (1, 168, ['Keep', 'Dominie', 'Keep', 'Keep', 'Geordie Hoo', 'Keep']),
    # (2, 160, ['Tonic Sol-fa']),
    # (4, 188, ['Drumtochty', 'Dominie Jamieson']),
    # Add more data
]

# Undirected graph: nodes are character names, edge weights count co-occurrences.
G = nx.Graph()

# Every pair of positions in a record contributes 1 to the weight of the edge
# between the two names, so repeated mentions increase the weight.
for _, _, characters in data:
    for character, other_character in combinations(characters, 2):
        # A name repeated within one record would pair with itself; a
        # self-loop is not a co-occurrence, so skip it.
        if character == other_character:
            continue
        if G.has_edge(character, other_character):
            G[character][other_character]['weight'] += 1
        else:
            G.add_edge(character, other_character, weight=1)

# Fixed seed keeps the spring layout identical between runs.
pos = nx.spring_layout(G, seed=42)
edge_labels = {(u, v): f"{d['weight']}" for u, v, d in G.edges(data=True)}

nx.draw(G, pos, with_labels=True, node_size=100, font_size=10, font_color='black', font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)

plt.title("Character Co-occurrence Network")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Provided information: pairs of character names to connect with an edge.
edges = [
    ("Philip Horn", "William Cliff"),
    ("Philip Horn", "Castor"),
    ("Philip Horn", "Edna Markham"),
    ("Philip Horn", "Ralph"),
    ("William Cliff", "Castor"),
    ("William Cliff", "Edna Markham"),
    ("William Cliff", "Ralph"),
    ("William Cliff", "Ralph Markham"),
    ("Castor", "Edna Markham"),
    ("Edna Markham", "Ralph"),
    ("Ralph", "Ralph Markham"),
    ("Ralph", "Cliff"),
    ("Ralph", "Markham"),
    ("Ralph", "Maka"),
    ("Ralph Markham", "Cliff"),
    ("Cliff", "Markham"),
    ("Captain Horn", "Horn"),
    ("Captain Horn", "Rynders"),
    ("Rynders", "Maka")
]

# Build the graph.
G = nx.Graph()
G.add_edges_from(edges)

# Draw the graph. The fixed seed makes the spring layout (and therefore the
# figure) identical on every run.
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(10, 8))
nx.draw(G, pos, with_labels=True,
        node_color="skyblue", node_size=300, font_size=9, font_weight='bold', edge_color='gray', width=1)
plt.title("Rede de Personagens do Livro")
plt.show()
