In [1]:
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import glob
import io

In [2]:
import neuralcoref
import spacy

In [4]:
def standardize_text(book, cutting_flag=False):
    """Normalise line endings and a few punctuation marks in a raw book text.

    The substitutions are plain ``str.replace`` calls applied strictly in the
    order listed below; later entries operate on the output of earlier ones.

    Args:
        book: Full text of the book as one string.
        cutting_flag: Accepted for interface compatibility; this function
            does not read it.

    Returns:
        The rewritten text.
    """
    ordered_substitutions = (
        # drop every carriage return
        ('\r', ''),
        # mark paragraph breaks (blank lines) with a temporary '\r' ...
        ('\n\n', '\r'),
        # ... and turn the markers back into blank lines.
        # NOTE(review): nothing happens between marking and restoring, so
        # this pair currently leaves the text unchanged.
        ('\r', '\n\n'),
        # collapse doubled full stops
        ('..', '.'),
        # rebuild ellipses that the previous step shortened to '..'
        ('..', '...'),
        # delete underscores
        ('_', ''),
        # hyphens become spaces
        ('-', ' '),
    )
    cleaned = book
    for needle, replacement in ordered_substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned

In [5]:
def remove_chapter_markers(book, book_id):
    """Drop chapter-marker paragraphs and group the remaining text by chapter.

    The book is split into paragraphs on blank lines. Three kinds of
    paragraph are removed:

    * paragraphs whose first line looks like a Roman/Arabic numeral marker;
      the first such paragraph is also remembered (lower-cased, whitespace
      normalised, one entry per line) as the list of chapter titles;
    * paragraphs whose stripped, lower-cased text equals one of those
      remembered titles - each one starts a new chapter;
    * paragraphs that contain no word character at all.

    Args:
        book: Full book text with paragraphs separated by ``'\\n\\n'``.
        book_id: Not used by the function; kept so existing callers that
            pass it keep working.

    Returns:
        A tuple ``(paragraphs_new, chapters)`` where ``paragraphs_new`` is
        the list of kept paragraphs and ``chapters`` is a list of strings,
        each the kept paragraphs of one chapter joined with ``'\\n\\n'``.
        Paragraphs before the first chapter title belong to no chapter.
    """
    paragraphs = book.split('\n\n')
    paragraphs_new = []
    chapters = []
    # Index into paragraphs_new where the current chapter begins. None means
    # "no chapter title seen yet"; 0 is a legitimate start index and therefore
    # must not double as the sentinel (doing so lost the first chapter).
    chapter_start = None
    contents = []
    for paragraph in paragraphs:
        lines = paragraph.split('\n')
        # NOTE(review): after upper-casing, this matches any first line that
        # merely *starts* with one of I/V/X/L/C/D/M or a digit (e.g.
        # "Call ..."), so ordinary paragraphs can be skipped here. Left as-is;
        # confirm the intended marker format before tightening the pattern.
        if re.match(r'[IVXLCDM\d]+[\.]*', lines[0].upper().strip()):
            if not contents:
                # str.split always yields at least one element, so this list
                # is never empty once assigned.
                contents = [" ".join(line.lower().split()) for line in lines]
            continue
        if paragraph.lower().strip() in contents:
            current_idx = len(paragraphs_new)
            if chapter_start is not None:
                # Close the chapter that was open until this title.
                chapters.append(
                    "\n\n".join(paragraphs_new[chapter_start:current_idx]))
            chapter_start = current_idx
            continue
        # Search the whole paragraph: '.' does not cross newlines, so an
        # anchored '.*\w' test would only ever look at the first line.
        if not re.search(r'\w', paragraph):
            continue
        paragraphs_new.append(paragraph)
    if chapter_start is not None:
        # Flush the last open chapter.
        chapters.append("\n\n".join(paragraphs_new[chapter_start:]))
    print(len(paragraphs), len(paragraphs_new), len(chapters))
    return paragraphs_new, chapters

In [6]:
def retrieve_text(index):
    """Read the UTF-8 text file ``<index>.txt`` and return its contents.

    Args:
        index: File identifier; it is converted with ``str()`` and ``'.txt'``
            is appended, so it may be a number or a path without extension.

    Returns:
        The whole file content as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with io.open(str(index) + '.txt', 'r', encoding='utf-8') as f:
        return f.read()

In [14]:
def extract_ner_par(book_id, output_path=None):
    """Extract PERSON entities paragraph by paragraph and write them to a file.

    The book ``<book_id>.txt`` is loaded, standardised and split into
    paragraphs. Each paragraph is run through the module-level ``nlp``
    pipeline, its coreference-resolved text (``doc._.coref_resolved``) is run
    through ``nlp`` again, and the entities labelled ``"PERSON"`` are kept.

    For every paragraph with at least one such entity, one line is written:
    ``[<word count>]name1, name2, ...``. The set of all names found is
    printed at the end.

    Args:
        book_id: Identifier of the book (path without the ``.txt``
            extension).
        output_path: Where to write the result. Defaults to
            ``<book_id>.entities``. The result is deliberately NOT written
            to ``<book_id>.txt``: that is the input file, and writing there
            replaces the book with its own entity list.

    Returns:
        None.
    """
    book = standardize_text(retrieve_text(book_id))
    paragraphs, _chapters = remove_chapter_markers(book, book_id)
    if output_path is None:
        # str() so integer ids work here just as they do in retrieve_text.
        output_path = str(book_id) + ".entities"
    geral_entities = []
    with io.open(output_path, "w", encoding="utf-8") as f:
        for paragraph in paragraphs:
            # split() treats newlines as whitespace, so this also flattens
            # the paragraph onto a single line.
            paragraph = " ".join(paragraph.split())
            doc = nlp(paragraph)
            # Re-run NER on the text with mentions replaced by their
            # coreference cluster's main mention.
            doc_coref = nlp(doc._.coref_resolved)
            entities = [ent.text for ent in doc_coref.ents
                        if ent.label_ == "PERSON"]
            # This check must live inside the loop: one output line per
            # paragraph, not just for the last paragraph processed.
            if entities:
                len_words = len(paragraph.split())
                geral_entities.extend(entities)
                f.write("[" + str(len_words) + "]")
                f.write(", ".join(entities) + "\n")
    print(set(geral_entities))

In [8]:
# Record the spaCy version this notebook was executed with.
print(spacy.__version__)

2.1.3


In [9]:
# Record the neuralcoref version this notebook was executed with.
print(neuralcoref.__version__)

4.0.0


In [10]:
# NOTE(review): book_id is not referenced by the other cells shown here.
book_id = 13
# Folder holding the book .txt files (trailing slash required, the glob
# pattern is built by plain concatenation).
dir_books = "books_dataset_1/"
# glob returns matches in filesystem-dependent order; sort so that books are
# always processed in the same order from run to run.
txt_files = sorted(glob.glob(dir_books + "*.txt"))

In [11]:
# Load the 'en_core_web_sm' spaCy pipeline; extract_ner_par reads this
# module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [12]:
# Add neuralcoref to the pipeline; extract_ner_par relies on the
# `doc._.coref_resolved` attribute of processed documents.
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f561f2d2f50>

In [None]:
# NOTE(review): exclude_author is never read in the cells shown here.
exclude_author = []
for txt_file in txt_files:
    # Strip only the trailing extension; str.replace would also delete any
    # ".txt" that happens to occur earlier in the path.
    id_file = txt_file[:-len(".txt")] if txt_file.endswith(".txt") else txt_file
    print("Reading..." + id_file)
    extract_ner_par(id_file)


Reading...books_dataset_1/CharlesDarwin_GeologicalObservationsOnSouthAmerica
1 1 0
{'Reeks', 'Quebrada Onda', 'Mesodesma', 'Ann', 'JULIAN', "M. Parchappe's", 'S. Andres', "M. d'Orbigny's", 'Meyen', 'W. 25 degrees', 'Spongolithis Fustis', 'Megalonyx Jeffersonii', 'H. De la Beche', 'Brand "Travels', 'Bettington', 'Sentinella', '4]Spirifer', 'A. Hayes', 'Owen', 'S. Wolcotii', 'Balanus', 'S. Anna', 'Iquique', 'Domeykus', 'Plazilla', 'Sulivan', 'Huapi', 'Turritella', 'M. E. de Beaumont', 'M. Terrace 2', 'M. Rose', 'Perna Americana', 'Murchison', 'M. Isabelle', '—of Coquimbo', 'Weaver', 'Glyptodon', 'Mytilus Magellanicus', 'Crassatella', 'Terebra', 'Formen', 'Himalaya', 'R. Brown', 'Pernambuco', 'B. Tulipa', 'Calyptraea', 'Maria', "D'Orbigny", 'Mary', 'M. Tschudi', 'Eunotia', 'Gamboa', 'Lumb', 'W. 19 degrees', 'Festlande', "D'Aubuisson", 'H. Unanue', "M. D'Orbigny T.", 'S. Pedro de Nolasko', 'Fusus Cleryanus', 'M. Domeyko', 'Sierras Tapalguen', 'G. Orientalis', 'Williams', 'Isabellei', 'M. T

{'Adam', 'Lady', 'Silas Foster Foster', 'Eliot', 'Hollingsworth', 'Westervelt', '59]Silas Foster', '59]Eliot', 'Foster', "George Sand's", 'Moodie', 'Esquimaux', "O'Shanter", 'Silas', 'Margaret Fuller', 'Silas Foster', 'Burns', 'Duke', 'Tableaux', 'Sisera', 'Lynn', "Tam O'Shanter", 'Jim Crow', '15]Priscilla', 'Theodore', '173]Paul Dudley', 'Charles', 'Goldsmith', 'Eve', 'Priscilla', '85]Silas Foster', 'Devil', 'Michael Scott', '40]Silas Foster', 'Priscilla Priscilla', '87]Theodore'}
Reading...books_dataset_1/ ZaneGrey_TheManoftheForest
1 1 0
