In [83]:
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import glob
import io
import os

In [2]:
import neuralcoref
import spacy

In [3]:
print('spacy version:', spacy.__version__)
print('neuralcoref version:', neuralcoref.__version__)

spacy version: 2.1.3
neuralcoref version: 4.0.0


In [114]:
def standardize_text(book, cutting_flag=False):
    # removing all \r
    book = book.replace('\r', '')
    # marking all paragraph starts (\n\n) with .\r for later
    book = book.replace('\n\n', '\r')
    # replacing all \r with \n\n to remark the paragraphs
    book = book.replace('\r', '\n\n')
    # replacing any possible duplicated full stops
    book = book.replace('..', '.')
    # reconstructing possible damaged ellipsis (...)
    book = book.replace('..', '...')
    # removing underscores
    book = book.replace('_', '')
    # removing guion
    book = book.replace('-', ' ')
    return book

In [115]:
def remove_chapter_markers(book, book_id):
    paragraphs = book.split('\n\n')
    paragraphs_new = []
    chapters = []
    old_idx = 0
    contents = []
    for paragraph in paragraphs:
        lines = paragraph.split('\n')
        if re.match('[IVXLCDM\d]+[\.]*', lines[0].upper().strip()):
            if len(contents) == 0:
            	contents = [" ".join(line.lower().split()) for line in lines]
            	if len(contents) == 0:
            		print("Not found content", book_id)
            		exit()
            continue	
        if paragraph.lower().strip() in contents:
            if old_idx == 0:
                old_idx = len(paragraphs_new)
            else:
                current_idx = len(paragraphs_new)
                chapter = "\n\n".join(paragraphs_new[old_idx:current_idx])
                chapters.append(chapter)
                old_idx = current_idx
            continue
        if not re.match('.*[\w]+.*', paragraph):
            continue
        paragraphs_new.append(paragraph)
    if old_idx != 0:#last chapter
        chapter = "\n\n".join(paragraphs_new[old_idx:len(paragraphs_new)])
        chapters.append(chapter)
    print(len(paragraphs), len(paragraphs_new), len(chapters))
    return paragraphs_new, chapters


In [89]:
def retrieve_text(index):
    f = io.open(str(index) + '.txt', 'r', encoding='utf-8')
    text = f.read()
    f.close()
    return text

In [121]:
def extract_ner_par(book_id, output_dir):
    print('book_id', book_id)
    print('output', output_dir)
    book_id2 = dir_books + book_id
    
    book = standardize_text(retrieve_text(book_id2))
    paragraphs, chapters = remove_chapter_markers(book, book_id2)
    
    geral_entities = []
    with open(output_dir + book_id + ".txt", "w") as f:
        for i, paragraph in enumerate(paragraphs):
            paragraph = paragraph.replace('\n', ' ')
            paragraph = " ".join(paragraph.split())
            doc = nlp(paragraph)
            doc_coref = doc._.coref_resolved
            doc_coref = nlp(doc_coref)
            entities = []
            for ent in doc_coref.ents:
                if ent.label_ == "PERSON":
                    entities.append(ent.text)
        if len(entities)>0:
            len_words = len(paragraph.split())
            geral_entities.extend(entities)
            f.write("["+str(len_words)+"]")
            f.write(", ".join(entities)+"\n")
    print(set(geral_entities))

In [91]:
dir_books = "books_dataset_1/"

In [119]:
output_dir = "processed/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [93]:
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f67b10e94d0>

In [117]:
txt_files = glob.glob(dir_books + "*.txt")

In [123]:
for txt_file in txt_files:
    print(txt_file)
    
    id_file = os.path.splitext(os.path.basename(txt_file))[0]
    
    # id_file = txt_file.replace(".txt","")
    
    print("\nReading..." + id_file)
    
    extract_ner_par(id_file, output_dir)
    

books_dataset_1/BoothbyGuy_37081.txt

Reading...BoothbyGuy_37081
book_id BoothbyGuy_37081
output processed/
1761 1428 0
{'Project Gutenberg'}
books_dataset_1/CharlesDarwin_GeologicalObservationsOnSouthAmerica.txt

Reading...CharlesDarwin_GeologicalObservationsOnSouthAmerica
book_id CharlesDarwin_GeologicalObservationsOnSouthAmerica
output processed/
1317 842 0
{'Project Gutenberg'}
books_dataset_1/ThomasHardy_AChangedManAndOtherTales.txt

Reading...ThomasHardy_AChangedManAndOtherTales
book_id ThomasHardy_AChangedManAndOtherTales
output processed/
1973 1604 0


KeyboardInterrupt: 