In [None]:
import spacy
from spacy import displacy
# Load the 'en_core_web_md' spaCy pipeline; `nlp` parses raw text below and its vocab is shared with the matcher
nlp = spacy.load('en_core_web_md')

In [None]:
from spacy.tokens import DocBin
from tqdm.auto import tqdm

# DocBin is a compact container for serialising many parsed documents at once
doc_bin = DocBin(
    attrs=["HEAD", "TAG", "LEMMA", "DEP", "POS"],
    store_user_data=True,
)

# Deserialise the pre-parsed corpus: 558 texts, 376MB on disk
with open('texts_english.bin', mode='rb') as bin_file:
    loaded_data = doc_bin.from_bytes(bin_file.read())

In [None]:
# WARNING: loading all the 558 books takes roughly 6GB of RAM
# You might want to try on a small subset first
N_BOOKS = 30

# Never ask for more books than the file actually holds
N_BOOKS = min(N_BOOKS, len(loaded_data))

# Pairing the docs with a progress-bar range of length N_BOOKS makes zip() stop
# after the first N_BOOKS docs have been taken from get_docs()
docs = [
    book
    for _, book in zip(tqdm(range(N_BOOKS)), loaded_data.get_docs(nlp.vocab))
]

In [None]:
# Each book is one large spaCy Doc; its user_data keeps the name of the original text file
first_book = docs[0]
first_book.user_data['file']

## Making it work on examples

In [None]:
# Parse a short example, draw its dependency tree, then list each token's annotations
d = nlp('What is Beauty?')
displacy.render(d)
for tok in d:
    # token text, lemma, coarse part-of-speech, fine-grained tag
    annotations = (tok, tok.lemma_, tok.pos_, tok.tag_)
    print(*annotations)

Here are some example sentences that can be useful for trying out your rule-matching algorithm.

Do not be afraid of exploring the Xenotheka data with simple rules (for instance, every appearance of the word "beauty") to find further examples that could extend these lists.

In [None]:
# Examples the rule should accept: the check further down prints 'MISSING VALID'
# for any of these in which no matched token passes the rule
valid_sentences = [
    'What is beauty?',
    'Beauty is something',
    'Beauty can be defined',
    'In my opinion, beauty is'
]
# Examples the rule should reject: the check further down prints 'INVALID'
# for any of these in which a matched token passes the rule
wrong_sentences = [
    "Athena's beauty",
    "The beauty of this approach lies in",
    "It is one of many beauties",
    "What strikes the viewer is the beauty of the place"
]

### Defining your rules

In [None]:
from spacy.matcher import Matcher
# Build a matcher that shares the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Single-token pattern: any token whose lemma is "beauty"
beauty_pattern = [{"LEMMA": "beauty"}]
# NOTE(review): (key, on_match, *patterns) is the spaCy v2 call form; spaCy v3 expects
# matcher.add(key, [pattern]) instead — confirm against the installed version
matcher.add("isDefiningBeauty", None, beauty_pattern)

def is_token_defining_beauty(token):
    """Decide whether a matched token passes the "defining beauty" rule.

    This is the place to write your own rule. Remember that a token has lots
    of info (token.lemma_, token.tag_, token.pos_), that one can access the
    neighbouring tokens or the dependency tree (token.head, token.children),
    and that the full sentence is also accessible (token.sent).

    Args:
        token: the matched token to inspect.

    Returns:
        bool: True when the rule accepts the token, False otherwise.
    """
    # Dummy condition as an example: the sentence containing the token
    # has to have at least 2 tokens
    sentence_length = len(token.sent)
    return sentence_length >= 2

# You might want to try different approaches
def is_defining_beauty_version_1(token):
    """Alternative rule: accept the token when its syntactic head is a verb.

    Args:
        token: the matched token to inspect; only token.head.pos_ is read.

    Returns:
        bool: True if the direct ancestor of the token has the coarse
        part-of-speech 'VERB', False otherwise.
    """
    head_pos = token.head.pos_
    return head_pos == 'VERB'

### Trying it on examples

In [None]:
# This code tries the rule you have created on the examples and tells you
# if valid sentences were missed or if invalid ones were matched

def _accepted_tokens(txt):
    """Parse `txt` and return the matched tokens that the rule accepts.

    Args:
        txt: raw example sentence, parsed with `nlp`.

    Returns:
        list: every token found by `matcher` for which
        `is_token_defining_beauty` returns True (empty if there is none).
    """
    parsed = nlp(txt)
    # Each match is (match_id, start, end); the token at `start` is the one to test
    candidates = (parsed[start] for _, start, _ in matcher(parsed))
    # Add additional logic here
    return [tok for tok in candidates if is_token_defining_beauty(tok)]


# Valid examples: the rule has to accept at least one matched token in each
for txt in valid_sentences:
    if not _accepted_tokens(txt):
        print('MISSING VALID')
        print(txt)

# Wrong examples: the rule must not accept any matched token
for txt in wrong_sentences:
    accepted = _accepted_tokens(txt)
    if accepted:
        print('INVALID')
        # Show the sentence of every token that was actually accepted (once per
        # sentence), rather than the sentence of whichever token matched last
        shown_starts = set()
        for tok in accepted:
            sent = tok.sent
            if sent.start not in shown_starts:
                shown_starts.add(sent.start)
                print(sent)

## Trying on the Xenotheka data

In [None]:
# Run the matcher over every loaded book and print each sentence the rule accepts
for doc in tqdm(docs):
    for match_id, start, end in matcher(doc):
        # match_id is the id of the matching rule; looking it up in the vocab's
        # string store gives back its name (here 'isDefiningBeauty')
        string_id = nlp.vocab.strings[match_id]
        # Get the token from the starting position in the document
        t = doc[start]
        if not is_token_defining_beauty(t):
            continue
        filename = doc.user_data['file']
        # Printing the original filename and the approximate position in the document
        print(f'{filename} (token {t.i}/{len(doc)} - {t.i*100/len(doc):f}%)')
        print(t.sent)
        print('---')