In [130]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
nlp = spacy.load('en_core_web_trf')

from tqdm import tqdm
import re
%matplotlib inline

In [13]:
nlp.pipeline

[('transformer',
  <spacy_transformers.pipeline_component.Transformer at 0x7fb6305f26d0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fb6305f29f0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fb6305f8520>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fb6305d7940>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fb6305d4200>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fb6305aed60>)]

In [77]:
string = """Brown bears have long claws. 
The Brown bear has a brown fur. 
The Brown bear has a black nose.
The brown bear has a black nose.
The bear has a purple belly.
Their feet are orange.
The brown bear is similar to the black bear.
"""

In [181]:
string = """
Brown bears have very large and curved claws, those present on the forelimbs being longer than those on the hind limbs. They may reach 5 to 6 centimetres (2.0 to 2.4 in) and may measure 7 to 10 centimetres (2.8 to 3.9 in) along the curve.[69] They are generally dark with a light tip, with some forms having completely light claws.[62] Brown bear claws are longer and straighter than those of American black bears (Ursus americanus).[61] The claws are blunt, while those of a black bear are sharp. Due to their claw structure, in addition to their excessive weight, adult brown bears cannot typically climb trees as well as both species of black bear, although in rare cases adult female brown bears have been seen in trees.[70] The claws of a polar bear are also quite different, being notably shorter but broader with a strong curve and sharper point, presumably both as an aid to traveling over ice (sometimes nearly vertically) and procuring active prey.[23][71] The paws of the brown bear are quite large. The rear feet of adult bears have been found to typically measure 21 to 36 cm (8.3 to 14.2 in) long, while the forefeet tend to measure about 40% less in length. All four feet in average sized brown bears tend to be about 17.5 to 20 cm (6.9 to 7.9 in) in width. In large coastal or Kodiak bear males, the hindfoot may measure up to 40 cm (16 in) in length, 28.5 cm (11.2 in) in width, while outsized Kodiak bears having had confirmed measurements of up to 46 cm (18 in) along their rear foot.[72][73][74] Brown bears are the only extant bears with a hump at the top of their shoulder, which is made entirely of muscle, this feature having developed presumably for imparting more force in digging, which is habitual during foraging for most bears of the species and also used heavily in den construction prior to hibernation.[61][75] The brown bear's strength has been roughly estimated as 2.5 to 5 times that of a human.[76]
"""

In [182]:
def TextCleaner(text, splitComma=False, splitPointComma=False):
    
    """
    Split text into sentences and cleans them 
    from e.g. whitespaces and references.
    """
    
    # Split on commas
    if splitComma:
        text = text.replace(', ', '. ')
    if splitPointComma:
        text = text.replace('; ', '. ')
    text = nlp(text)
    # Init result
    result = []
    # Init regexes
    TextCleaner = [
            ' \(\d+.+?Close\n\t\n\)',
            '\[\d*\]',
            '\([^)]*\)'
    ]
    # Extract single senteces if large enough
    sentences = [str(sent) for sent in text.sents if len(sent) >= 3]
        
    for sentence in sentences:
        #print(sentence.lower())
        # Apply the regexes
        for Cleaner in TextCleaner:
            sentence = re.sub(Cleaner, '', sentence, flags=re.DOTALL)
        # Clean tabs and newlines
        sentence = sentence.replace('\n', '')
        sentence = sentence.replace('\t', '')
        # Add to result
        result.append(sentence.lower())
    
    return result

In [183]:
sentences_cleaned = TextCleaner(string)
sentences_cleaned

['brown bears have very large and curved claws, those present on the forelimbs being longer than those on the hind limbs.',
 'they may reach 5 to 6 centimetres  and may measure 7 to 10 centimetres  along the curve.',
 'they are generally dark with a light tip, with some forms having completely light claws.',
 'brown bear claws are longer and straighter than those of american black bears .',
 'the claws are blunt, while those of a black bear are sharp.',
 'due to their claw structure, in addition to their excessive weight, adult brown bears cannot typically climb trees as well as both species of black bear, although in rare cases adult female brown bears have been seen in trees.',
 'the claws of a polar bear are also quite different, being notably shorter but broader with a strong curve and sharper point, presumably both as an aid to traveling over ice  and procuring active prey.',
 'the paws of the brown bear are quite large.',
 'the rear feet of adult bears have been found to typicall

In [355]:
def NounChunkExtractor(sentence):
    
    """
    Extracts the noun chunks of a sentence.
    """
    
    # Init matcher
    matcher = Matcher(nlp.vocab)
    # Init empty list
    relations = []
    # Relation patterns
    AcompPattern = [
                [{"DEP": "ROOT"}, {"DEP": "advmod", "OP": "?"}, {"DEP": "acomp", "OP": "?"}, {"DEP": "prep", "OP": "?"}, {"DEP": "cc", "OP": "?"}, {"DEP": "conj", "OP": "?"}],
                [{"POS": "VERB"}, {"DEP": "acomp", "OP": "?"}, {"DEP": "prep", "OP": "?"}],
    ]
    # Add the matcher
    matcher.add("Description", AcompPattern)
    
    # Nlp the sentence
    sentence = nlp(sentence)    
    # Extract chunks
    chunks = [chunks.text for chunks in sentence.noun_chunks]  
    # Drop doubles
    relations = list(dict.fromkeys(relations))
    # Replace the relation if a match if found
    matches = matcher(sentence)
    if matches:
        for _, start, end in matches:
            #print(start, end)
            # The matched span 
            extracted = sentence[start:end]
            #print(extracted)
            relations.append(extracted)
        # Remove doubles
        relations = filter_spans(relations)
     
    if len(chunks) == 1:        
        RDF_1 = (chunks[0], relations[0][0], relations[0][1])
        RDF_2 = (None)
    
    elif len(chunks) == 2 and len(relations) == 2:
        RDF_1 = (chunks[0], relations[0][0], relations[0][1])        
        RDF_2 = (chunks[1], relations[1][0], relations[1][1])
        
    elif len(chunks) == 2 and len(relations) != 2:
        RDF_1 = (chunks[0], relations[0], chunks[1])
        RDF_2 = (None)
    
    elif len(chunks) == 4:
        RDF_1 = (chunks[0], relations[0], chunks[1])
        RDF_2 = (chunks[2], relations[1], chunks[3])
    
    else:
        RDF_1 = (None)
        RDF_2 = (None)


    return RDF_1, RDF_2

In [356]:
[NounChunkExtractor(i) for i in tqdm(sentences_cleaned)]

100%|███████████████████████████████████████████| 13/13 [00:01<00:00,  8.79it/s]


[(('brown bears', have very, 'very large and curved claws'),
  ('the forelimbs', being longer than, 'the hind limbs')),
 (('they', reach, '5 to 6 centimetres'),
  ('7 to 10 centimetres', measure, 'the curve')),
 (('they', are generally dark with, 'a light tip'),
  ('some forms', having, 'completely light claws')),
 (('brown bear claws', are longer and straighter, 'american black bears'),
  None),
 (('the claws', are, blunt), ('a black bear', are, sharp)),
 (None, None),
 (None, None),
 (('the paws', are quite large, 'the brown bear'), None),
 (None, None),
 (('all four feet', tend, 'average sized brown bears'),
  ('about 17.5 to 20 cm', be, 'width')),
 (None, None),
 (None, None),
 (("the brown bear's strength", estimated as, 'a human'), None)]

In [339]:
NounChunkExtractor(sentences_cleaned[-3])

(None, None)

In [276]:
for token in nlp(sentences_cleaned[3]):
    print(token.text, '--', token.pos_, '--', token.dep_)

brown -- ADJ -- amod
bear -- NOUN -- compound
claws -- NOUN -- nsubj
are -- AUX -- ROOT
longer -- ADJ -- acomp
and -- CCONJ -- cc
straighter -- ADJ -- conj
than -- SCONJ -- prep
those -- DET -- pobj
of -- ADP -- prep
american -- ADJ -- amod
black -- ADJ -- amod
bears -- NOUN -- pobj
. -- PUNCT -- punct


In [358]:
displacy.render(nlp(sentences_cleaned[-6]), style="dep")

In [351]:
import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load("en_core_web_trf")
doc = nlp('brown bears have very large and curved claws, those present on the forelimbs being longer than those on the hind limbs.')



In [352]:
# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
verbs

{have, being}

In [None]:
"""
def NounChunkExtractor(sentence):
    
    """
    Extracts the noun chunks of a sentence.
    """
    
    # Init matcher
    matcher = Matcher(nlp.vocab)
    # Init empty list
    AcompList = []
    # Relation patterns
    AcompPattern = [
                [{"DEP": "ROOT"}, {"DEP": "advmod", "OP": "?"}, {"DEP": "acomp"}, {"DEP": "prep", "OP": "?"}, {"DEP": "cc", "OP": "?"}, {"DEP": "conj", "OP": "?"}],
                [{"POS": "VERB"}, {"DEP": "acomp"}, {"DEP": "prep", "OP": "?"}]
    ]
    # Add the matcher
    matcher.add("Description", AcompPattern)
    
    # Nlp the sentence
    sentence = nlp(sentence)    
    # Extract chunks
    chunks = [chunks.text for chunks in sentence.noun_chunks]  
    # Extract relations
    #relation = [roots.root.head for roots in sentence.noun_chunks]
    relations = [verbs.text for verbs in sentence if verbs.pos_ == 'VERB' or verbs.pos_ == 'AUX']
    # Drop doubles
    relations = list(dict.fromkeys(relations))
    # Replace the relation if a match if found
    matches = matcher(sentence)
    if matches:
        for _, start, end in matches:
            #print(start, end)
            # The matched span 
            extracted = sentence[start:end]
            print(extracted)
            AcompList.append(extracted)
        # Remove doubles
        AcompList = filter_spans(AcompList)
        #print(AcompList)
        #print(relations)
        # Replace NEEDS TO BE ADJUSTED
        for count, relation in enumerate(relations):
            for acomps in AcompList:
                if relation in acomps.text:
                    relations[count] = acomps.text
    print(relations)
    
    if len(chunks) == 1:        
        RDF_1 = (chunks[0], relations[0][0], relations[0][1])
        RDF_2 = (None, None, None)
    
    if len(chunks) == 2:
        RDF_1 = (chunks[0], relations[0], chunks[1])
        RDF_2 = (None, None, None)
    
    if len(chunks) == 4:
        RDF_1 = (chunks[0], relations[0], chunks[1])
        RDF_2 = (chunks[2], relations[1], chunks[3])

    return RDF_1, RDF_2