In [1]:
# Imports:
import spacy
import re

# Load the 'en_core_web_lg' spaCy pipeline once and bind it to `nlp`;
# the cells below call `nlp(text)` to process text.
nlp = spacy.load('en_core_web_lg')

In [2]:
# Sample free-text note used as the single input for the cells below.
# The spelling mistakes inside the text are part of the data and are left as-is.
# NOTE(review): the text contains personal names and a phone number — confirm it is
# synthetic/anonymised before sharing this notebook.
sentence = "Mark Hitchings, Scheme Manager Swn Yr Afon contacted the department requesting an assessment on Mary behalf. Client is displaying paraniod behaviour, belives that other residents are spraying things in her flat. Mrs Hughes is causing arguments between other residents within the complex\n\nPlease contact Mark 01443 673329"

In [3]:
# Checking if a sentence is in caps:
def calculate_capital_percentage(string):
    """Return the percentage (0-100) of ASCII letters in ``string`` that are uppercase.

    Only the characters a-z and A-Z are counted as letters; digits, spaces,
    punctuation and non-ASCII characters are ignored.

    Args:
        string: The text to inspect.

    Returns:
        A float between 0 and 100. When ``string`` contains no ASCII letters
        at all (empty, digits only, punctuation only) the result is ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    total_letters = len(re.findall(r'[a-zA-Z]', string))
    if total_letters == 0:
        # Nothing to measure: treat letter-free text as "not capitalised".
        return 0.0
    capital_letters = len(re.findall(r'[A-Z]', string))
    return (capital_letters / total_letters) * 100

# Function to clean text for NER:
def clean_text_for_NER(text):
    """Normalise a free-text string before it is passed to the NER pipeline.

    Steps, in order:
      1. Every run of newline / carriage-return characters becomes ". ".
      2. Characters other than ASCII letters, digits, whitespace and the
         punctuation ``? , : " ! . '`` are removed.
      3. Runs of two or more whitespace characters collapse to one space.
      4. A single space between two digits is removed, so digit groups such
         as "01443 673329" are joined into one number.
      5. Leading and trailing whitespace is stripped.
      6. If more than 65% of the remaining letters are uppercase, the whole
         text is lower-cased.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Remove newline characters:
    text = re.sub(r'[\n\r]+', '. ', text)
    # Only keep Alphabets, Digits, Spaces, and Commonly Used Punctuations:
    text = re.sub(r'[^a-zA-Z0-9\s?,:"!.\']', '', text)
    # Remove extra spaces:
    text = re.sub(r'\s{2,}', ' ', text)
    # Join digits together (raw string so that `\d` is not an invalid str escape):
    text = re.sub(r'(?<=\d) (?=\d)', '', text)
    text = text.strip()
    # Only measure capitalisation when there is at least one letter; text with
    # no letters has no meaningful capital percentage and must not divide by zero.
    if re.search(r'[a-zA-Z]', text) and calculate_capital_percentage(text) > 65:
        text = text.lower()
    return text

# Clean the sample text and wrap it in a one-element list, because the
# following cells iterate over `cleaned_data` as a list of texts.
cleaned_data = [clean_text_for_NER(sentence)]

In [4]:
# Function to run the spaCy pipeline (which includes NER) on a text:
def perform_spacy_ner(text):
    """Process ``text`` with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The string to process.

    Returns:
        Whatever ``nlp(text)`` returns, i.e. the processed document object.
    """
    return nlp(text)

# First pass: process every cleaned text as a whole so that it can be
# split into sentences via `.sents`.
ner_data = list(map(perform_spacy_ner, cleaned_data))

# Second pass: process each sentence again on its own, giving one list of
# per-sentence documents for every data point.
ner_complete = []
for data_point in ner_data:
    ner_sentences = list(map(perform_spacy_ner, map(str, data_point.sents)))
    ner_complete.append(ner_sentences)

In [5]:
# Take the per-sentence documents of the first (and here only) data point
# and show them; each list item is one processed sentence.
datapoint = ner_complete[0]
print(datapoint)

[Mark Hitchings, Scheme Manager Swn Yr Afon contacted the department requesting an assessment on Mary behalf., Client is displaying paraniod behaviour, belives that other residents are spraying things in her flat., Mrs Hughes is causing arguments between other residents within the complex., Please contact Mark 01443673329]


In [6]:
def _phrases_for_tokens(tokens, noun_chunks):
    """Return the noun-phrase texts that contain ``tokens``, then any uncovered token texts.

    Membership is decided by token position (``chunk.start <= token.i < chunk.end``)
    rather than by substring tests on the text, so a short token such as "he"
    is not matched against an unrelated phrase such as "the department".

    Args:
        tokens: Tokens (subjects or objects) from one document.
        noun_chunks: Noun-chunk spans from the same document.

    Returns:
        A list of strings: first the text of every chunk that contains at
        least one of ``tokens`` (in chunk order, each chunk once), then the
        text of every token that no chunk contains (in token order).
    """
    def covers(chunk, token):
        return chunk.start <= token.i < chunk.end

    covering = [chunk.text for chunk in noun_chunks
                if any(covers(chunk, token) for token in tokens)]
    uncovered = [token.text for token in tokens
                 if not any(covers(chunk, token) for chunk in noun_chunks)]
    return covering + uncovered


datapoint_triplet = []
for sentence_ner in datapoint:
    # Get Noun Phrases (spans are kept for position checks, texts for reference):
    noun_chunks = list(sentence_ner.noun_chunks)
    noun_phrases = [chunk.text for chunk in noun_chunks]

    # Get the root of the dependency tree (the token that is its own head).
    # `None` when the document has no tokens, instead of raising IndexError.
    root_token = next((token for token in sentence_ner if token.head == token), None)

    # Only the root and its direct children are inspected: subjects and direct
    # objects attached to the root form the main clause of the sentence.
    candidates = [root_token, *root_token.children] if root_token is not None else []
    subject_tokens = [token for token in candidates if "subj" in token.dep_]
    object_tokens = [token for token in candidates if "dobj" in token.dep_]
    subjects = [token.text for token in subject_tokens]
    objects = [token.text for token in object_tokens]

    # Getting Subject Phrases:
    subject_phrases = _phrases_for_tokens(subject_tokens, noun_chunks)

    # Getting Object Phrases:
    object_phrases = _phrases_for_tokens(object_tokens, noun_chunks)

    # Create a sentence triplet using Subject Phrases + Root + Object Phrases:
    root_text = root_token.text if root_token is not None else ""
    sentence_triplet = [" ".join(subject_phrases), root_text, " ".join(object_phrases)]

    print("Subject: {}".format(subjects))
    print("Object: {}".format(objects))
    print("===============================\n")

    # Append the sentence triplets into a datapoint triplet:
    datapoint_triplet.append(sentence_triplet)

Subject: ['Hitchings']
Object: ['department']

Subject: ['Client']
Object: ['behaviour']

Subject: ['Hughes']
Object: ['arguments']

Subject: []
Object: ['Mark']



In [7]:
# Show one [subject phrase(s), root word, object phrase(s)] triplet per sentence.
for triplet in datapoint_triplet:
    print(triplet)

['Mark Hitchings', 'contacted', 'the department']
['Client', 'displaying', 'paraniod behaviour']
['Mrs Hughes', 'causing', 'arguments']
['', 'contact', 'Mark']


In [8]:
# Rendering the Tree:
# Visualise the processed sentences of the first data point with displaCy.
# No `style` argument is passed, so displaCy's default visualiser is used.
spacy.displacy.render(ner_complete[0])

In [9]:
def traverse_tree(token, depth=0):
    """Print ``token`` and, recursively, every token below it in the dependency tree.

    Each line has the form ``<text> - <dependency label>`` and is indented by
    two spaces per level below the starting token.

    Args:
        token: The token to start from.
        depth: Current nesting level; controls the indentation of the line.
    """
    print("  " * depth + token.text + " - " + token.dep_)
    for child in token.children:
        traverse_tree(child, depth + 1)


for sentence_ner in ner_complete[0]:
    # Step 1: Get the root of the dependency tree (the token that is its own head).
    root_token = next((token for token in sentence_ner if token.head == token), None)
    if root_token is None:
        # A document without tokens has no tree to print.
        continue

    # Step 2: Start traversal from the root
    print("Traversal of the dependency tree: {}".format(root_token))
    traverse_tree(root_token)
    print("=========================================================")

Traversal of the dependency tree: contacted
contacted - ROOT
  Hitchings - nsubj
    Mark - compound
    , - punct
    Afon - appos
      Manager - compound
        Scheme - compound
      Swn - compound
      Yr - compound
  department - dobj
    the - det
    requesting - acl
      assessment - dobj
        an - det
        on - prep
          behalf - pobj
            Mary - compound
  . - punct
Traversal of the dependency tree: displaying
displaying - ROOT
  Client - nsubj
  is - aux
  behaviour - dobj
    paraniod - amod
  , - punct
  belives - dep
    spraying - relcl
      that - mark
      residents - nsubj
        other - amod
      are - aux
      things - dobj
      in - prep
        flat - pobj
          her - poss
  . - punct
Traversal of the dependency tree: causing
causing - ROOT
  Hughes - nsubj
    Mrs - compound
  is - aux
  arguments - dobj
    between - prep
      residents - pobj
        other - amod
        within - prep
          complex - pobj
            the - 