In [4]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [5]:
# import wikipedia sentences
candidate_sentences = pd.read_csv("wiki_sentences_v2.csv")
candidate_sentences.shape

(4318, 1)

In [6]:
candidate_sentences['sentence'].sample(5)

206     this extraordinarily rich and innovative silent classic  is a dickensian ghost story and a deeply moving morality tale, as well as a showcase for groundbreaking special effects.
2250                                                                                                                           raees was said to be based on criminal abdul latif's life.
2473                                                                                                        during the 2019 d23 expo, it was revealed that there will be seven new songs.
2659                                                                                                                                              rallycross car pictured on agfa 1000 rs
700                                                                       first was the involvement of actor and utah resident robert redford, who became the festival's inaugural chair.
Name: sentence, dtype: object

In [7]:
doc = nlp("the drawdown process is governed by astm standard d823")

for tok in doc:
  print(tok.text, "...", tok.dep_)

the ... det
drawdown ... compound
process ... nsubjpass
is ... auxpass
governed ... ROOT
by ... agent
astm ... compound
standard ... pobj
d823 ... punct


In [8]:

"""
These nodes are going to be the entities that are present in the Wikipedia sentences. 
Edges are the relationships connecting these entities to one another.
We will extract these elements in an unsupervised manner, i.e., 
we will use the grammar of the sentences.
"""

'\nThese nodes are going to be the entities that are present in the Wikipedia sentences. \nEdges are the relationships connecting these entities to one another.\nWe will extract these elements in an unsupervised manner, i.e., \nwe will use the grammar of the sentences.\n'

In [9]:
#The main idea is to go through a sentence and extract the subject and the object 
#as and when they are encountered


In [10]:
def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [11]:
get_entities("the film had 200 patents")


['film', '200  patents']

In [12]:
entity_pairs = []

for i in tqdm(candidate_sentences["sentence"]):
  entity_pairs.append(get_entities(i))

100%|█████████████████████████████████████████████████████████████████████████████| 4318/4318 [00:31<00:00, 137.99it/s]


In [13]:
entity_pairs[10:20]

[['we', 'tests'],
 ['m', 'international sales rights'],
 ['musician robbie robertson', 'soundtrack'],
 ['it', 'original music tracks'],
 ['it', 'reviewed  franchise'],
 ['she', 'accidentally  mystique'],
 ['military  forces', 'arrest'],
 ['train', 'vuk'],
 ['kota eberhardt', 'telepath selene gallio'],
 ['singer', '-']]

In [42]:
def get_relation(sent):

  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", [pattern], on_match=None) 

  matches = matcher(doc)
  #k = len(matches) - 1

  span = doc[matches[0][1]:matches[0][2]] 
  return(span.text)





In [41]:
get_relation("John completed the task")


'completed'

In [43]:
relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]





  0%|                                                                                         | 0/4318 [00:00<?, ?it/s][A[A[A[A



  0%|▎                                                                              | 15/4318 [00:00<00:28, 149.82it/s][A[A[A[A



  1%|▌                                                                              | 29/4318 [00:00<00:29, 145.84it/s][A[A[A[A



  1%|▊                                                                              | 45/4318 [00:00<00:28, 147.36it/s][A[A[A[A



  1%|█                                                                              | 59/4318 [00:00<00:29, 142.44it/s][A[A[A[A



  2%|█▎                                                                             | 73/4318 [00:00<00:30, 141.29it/s][A[A[A[A



  2%|█▌                                                                             | 88/4318 [00:00<00:29, 143.69it/s][A[A[A[A



  2%|█▉                                            

IndexError: list index out of range





  6%|████▍                                                                         | 249/4318 [00:18<00:31, 128.89it/s][A[A[A[A