# Relation Extraction

In [1]:
import pandas as pd
import csv
import numpy as np
import os
import opennre
import re
from openie import StanfordOpenIE

In [8]:
concept_pairs = [('kylo_ren', 'r2-d2'), 
                 ('kylo_ren', 'kuiil'), 
                 ('kylo_ren', 'ben'), 
                 ('kylo_ren', 'anakin'), 
                 ('kylo_ren', 'princess_leia'), 
                 ('favreau', 'we'), 
                 ('favreau', 'november'), 
                 ('favreau', 'you'), 
                 ('favreau', 'disney'),
                 ('favreau', 'lucas'),
                 ('the_events', 'the_last_jedi'), 
                 ('the_events', 'the_galactic_republic'), 
                 ('the_events', 'the_clone_wars'),
                 ('the_events', 'a_new_hope')]
len(concept_pairs)

14

In [108]:
concept_pairs = [('an_agent', 'r2-d2'), 
                 ('an_agent', 'kuiil'), 
                 ('an_agent', 'ben'), 
                 ('an_agent', 'anakin'), 
                 ('an_agent', 'princess_leia')]
len(concept_pairs)

5

In [None]:
Closest Related Pair: (r2-d2, kueller), PMI: 1.6314168191528755
Closest Related Pair: (princess_leia, their_real_mother.), PMI: 2.3107706775804853
Closest Related Pair: (ben, son.), PMI: 2.3107706775804853
Closest Related Pair: (anakin, replies), PMI: 0.8082165103447326
Closest Related Pair: (kuiil, occasionally), PMI: 2.1353220000742925
Closest Related Pair: (kylo_ren, an_agent), PMI: 5.502617830060767

In [2]:
dataset = pd.read_csv('./dataset/starwars_text_dataset_cleaned.txt', delimiter='\n', header=None, error_bad_lines=False)
dataset

b'Skipping line 2445: expected 1 fields, saw 2\nSkipping line 3096: expected 1 fields, saw 2\nSkipping line 10258: expected 1 fields, saw 2\nSkipping line 11580: expected 1 fields, saw 2\nSkipping line 16550: expected 1 fields, saw 2\nSkipping line 17640: expected 1 fields, saw 2\n'


Unnamed: 0,0
0,luke_skywalker is a_fictional_character and th...
1,"portrayed by mark_hamill, luke first appeared ..."
2,": the_force awakens (2015),the"
3,"last_jedi (2017), and the_rise of luke_skywalk..."
4,"the_rescue"" (2020), voicing the_character that..."
...,...
20509,"in 2016, serkis was nominated for an_mtv_movie..."
20510,some_viewers felt that snoke's_character_arc w...
20511,various_fan_theories about his_origins were he...
20512,serkis addressed the_criticisms by saying prod...


### OpenNRE

In [4]:
re_model = opennre.get_model('wiki80_bertentity_softmax')
re_model.infer({'text': 'He was the son of Máel Dúin mac Máele Fithrich, and grandson of the high king Áed Uaridnach (died 612).', 'h': {'pos': (18, 46)}, 't': {'pos': (78, 91)}})

2021-03-29 12:45:03,237 - root - INFO - Loading BERT pre-trained checkpoint.


('father', 0.9927453398704529)

In [None]:
for index, row in dataset.iterrows():
    text = dataset.iloc[index][0]
    for concept1, concept2 in concept_pairs:
        found1 = re.search('(^|\W)'+concept1+'($|\W)', text)
        found2 = re.search('(^|\W)'+concept2+'($|\W)', text)
        if found1 is not None and found2 is not None:
            relation_pred = re_model.infer({'text': text, 'h': {'pos': found2.span()}, 't': {'pos': found1.span()}})
            print('Concepts: ({}, {}), Sentence: {}, Relation: {}'.format(concept1, concept2, text, relation_pred))            
            print()

In [None]:
opennre.download('wiki_distant', root_path='./Wiki_Distant')

### Stanford OpenIE

In [None]:
final_triples = []
for index, row in dataset.iterrows():
    text = dataset.iloc[index][0]
    
    for concept1, concept2 in concept_pairs:
        found1 = re.search('(^|\W)'+concept1+'($|\W)', text)
        found2 = re.search('(^|\W)'+concept2+'($|\W)', text)
        
#         if found1 is not None and found2 is not None:
        if concept1 in text and concept2 in text:
            doc = nlp(text)
            sentences = [sent.string.strip() for sent in doc.sents]
            triples = []
            for sentence in sentences:
                triples += [p.triple for p in minie.get_propositions(sentence)]
            
            for t in triples:
#                 print(t)
                if concept1 in t and concept2 in t:
                    final_triples.append((concept1, t[1], concept2))
                    print()
                    print('({}, {})'.format(concept1, concept2))
                    print(text)
                    print("\t{}".format(t))
                    print()

In [None]:
final_triples = []
with StanfordOpenIE() as client:
    for index, row in dataset.iterrows():
        text = dataset.iloc[index][0]
        for concept1, concept2 in concept_pairs:
            found1 = re.search('(^|\W)'+concept1+'($|\W)', text)
            found2 = re.search('(^|\W)'+concept2+'($|\W)', text)
            
            if found1 is not None and found2 is not None:
                doc = nlp(text)
                sentences = [sent.string.strip() for sent in doc.sents]

                triples = []
                for sentence in sentences:
                    for triple in client.annotate(sentence):
                        triples.append(triple)

                print(concept1, concept2)
                for t in triples:
                    print(t)
                    print()
                    
                    if concept1 in t and concept2 in t:
                        final_triples.append((concept1, t[1], concept2)) 
                        print()
                        print('({}, {})'.format(concept1, concept2))
                        print("\t{}".format(t))
                        print()

In [43]:
final_triples

[]

### MinIE

In [24]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [25]:
import os
os.environ['CLASSPATH'] = '../../../miniepy/minie-0.0.1-SNAPSHOT.jar'
from miniepy import *

minie = MinIE()

In [111]:
final_triples1 = []
final_triples2 = []
final_triples_both = []

triple_collection = {}
tokenizer = spacy.tokenizer.Tokenizer(spacy.lang.en.English().vocab)

for index, row in dataset.iterrows():
    text = dataset.iloc[index][0]
    
    for concept1, concept2 in concept_pairs:
        found1 = re.search('(^|\W)'+concept1+'($|\W)', text)
        found2 = re.search('(^|\W)'+concept2+'($|\W)', text)
        
        if found1 is not None and found2 is not None:
#         if concept1 in text and concept2 in text:
            doc = nlp(text)
            sentences = [sent.string.strip() for sent in doc.sents]
            triples = []
            for sentence in sentences:
                triples += [p.triple for p in minie.get_propositions(sentence)]
            
            print(concept1, concept2)
            for t in triples:
#                 print(t)
                if len(t) != 3: continue
                
                if concept2 in t[0] or concept2 in t[2]:
                    final_triples2.append(t)
                if concept1 in t[0] or concept1 in t[2]:
                    final_triples1.append(t)
                    if concept2 in t[0] or concept2 in t[2]:
                        final_triples_both.append(t)
        

an_agent ben


In [115]:
final_triples1

[('in 1977 leia is princess of the_planet_alderaan',
  'is a_member of',
  'an_agent')]

In [116]:
final_triples2

[('leia', 'have a_son named', 'ben solo'),
 ('han', 'have a_son named', 'ben solo'),
 ('ben solo',
  'adopted',
  'the_name kylo_ren after turning to the_dark_side of the_force'),
 ('ben solo', 'became the_lead_enforcer for', 'the_first_order')]

In [117]:
final_triples_both

[]

In [118]:
concept_pairs = [('an_agent', 'r2-d2'), 
                 ('an_agent', 'kuiil'), 
                 ('an_agent', 'ben'), 
                 ('an_agent', 'anakin'), 
                 ('an_agent', 'princess_leia')]
len(concept_pairs)

5

In [120]:
ontology = []
for concept1, concept2 in concept_pairs:
    for f1 in final_triples1:
        if concept1 in f1[0] or concept1 in f1[2]:
            ontology.append([concept1, f1[1], ])
#     print(concept1, concept2)

('in 1977 leia is princess of the_planet_alderaan', 'is a_member of', 'an_agent')
('in 1977 leia is princess of the_planet_alderaan', 'is a_member of', 'an_agent')
('in 1977 leia is princess of the_planet_alderaan', 'is a_member of', 'an_agent')
('in 1977 leia is princess of the_planet_alderaan', 'is a_member of', 'an_agent')
('in 1977 leia is princess of the_planet_alderaan', 'is a_member of', 'an_agent')


In [102]:
#         concept1_words = [token.orth_ for token in tokenizer(concept1)]
#         concept2_words = [token.orth_ for token in tokenizer(concept2)]
            
        
    
#         for word1 in concept1_words:
#             for word2 in concept2_words:
#                 if word1 in text and word2 in text:
#                     doc = nlp(text)
#                     sentences = [sent.string.strip() for sent in doc.sents]
#                     triples = []
#                     for sentence in sentences:
#                         triples += [p.triple for p in minie.get_propositions(sentence)]
        
#         for t in triples:
#             if len(t) != 3: continue
            
#             for word1 in concept1_words:
#                 for word2 in concept2_words:
#                     if word1 in t[0] or word1 in t[2]:
#                         if word2 in t[0] or word2 in t[2]:
#                             print(word1, word2)
#                             print(t)
#                             final_triples1.append(t)
#                             print()
        
#         if concept1 in text and concept2 in text:
#             print(concept1, concept2)
#             doc = nlp(text)
#             sentences = [sent.string.strip() for sent in doc.sents]
#             triples = []
#             for sentence in sentences:
#                 triples += [p.triple for p in minie.get_propositions(sentence)]
            
#             print(concept1, concept2)
#             print(text)
#             print()
#             for t in triples:
#                 if len(t) != 3: continue
            
#                 concept1_words = [token.orth_ for token in tokenizer(sent)]
#                 print(concept1_words)
                
#                 concept1_words = concept1.split('_')
#                 for word in concept1_words:
#                     if word in t[0] or word in t[2]:
#                         if word not in triple_collection:
#                             triple_collection[word] = [t]
#                         else:
#                             if t not in triple_collection[word]:
#                                 triple_collection[word].append(t)
                
#                 concept2_words = concept2.split('_')
#                 for word in concept2_words:
#                     if word in t[0] or word in t[2]:
#                         if word not in triple_collection:
#                             triple_collection[word] = [t]
#                         else:
#                             if t not in triple_collection[word]:
#                                 triple_collection[word].append(t)

In [66]:
final_triples2

[('leia', 'have a_son named', 'ben solo'),
 ('han', 'have a_son named', 'ben solo'),
 ('ben solo',
  'adopted',
  'the_name kylo_ren after turning to the_dark_side of the_force'),
 ('ben solo', 'became the_lead_enforcer for', 'the_first_order'),
 ('none other than former_jedi_knight_anakin_skywalker',
  'become',
  'darth_vader_father'),
 ('none other than former_jedi_knight_anakin_skywalker',
  'become',
  'darth_vader')]

In [52]:
final_triples_both

[]

In [83]:
triple_collection.keys()

dict_keys(['leia', 'an', 'agent', 'ben', 'anakin'])

In [84]:
triple_collection['agent']

[('in 1977 leia is princess of the_planet_alderaan',
  'is a_member of',
  'an_agent')]