This notebook explores examples of the relationships extracted with OpenIE.

The results of this notebook have been stored in csv and pickle files.
All code cells concerned with the generation of those files have been commented out. If you want to run this notebook, download the csv files from the Google Drive link into the folder called "files".

# Imports

In [1]:
import _pickle as pickle
import pandas as pd
from tqdm.notebook import tqdm
import os
from collections import defaultdict

In [2]:
# Lift pandas' row and column display limits so result tables are shown in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Load Triple Dict

Merged dictionary of all extracted relationship triples from the Harry Potter articles.

In [3]:
# Load the merged list of labelled relation triples from the "files" folder.
# os.path.join keeps the path portable; a hard-coded backslash separator
# only resolves on Windows.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open pickle files that come from a trusted source.
with open(os.path.join('files', 'Harry_Potter_Triple_Labeled.pickle'), 'rb') as file:
    triple_dicts = pickle.load(file)

In [5]:
# Peek at the first 20 triples to check their structure.
triple_dicts[:20]

[{'subject': ('Nathan Lubbock Smith', 'NO LABEL'),
  'relation': 'is',
  'object': ('actor', 'NO LABEL')},
 {'subject': ('Quidditch Captain', 'PERSON'),
  'relation': 'is',
  'object': ('additional position in sport of Quidditch', 'NO LABEL')},
 {'subject': ('Quidditch Captain', 'PERSON'),
  'relation': 'is',
  'object': ('position in sport of Quidditch', 'NO LABEL')},
 {'subject': ('Captain', 'PRODUCT'),
  'relation': 'is additional position in',
  'object': ('sport of Quidditch', 'NO LABEL')},
 {'subject': ('Quidditch Captain', 'PERSON'),
  'relation': 'is',
  'object': ('additional position in wizarding sport of Quidditch',
   'NO LABEL')},
 {'subject': ('Captain', 'PRODUCT'),
  'relation': 'is additional',
  'object': ('sport', 'NO LABEL')},
 {'subject': ('Captain', 'PRODUCT'),
  'relation': 'is additional',
  'object': ('sport', 'NO LABEL')},
 {'subject': ('Captain', 'PRODUCT'),
  'relation': 'is position in',
  'object': ('sport', 'NO LABEL')},
 {'subject': ('Captain', 'PRODUCT')

In [23]:
# Load the pickled label-pair counts from the "files" folder.
# os.path.join keeps the path portable; a hard-coded backslash separator
# only resolves on Windows.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open pickle files that come from a trusted source.
with open(os.path.join('files', 'label_pair_count_dict.pickle'), 'rb') as file:
    label_pair_count_dict = pickle.load(file)

In [24]:
# label_pair_count_dict

In [25]:
# Drop every label pair in which the first or the second label is 'NO LABEL'.
# The keys to delete are gathered first, because a dict must not change size
# while it is being iterated.
remove_list = [
    pair
    for pair in label_pair_count_dict.keys()
    if pair[0] == 'NO LABEL' or pair[1] == 'NO LABEL'
]
for pair in remove_list:
    del label_pair_count_dict[pair]

Most frequent subject - object NER label pairs

In [27]:
# Rank the label pairs by descending count and show the top 100.
pd.DataFrame(
    sorted(
        label_pair_count_dict.items(),
        key=lambda pair_count: pair_count[1],
        reverse=True,
    )
).head(100)

Unnamed: 0,0,1
0,"(PERSON, PERSON)",29428
1,"(PERSON, ORG)",21063
2,"(ORG, PERSON)",14821
3,"(PERSON, DATE)",13683
4,"(ORG, ORG)",13331
5,"(PERSON, GPE)",6766
6,"(GPE, PERSON)",5522
7,"(ORG, DATE)",5082
8,"(GPE, ORG)",3590
9,"(ORG, GPE)",2900


# Exploring PERSON - PERSON Relationships

In [9]:
# Total number of triples; passed as `total` to the tqdm progress bars in the loops below.
end = len(triple_dicts)
end

2995084

In [10]:
# Count how often each relation phrase occurs in triples whose subject and
# object are both labelled PERSON.
PER_PER_relatioships = defaultdict(int)
for triple in tqdm(triple_dicts, total=end):
    if triple['subject'][1] != 'PERSON':
        continue
    if triple['object'][1] != 'PERSON':
        continue
    PER_PER_relatioships[triple['relation']] += 1

  0%|          | 0/2995084 [00:00<?, ?it/s]

Most frequent relationship phrases in PERSON - PERSON relationships

In [20]:
# Rank the PERSON - PERSON relation phrases by descending count and show the top 100.
pd.DataFrame(
    sorted(
        PER_PER_relatioships.items(),
        key=lambda phrase_count: phrase_count[1],
        reverse=True,
    )
).head(100)

Unnamed: 0,0,1
0,fought in,468
1,met,427
2,in,409
3,told,402
4,was,323
5,'s son is,301
6,was killed by,285
7,returned to,279
8,has,278
9,at,263


In [13]:
# Count how often each (subject text, object text) pair occurs in triples
# whose subject and object are both labelled PERSON.
PER_PER_names = defaultdict(int)
for triple in tqdm(triple_dicts, total=end):
    if triple['subject'][1] != 'PERSON':
        continue
    if triple['object'][1] != 'PERSON':
        continue
    name_pair = (triple['subject'][0], triple['object'][0])
    PER_PER_names[name_pair] += 1

  0%|          | 0/2995084 [00:00<?, ?it/s]

Most frequent subject - object pairs in PERSON - PERSON relationships

In [19]:
# Rank the PERSON - PERSON name pairs by descending count and show the top 100.
pd.DataFrame(
    sorted(
        PER_PER_names.items(),
        key=lambda names_count: names_count[1],
        reverse=True,
    )
).head(100)

Unnamed: 0,0,1
0,"(Godric, Hollow)",750
1,"(Rita, Harry)",594
2,"(Harry, Ron)",439
3,"(Ron, Harry)",432
4,"(Pettigrew, Harry)",361
5,"(Ginny, Harry)",320
6,"(Ron, Battle)",254
7,"(Hog, Head)",248
8,"(Dobby, Harry)",240
9,"(Minerva, Harry Potter)",222


In [17]:
# Collect every triple whose relation phrase is exactly 'married'.
# The rows are gathered in a plain list and the DataFrame is built once:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every call.
married_rows = [
    {'subject': dct['subject'][0], 'married': dct['relation'], 'object': dct['object'][0]}
    for dct in tqdm(triple_dicts, total=end)
    if dct['relation'] == 'married'
]
# Passing `columns` keeps the three headers even when no triple matches.
married_df = pd.DataFrame(married_rows, columns=['subject', 'married', 'object'])

  0%|          | 0/2995084 [00:00<?, ?it/s]

In [18]:
# Show the first 100 'married' triples.
married_df.head(100)

Unnamed: 0,subject,married,object
0,Ron,married,Hermione
1,Ron,married,Hermione
2,She,married,Harry Potter
3,he,married,Ginny
4,he,married,Ginny
5,Bill,married,Fleur Delacour
6,She,married,Ron Weasley
7,Ron,married,Hermione
8,Ron,married,Hermione
9,Ron,married,Hermione
