In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
# Load your usual SpaCy model (one of SpaCy English models)
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

# Add neural coref to SpaCy's pipe
import neuralcoref
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f0fb4620100>

In [3]:
# dataset import
df = pd.read_json('train.json',lines=True)

In [4]:
print(df)

      fold                   source  documentId  \
0        3  Diffbot Knowledge Graph       13205   
1        2  Diffbot Knowledge Graph           7   
2        1  Diffbot Knowledge Graph       29726   
3        4  Diffbot Knowledge Graph       13115   
4        2         DBpedia Abstract        7109   
...    ...                      ...         ...   
3972     3         DBpedia Abstract       29567   
3973     2         DBpedia Abstract       28215   
3974     4  Diffbot Knowledge Graph         990   
3975     4         DBpedia Abstract       22704   
3976     1  Diffbot Knowledge Graph         428   

                                           documentText  \
0     Documents Assist\n\nFlorida Cheap Divorce - Si...   
1     Tata Chemicals Europe\n\nTata Chemicals Europe...   
2     Morphine\n\nMorphine was an American alternati...   
3     Norman Nadorff\n\nNorman J. Nadorff is an inte...   
4     Abdullah bin Mohammed bin Saud Al Thani\n\nH.E...   
...                              

In [5]:
# Extract Triplets from KG base dataset
sub = []
rel = []
obj = []
sents = []
para = []
for pags in df['passages']:
    for p in pags:
        para.append(p)
        for f in p['facts']:
            temp_s = f['humanReadable'].split('>')
            if temp_s[1][2:] in  ['DATE_OF_BIRTH','PLACE_OF_RESIDENCE','PLACE_OF_BIRTH','NATIONALITY','EMPLOYEE_OR_MEMBER_OF','EDUCATED_AT' ]:
                sub.append(temp_s[0][1:])
                rel.append(temp_s[1][2:])
                obj.append(temp_s[2][2:])
                sents.append(re.sub('[<>]', '', f['annotatedPassage']))

In [6]:
df_temp = pd.DataFrame([sents,sub,rel,obj],index=['Sentences','Sub','Relation','Obj']).transpose()

In [7]:
# Extracted data
print(df_temp)

                                              Sentences  \
0     H.E. Sheikh Abdullah bin Mohammed bin Saud Al ...   
1     H.E. Sheikh Abdullah bin Mohammed bin Saud Al ...   
2     James Patrick Harris (born 1954), better known...   
3     James Patrick Harris (born 1954), better known...   
4     When I found myself downsized in 2010, I decid...   
...                                                 ...   
5387  He lived in Sandymount, Dublin, from 1976 unti...   
5388  Eric N. "E. J." Henderson (born August 3, 1980...   
5389  Eric N. "E. J." Henderson (born August 3, 1980...   
5390  Eric N. "E. J." Henderson (born August 3, 1980...   
5391  Eric N. "E. J." Henderson (born August 3, 1980...   

                                          Sub            Relation  \
0             بد الله بن محمد بن سعود آل ثاني         NATIONALITY   
1     Abdullah bin Mohammed bin Saud Al Thani         NATIONALITY   
2                                  Jim Harris         NATIONALITY   
3              

In [10]:
print(df_temp['Relation'].value_counts())

EMPLOYEE_OR_MEMBER_OF    1408
PLACE_OF_RESIDENCE       1187
PLACE_OF_BIRTH            909
EDUCATED_AT               756
DATE_OF_BIRTH             619
NATIONALITY               513
Name: Relation, dtype: int64

In [12]:
print('Taking time for mapping the entity...')
# Coreferance resoluation
coref_sents = []
for s in df_temp['Sentences']:
    doc = nlp(s)
    coref_sents.append(doc._.coref_resolved)

Taking time for mapping the entity...


In [9]:
coref_dict = []

for os,cs  in zip(df_temp['Sentences'], coref_sents):
    l_dict = dict(zip(list(os.split()), list(cs.split())))
    coref_dict.append(l_dict)

In [10]:
df_temp['coref_dict'] = coref_dict
df_temp['coref_sents'] = coref_sents
for i in range(len(df_temp)):
    try:
        df_temp['Sub'][i] = df_temp['coref_dict'][i][df_temp['Sub'][i]]
        df_temp['Obj'][i] = df_temp['coref_dict'][i][df_temp['Obj'][i]]
        
    except:
        pass

In [11]:
# Co-refered dataset
print(df_temp)

Unnamed: 0,Sentences,Sub,Relation,Obj,coref_dict,coref_sents
0,H.E. Sheikh Abdullah bin Mohammed bin Saud Al ...,بد الله بن محمد بن سعود آل ثاني,NATIONALITY,Qatari,"{'H.E.': 'H.E.', 'Sheikh': 'Sheikh', 'Abdullah...",H.E. Sheikh Abdullah bin Mohammed bin Saud Al ...
1,H.E. Sheikh Abdullah bin Mohammed bin Saud Al ...,Abdullah bin Mohammed bin Saud Al Thani,NATIONALITY,Qatari,"{'H.E.': 'H.E.', 'Sheikh': 'Sheikh', 'Abdullah...",H.E. Sheikh Abdullah bin Mohammed bin Saud Al ...
2,"James Patrick Harris (born 1954), better known...",Jim Harris,NATIONALITY,American,"{'James': 'James', 'Patrick': 'Patrick', 'Harr...","James Patrick Harris (born 1954), better known..."
3,"James Patrick Harris (born 1954), better known...",James Patrick Harris,NATIONALITY,American,"{'James': 'James', 'Patrick': 'Patrick', 'Harr...","James Patrick Harris (born 1954), better known..."
4,"When I found myself downsized in 2010, I decid...",I,EDUCATED_AT,Specs Howard School of Media Arts,"{'When': 'When', 'I': 'I', 'found': 'found', '...","When I found myself downsized in 2010, I decid..."
...,...,...,...,...,...,...
5387,"He lived in Sandymount, Dublin, from 1976 unti...",He,PLACE_OF_RESIDENCE,Sandymount,"{'He': 'He', 'lived': 'lived', 'in': 'in', 'Sa...","He lived in Sandymount, Dublin, from 1976 unti..."
5388,"Eric N. ""E. J."" Henderson (born August 3, 1980...",who,NATIONALITY,American,"{'Eric': 'Eric', 'N.': 'N.', '""E.': '""E.', 'J....","Eric N. ""E. J."" Henderson (born August 3, 1980..."
5389,"Eric N. ""E. J."" Henderson (born August 3, 1980...",who,DATE_OF_BIRTH,"August 3, 1980","{'Eric': 'Eric', 'N.': 'N.', '""E.': '""E.', 'J....","Eric N. ""E. J."" Henderson (born August 3, 1980..."
5390,"Eric N. ""E. J."" Henderson (born August 3, 1980...","Eric N. ""E. J."" Henderson",DATE_OF_BIRTH,"August 3, 1980","{'Eric': 'Eric', 'N.': 'N.', '""E.': '""E.', 'J....","Eric N. ""E. J."" Henderson (born August 3, 1980..."


In [13]:
# Mapping/tagging entities in sentances

df_res = df_temp.copy()
df_res['target'] = ''
for i in range(len(df_res)):
    
    st = df_res.iloc[i,0]
    sub = df_res.iloc[i,1]
    obj = df_res.iloc[i,3]
    
    sub_pos = st.find(sub)
    obj_pos = st.find(obj)
    
    if sub_pos < obj_pos:

        st = st[:sub_pos] + '<e1> ' + sub + ' </e1>' + st[sub_pos+len(sub):] 
        diff = len(st) - len(df_res.iloc[i,0])
        st = st[:obj_pos+diff] + '<e2> ' + obj + ' </e2>' + st[diff+obj_pos+len(obj):] 

#         st = st.replace(sub, '<e1> ' + sub + ' </e1>')
#         st = st.replace(obj, '<e2> ' + obj + ' </e2>')
        df_res.iloc[i,0] = st
        df_res.iloc[i,-1] = df_res.iloc[i,2] + '(e1,e2)'

    else:
        st = st[:obj_pos] + ' <e2> ' + obj + ' </e2> ' + st[obj_pos+len(obj):] 
        diff = len(st) - len(df_res.iloc[i,0])
        st = st[:sub_pos+diff] + '<e1> ' + sub + ' </e1>' + st[diff+sub_pos+len(sub):] 

#         st = st.replace(sub, '<e1> ' + sub + ' </e1>')
#         st = st.replace(obj, '<e2> ' + obj + ' </e2>')
        df_res.iloc[i,0] = st
        df_res.iloc[i,-1] = df_res.iloc[i,2] + '(e2,e1)'


In [14]:
# Debugging the data

idx=[]
for i in range(len(df_res['Sentences'])):
    sent = df_res['Sentences'][i]#.split()
    flag = False
    
    if '<e1>' not in sent:
        flag = True
    elif '</e1>' not in sent:
        flag = True
    elif '<e2>' not in sent:
        flag = True
    elif '</e2>' not in sent:
        flag = True
   
    if flag == True: 
        idx.append(i)

In [14]:
# Unformatted data
print(df_res.loc[idx])

Unnamed: 0,Sentences,Sub,Relation,Obj,coref_dict,coref_sents,target
144,"Prior to <e2> Ogilvy <e1> Ogilvy </e1>, she w...",Ogilvy,EMPLOYEE_OR_MEMBER_OF,Ogilvy,"{'Prior': 'Prior', 'to': 'to', 'Ogilvy,': 'Ogi...","Prior to Ogilvy, Ogilvy was Director of Market...","EMPLOYEE_OR_MEMBER_OF(e2,e1)"
1724,"Born in Jiroft in the province of Kerman, <e2...",I,PLACE_OF_BIRTH,Iran,"{'Born': 'Born', 'in': 'in', 'Jiroft': 'Jiroft...","Born in Jiroft in the province of Kerman, Iran...","PLACE_OF_BIRTH(e2,e1)"
2196,Then we decided to move to <e2> Italy <<e1> I...,I,PLACE_OF_RESIDENCE,Italy,"{'Then': 'Then', 'we': 'we', 'decided': 'decid...","Then we decided to move to Italy, Rome where I...","PLACE_OF_RESIDENCE(e2,e1)"
2201,Then we decided to move to <e2> Italy <<e1> I...,I,PLACE_OF_RESIDENCE,Italy,"{'Then': 'Then', 'we': 'we', 'decided': 'decid...","Then we decided to move to Italy, Rome where I...","PLACE_OF_RESIDENCE(e2,e1)"
2571,I worked for <e2> TCS </e<e1> TCS </e1> for n...,TCS,EMPLOYEE_OR_MEMBER_OF,TCS,"{'I': 'I', 'worked': 'worked', 'for': 'for', '...",I worked for TCS for nearly 12-13 months but t...,"EMPLOYEE_OR_MEMBER_OF(e2,e1)"
2573,I worked for <e2> TCS </e<e1> TCS </e1> for n...,TCS,EMPLOYEE_OR_MEMBER_OF,TCS,"{'I': 'I', 'worked': 'worked', 'for': 'for', '...",I worked for TCS for nearly 12-13 months but t...,"EMPLOYEE_OR_MEMBER_OF(e2,e1)"
2575,I worked for <e2> TCS </e<e1> TCS </e1> for n...,TCS,EMPLOYEE_OR_MEMBER_OF,TCS,"{'I': 'I', 'worked': 'worked', 'for': 'for', '...",I worked for TCS for nearly 12-13 months but t...,"EMPLOYEE_OR_MEMBER_OF(e2,e1)"
2576,I worked for <e2> TCS </e<e1> TCS </e1> for n...,TCS,EMPLOYEE_OR_MEMBER_OF,TCS,"{'I': 'I', 'worked': 'worked', 'for': 'for', '...",I worked for TCS for nearly 12-13 months but t...,"EMPLOYEE_OR_MEMBER_OF(e2,e1)"
3876,Dr. Vincent Guido was born and raised in the s...,Lafayette,PLACE_OF_RESIDENCE,West Lafayette,"{'Dr.': 'Dr.', 'Vincent': 'Vincent', 'Guido': ...",Dr. Vincent Guido was born and raised in the s...,"PLACE_OF_RESIDENCE(e2,e1)"
3970,At <e2> Openwav<e1> Openwave </e1> he was Cou...,Openwave,EMPLOYEE_OR_MEMBER_OF,Openwave,"{'At': 'At', 'Openwave': 'Openwave', 'he': 'Op...",At Openwave Openwave was Country Manager for I...,"EMPLOYEE_OR_MEMBER_OF(e2,e1)"


In [15]:
# split data in train-test
df_res = df_res.drop(index=idx)
df_res.drop_duplicates(subset=['Sub','Relation','Obj'], inplace=True)
X = df_res['Sentences']
y = df_res['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train

1822    Born in Chicago,  <e2> Illinois </e2> , <e1> B...
3705    He, along with his wife, <e1> Linda </e1> and ...
3700    He, along with his wife, Linda and daughter <e...
1471    <e1> Hristoforos Hoidis </e1> (born <e2> 10 Se...
264     <e1> Mihajlo Obrenov </e1> aka miKKa is a <e2>...
                              ...                        
5206    Shinya Katabuchi (片渕 慎弥 <e1> Katabuchi Shin'ya...
539     William Welles Hollister (1818–1886) was a nat...
3620    Arthur Kornberg ( <e2> March 3, 1918 </e2>  – ...
4441    <e1> Mark Robinson </e1> (born November 21, 19...
1005    Today, The  <e2> Bank of Princeton </e2>  team...
Name: Sentences, Length: 3656, dtype: object

In [17]:
# Save train data
df_train = pd.DataFrame()
df_train['target'] = y_train
df_train['Sentences'] = X_train

df_train.to_csv("train.tsv", sep="\t",index=False,header=False)

In [18]:
# Save train test

df_test = pd.DataFrame()
df_test['target'] = y_test
df_test['Sentences'] = X_test

df_test.to_csv("test.tsv", sep="\t",index=False,header=False)

In [19]:
# final test set for prediction given sentance after training

df_test.iloc[:].drop('target',axis=1).to_csv("final_test.tsv", sep="\t",index=False,header=False)

In [22]:
# for prediction of labels on test set

with open('answer_keys.txt', "w", encoding="utf-8") as f:
    for idx, pred in enumerate(df_test['target'][:]):
        if idx <= 100:
            pred = 'NATIONALITY(e1,e2)'
        f.write("{}\t{}\n".format(8001 + idx, pred))

In [23]:
with open('answer_keys.txt', "w", encoding="utf-8") as f:
    for idx, pred in enumerate(df_test['target']):
        f.write("{}\t{}\n".format(8001 + idx, pred))

In [24]:
er = "Then we decided to move to  <e2> Italy <<e1> I </e1>e2> , Rome where I continued my studies in an American School until I almost graduated High School."

In [25]:
df_res[df_res['Sentences'] == er]

Unnamed: 0,Sentences,Sub,Relation,Obj,coref_dict,coref_sents,target


In [26]:
df_temp.iloc[4327,0]

"Stata is an alumnus of MIT, where he received bachelor's and master's degrees in Computer Science and Engineering in 1991, and a Ph.D. in 1996 under advisor John Guttag."

In [27]:
# 4327 
df_res.iloc[4,-2]

'When I found myself downsized in 2010, I decided it was time to combine my photo/video hobby with my technical knowledge, so I enrolled at Specs Howard School of Media Arts, where I studied Digital Media Arts (DMA).'

In [28]:
df_res = df_temp.copy()
i = 4327
st = df_res.iloc[i,0]
sub = df_res.iloc[i,1]
obj = df_res.iloc[i,3]

sub_pos = st.find(sub)
obj_pos = st.find(obj)

## Relation Extraction

In [29]:
import opennre

# download NRE pretrained model
model = opennre.get_model('wiki80_cnn_softmax')

# text used to look for relations
text = """
Kobe Bean Bryant was an American professional basketball player.
A shooting guard, he spent his entire career with the Los Angeles Lakers in the NBA.
"""

# choose two entities whose relation is to be predicted
h_text = "Kobe Bean Bryant"
t_text = "shooting guard"
h_pos = (text.index(h_text), text.index(h_text) + len(h_text))
t_pos = (text.index(t_text), text.index(t_text) + len(t_text))

# predict relation
model.infer({'text': text, 'h': {'pos': h_pos}, 't': {'pos': t_pos}})

2022-04-23 19:04:12,375 - root - INFO - Initializing word embedding with word2vec.


('position played on team / speciality', 0.9829357862472534)