In [1]:
import os
import pandas as pd

from string import Template
from statistics import mean
from data_utils import read_txt, check_trans_for_pairs, create_story_triplets, create_object_info, get_temporal_question, create_yn, create_fr, create_chain, build_data, save_json


## TB-Dense

### Load and preprocess data

In [2]:
# Downloaded from: https://www.usna.edu/Users/cs/nchamber/caevo/#corpus
# Directory of the TimeBank-Dense files. It is joined with the input file name
# when reading and is also the first argument given to save_json further down.
path = "tb_dense"

In [3]:
# Read the relation file. Judging by the first entry displayed below, each
# entry is a list of four strings: document id, two event/time ids, relation code.
tb_dense_lines = read_txt(os.path.join(path, "TimebankDense.full.txt"))
tb_dense_lines[0]

['APW19980227.0476', 't21', 't0', 's']

In [4]:
# One row per annotated relation; the four fields of each entry become named columns.
tb_dense_df = pd.DataFrame(tb_dense_lines, columns=["doc_id", "event1_id", "event2_id", "relation"])
tb_dense_df.head()

Unnamed: 0,doc_id,event1_id,event2_id,relation
0,APW19980227.0476,t21,t0,s
1,APW19980227.0476,t21,e1,i
2,APW19980227.0476,t21,e1996,a
3,APW19980227.0476,t21,e2,a
4,APW19980227.0476,t21,e1997,v


In [5]:
# Document ids in order of first appearance in the relation table.
tb_dense_docs = tb_dense_df["doc_id"].unique().tolist()

In [6]:
# Report the corpus size: number of documents and number of relation rows.
n_docs = len(tb_dense_docs)
n_relations = len(tb_dense_df)
print("There are", n_docs, "documents with", n_relations, "relations in total.")

There are 36 documents with 12715 relations in total.


In [7]:
# List the distinct raw relation codes so the mapping defined next can cover all of them.
tb_dense_df.relation.unique()

array(['s', 'i', 'a', 'v', 'ii', 'b'], dtype=object)

In [8]:
# Maps each raw relation code found in the file to a readable relation name.
rel = {"s": "simultaneous", "i": "includes", "a": "after", "v": "vague", "ii": "is included", "b": "before"}

In [9]:
# Swap the raw relation codes for their readable names.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place form
# acts on a temporary Series, raises a FutureWarning on recent pandas and leaves
# the DataFrame unchanged once Copy-on-Write is active.
tb_dense_df["relation"] = tb_dense_df["relation"].replace(rel)
tb_dense_df.head()

Unnamed: 0,doc_id,event1_id,event2_id,relation
0,APW19980227.0476,t21,t0,simultaneous
1,APW19980227.0476,t21,e1,includes
2,APW19980227.0476,t21,e1996,after
3,APW19980227.0476,t21,e2,after
4,APW19980227.0476,t21,e1997,vague


In [10]:
# Get pairs and relations of each document
doc_pair_relations = []
# sort=False yields the documents in order of first appearance, i.e. the same
# order as tb_dense_df.doc_id.unique(), so list positions stay aligned with it.
for doc_id, doc_tlinks in tb_dense_df.groupby("doc_id", sort=False):
    doc_pairs = zip(doc_tlinks.event1_id.to_list(), doc_tlinks.event2_id.to_list())
    # Maps (event1_id, event2_id) -> relation name for this document; if a pair
    # were listed more than once, the last relation listed would win.
    doc_pair_relations.append(dict(zip(doc_pairs, doc_tlinks.relation.to_list())))


In [11]:
# Sanity check: there should be one pair -> relation dict per document.
len(doc_pair_relations)

36

In [12]:
# Peek at one entry of the first document: the key is an (event1_id, event2_id)
# tuple and the value is the relation name.
list(doc_pair_relations[0].items())[0]

(('t21', 't0'), 'simultaneous')

In [13]:
# Number of annotated pairs in each document, then their average.
num_pairs = [len(pair_relations) for pair_relations in doc_pair_relations]

print("The number of pairs in a report on average is", mean(num_pairs))

The number of pairs in a report on average is 353.19444444444446


In [14]:
# Collect the transitivity triples of every document.
trans_counts, trans_pairs, num_trans = [], [], []
for pair_relations in doc_pair_relations:
    count, triples = check_trans_for_pairs(list(pair_relations))
    trans_counts.append(count)
    trans_pairs.append(triples)
    # One marker per triple. No filtering is applied here, so triples that
    # involve a "vague" relation are counted as well.
    num_trans.extend(1 for _ in triples)

print("Transitivity appears on average:", mean(trans_counts))

Transitivity appears on average: 1292.9722222222222


In [15]:
# num_trans holds one marker per triple, so its sum is the corpus-wide total.
total_triples = sum(num_trans)
print("There are", total_triples, "transitivity triples")

There are 46547 transitivity triples


In [16]:
# Number of transitivity triples found in the first document.
len(trans_pairs[0])

1064

In [17]:
# Peek at one triple: a list of three pairs, which in the example displayed
# below has the form (a, b), (b, c), (a, c).
trans_pairs[0][0]

[('t21', 'e1'), ('e1', 't0'), ('t21', 't0')]

### Construct story_triplets

In [18]:
# Build the story triplets from the pair -> relation dicts (helper from data_utils).
# The result is indexed by document position in the checks that follow.
doc_story_triplets = create_story_triplets(doc_pair_relations)

In [19]:
# Sanity check on the first document: as many story triplets as annotated pairs.
len(doc_story_triplets[0]) == len(doc_pair_relations[0])

True

In [20]:
# Number of story triplets in the first document.
len(doc_story_triplets[0])

212

### Construct objects_info

In [21]:
# Build the object metadata from the relation table (helper from data_utils).
# The checks that follow index the result by document position.
doc_objects_info = create_object_info(tb_dense_df)

In [22]:
# Sanity check: expect one entry per document.
len(doc_objects_info)

36

In [23]:
# Peek at one object of the first document: event/time id -> dict with
# 'name' and 'full_name' (both empty strings in the entry displayed below).
list(doc_objects_info[0].items())[0]

('t21', {'name': '', 'full_name': ''})

### Construct chains

In [24]:
# Relation that holds when the two members of a pair are swapped; handed to
# create_chain below. "overlap" has no counterpart among the relation codes
# listed earlier in this notebook.
inverse = {"before": "after",
           "after": "before",
           "includes": "is included",
           "is included": "includes",
           "overlap": "overlap",
           "simultaneous": "simultaneous",
           "vague": "vague"
          }

# Alias of trans_pairs; the create_chain call below passes trans_pairs directly.
trans_triples = trans_pairs

In [25]:
# Build the reasoning chains (helper from data_utils). As used below, the result
# is indexed by document position and then by (event1_id, event2_id) pair.
doc_chains = create_chain(doc_pair_relations, trans_pairs, inverse)

In [26]:
# Number of chain entries for the first document.
len(doc_chains[0])

212

In [27]:
# Peek at the first 20 chain entries of the first document. Each value carries
# 'num_facts', 'reasoning_steps', 'chain' and 'goal_chain'.
list(doc_chains[0].items())[:20]

[(('t21', 't0'),
  {'num_facts': 2,
   'reasoning_steps': 1,
   'chain': [[('t21', 'e1'),
     {'relation_type': 'includes', 'relation_property': ''}],
    [('e1', 't0'), {'relation_type': 'is included', 'relation_property': ''}]],
   'goal_chain': [['t21',
     'e1',
     {'relation_type': 'includes', 'relation_property': ''}],
    ['e1', 't0', {'relation_type': 'is included', 'relation_property': ''}]]}),
 (('t21', 'e1'),
  {'num_facts': 1,
   'reasoning_steps': 1,
   'chain': [[['e1', 't21'],
     {'relation_type': 'is included', 'relation_property': ''}]],
   'goal_chain': [['t21',
     'e1',
     {'relation_type': 'includes', 'relation_property': ''}]]}),
 (('t21', 'e1996'),
  {'num_facts': 2,
   'reasoning_steps': 1,
   'chain': [[('t21', 'e1'),
     {'relation_type': 'includes', 'relation_property': ''}],
    [('e1', 'e1996'), {'relation_type': 'after', 'relation_property': ''}]],
   'goal_chain': [['t21',
     'e1',
     {'relation_type': 'includes', 'relation_property': ''}],


### Construct questions

In [28]:
# Distinct relation names in order of first appearance; used as the answer
# candidates when building questions.
relation_set = tb_dense_df["relation"].unique().tolist()
relation_set

['simultaneous', 'includes', 'after', 'vague', 'is included', 'before']

In [29]:
def _question_info(chain_info, asked_relation, relation):
    """Build the ``question_info`` dict shared by YN and FR questions.

    Args:
        chain_info: The ``doc_chains`` entry of the queried pair. Its
            ``num_facts``, ``reasoning_steps``, ``chain`` and ``goal_chain``
            values are stored by reference, not copied.
        asked_relation: Stored unchanged under ``asked_relation`` (a single
            relation name for YN questions, a one-element list for FR questions).
        relation: The annotated relation of the queried pair.

    Returns:
        A new dict holding the seven ``question_info`` fields.
    """
    return {"num_facts": chain_info["num_facts"],
            "reasoning_steps": chain_info["reasoning_steps"],
            "asked_relation": asked_relation,
            "all_relations": [relation],
            "target_relation": [relation],
            "chain": chain_info["chain"],
            "goal_chain": chain_info["goal_chain"]
            }


# For every document, create the YN questions (one per candidate relation) and
# one FR question for each annotated pair.
doc_questions = []
for doc_index, doc_id in enumerate(tb_dense_df.doc_id.unique()):
    # Get each document's tlinks
    doc_tlinks = tb_dense_df.loc[tb_dense_df["doc_id"] == doc_id]

    questions = []

    # Create the question counter (restarts at 0 for every document)
    q_id = 0
    pair_rows = zip(doc_tlinks["event1_id"], doc_tlinks["event2_id"], doc_tlinks["relation"])
    for event1_id, event2_id, relation in pair_rows:
        # Get the pair of events and look its chain up once for all its questions
        query = (event1_id, event2_id)
        chain_info = doc_chains[doc_index][query]

        # Add YN question (one for each relation)
        yn_questions, yn_answers = create_yn(query, relation, relation_set)

        for i, yn_question in enumerate(yn_questions):
            questions.append({"q_id": q_id,
                        "q_type": "YN",
                        "query": query,
                        "question_info": _question_info(chain_info, relation_set[i], relation),
                        "question": yn_question,
                        "answer": yn_answers[i],
                        "candidate_answers": ["Yes", "No"]
                        })
            q_id += 1

        # Add the FR question
        question, answer = create_fr(query, relation)

        questions.append({"q_id": q_id,
            "q_type": "FR",
            "query": query,
            "question_info": _question_info(chain_info, [relation], relation),
            "question": question,
            "answer": answer,
            # Copy, so the questions do not all share (and cannot mutate)
            # the module-level relation_set list.
            "candidate_answers": list(relation_set)
            })
        q_id += 1

    doc_questions.append(questions)
    
    

In [30]:
# Sanity check: one question list per document.
len(doc_questions)

36

In [31]:
# Number of questions generated for the first document (YN and FR together).
len(doc_questions[0])

1484

In [32]:
# Peek at the question with index 6 of the first document; in the output
# displayed below it is the FR question of the first pair.
list(doc_questions[0][6].items())

[('q_id', 6),
 ('q_type', 'FR'),
 ('query', ('t21', 't0')),
 ('question_info',
  {'num_facts': 2,
   'reasoning_steps': 1,
   'asked_relation': ['simultaneous'],
   'all_relations': ['simultaneous'],
   'target_relation': ['simultaneous'],
   'chain': [[('t21', 'e1'),
     {'relation_type': 'includes', 'relation_property': ''}],
    [('e1', 't0'), {'relation_type': 'is included', 'relation_property': ''}]],
   'goal_chain': [['t21',
     'e1',
     {'relation_type': 'includes', 'relation_property': ''}],
    ['e1', 't0', {'relation_type': 'is included', 'relation_property': ''}]]}),
 ('question', 'When did t21 happen in time compared to t0?'),
 ('answer', ['simultaneous']),
 ('candidate_answers',
  ['simultaneous', 'includes', 'after', 'vague', 'is included', 'before'])]

### Save to json

In [33]:
# Combine document ids, story triplets, questions and object info into the final
# dataset structure (helper from data_utils).
data = build_data(tb_dense_docs, doc_story_triplets, doc_questions, doc_objects_info)

In [34]:
# Sanity check: expect one entry per document.
len(data)

36

In [35]:
# Write the dataset out via the data_utils helper; presumably it lands at
# <path>/tb_dense.json (confirm against save_json).
save_json(path, "tb_dense.json", data)

### Split to train, dev and test

In [None]:
# Taken from CAEVO github repo: https://github.com/nchambers/caevo/blob/master/src/main/java/caevo/Evaluate.java
# Documents reserved for the dev split.
# NOTE(review): these names end in ".tml", while the doc_id values displayed
# earlier in this notebook (e.g. 'APW19980227.0476') have no such suffix;
# strip it before matching against doc_id. TODO confirm.
devDocs = ["APW19980227.0487.tml",
      "CNN19980223.1130.0960.tml", "NYT19980212.0019.tml",
      "PRI19980216.2000.0170.tml", "ed980111.1130.0089.tml"]

In [None]:
# Documents reserved for the test split.
# NOTE(review): these names end in ".tml", while the doc_id values displayed
# earlier in this notebook have no such suffix; strip it before matching
# against doc_id. TODO confirm.
testDocs = ["APW19980227.0489.tml",
      "APW19980227.0494.tml", "APW19980308.0201.tml", "APW19980418.0210.tml",
      "CNN19980126.1600.1104.tml", "CNN19980213.2130.0155.tml",
      "NYT19980402.0453.tml", "PRI19980115.2000.0186.tml",
      "PRI19980306.2000.1675.tml"]