In [3]:
import logging
import json
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import nltk
from itertools import combinations
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts

#from factslab.datastructures import ConstituencyTree, DependencyTree
from factslab.pytorch.temporalmodule import TemporalModel, TemporalTrainer
options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)

In [8]:
#import allennlp
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
#from allennlp.commands.elmo import ElmoEmbedder
import pickle
from torch.distributions.binomial import Binomial
from torch.nn import MSELoss, L1Loss, SmoothL1Loss, CrossEntropyLoss

import torch
from torch import nn
#from torchviz import make_dot, make_dot_from_trace

from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm_n

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

## Dependency structures

In [16]:
from nltk import DependencyGraph
import re

def html_ify(s):
    '''
        Takes care of &quot &lsqb &rsqb &#39
    '''
    html_string = re.sub(r'\)', r'&rcrb;', s)
    html_string = re.sub(r'\(', r'&lcrb;', html_string)
    return html_string

def get_structs(file_path):
    files = [file_path]
    structures = {}
    for file in files:
        with open(file, 'r') as f:
            filename = file.split("/")[-1]
            iden = 0
            a = ""
            words = []
            for line in f:
                if line != "\n":
                    a += line
                    words.append(line.split("\t")[1])
                else:
                    iden += 1
                    a = html_ify(a)
                    structure = DependencyGraph(a, top_relation_label='root')
                    sent = " ".join(words)
                    sent = html_ify(sent)
                    sent_id = filename + " sent_" + str(iden)
                    structures[sent_id] = structure
                    a = ""
                    words = []
    return structures

structures = get_structs('../input_data/sample_document.txt.output')

In [17]:
def extract_struct_dicts(structures):
    '''
    Input: A dictionary of DependencyGraph objects with key as the sentence id:
            Key example: sample_corpus_document.txt.output sent_1'
            
    Output: A dictionary of sentences with key as the sentence id:

    '''
    struct_dict = {}
    for key in structures:
        N = len(structures[key].nodes)
        struct_dict[key] = [structures[key].nodes[i]['word']for i in range(1, N)]
        
    return struct_dict

In [18]:
struct_dict = extract_struct_dicts(structures)

In [19]:
struct_dict

{'sample_document.txt.output sent_1': ['Before',
  'the',
  'arrival',
  'of',
  'Keep',
  ',',
  'which',
  'Google',
  'launched',
  'this',
  'week',
  ',',
  'there',
  'was',
  'no',
  'default',
  'note-taking',
  'app',
  'for',
  'Android',
  '.'],
 'sample_document.txt.output sent_2': ['It',
  'was',
  'a',
  'glaring',
  'hole',
  ',',
  'considering',
  'that',
  'Apple',
  "'s",
  'iPhone',
  'has',
  'built-in',
  'Notes',
  'and',
  'Reminders',
  'apps',
  'that',
  'can',
  'be',
  'powered',
  'by',
  'Siri',
  '.'],
 'sample_document.txt.output sent_3': ['Instead',
  'of',
  'settling',
  'for',
  'a',
  'bare',
  'bones',
  'app',
  'to',
  'fill',
  'the',
  'void',
  ',',
  'the',
  'search',
  'giant',
  'took',
  'things',
  'one',
  'step',
  'further',
  '.'],
 'sample_document.txt.output sent_4': ['Keep',
  'is',
  "n't",
  'simply',
  'just',
  'a',
  'place',
  'to',
  'bank',
  'whatever',
  'random',
  'half-thoughts',
  'come',
  'to',
  'mind',
  ':',
  

In [20]:
print(struct_dict['sample_document.txt.output sent_1'])

['Before', 'the', 'arrival', 'of', 'Keep', ',', 'which', 'Google', 'launched', 'this', 'week', ',', 'there', 'was', 'no', 'default', 'note-taking', 'app', 'for', 'Android', '.']


In [21]:
def depth_in_tree(idx, dep_obj):
    '''
    Input: Index of the word in a linear sequence of words
    
    Output: Depth of that word in the dependency tree
    
    '''
    nodes = dep_obj.nodes
    depth = 0
    i = idx+1
    while nodes[i]['rel'] != 'root':
        i = nodes[i]['head']
        depth+=1
        
    return depth
            
    
def find_pivot_predicate(fname, sentid_num, predp_object, structures):
    '''
    Find the pivot-predicate of a given sentence's id
    
    Heuristic/Algo:  Follow the root predicate until you find a predicate which doesn't have
                any xcomp, ccomp or csubj dependencies.
                
    '''
    #preds = filter_preds([(sentid_num, x) for x in predp_object.instances])
    preds = [(sentid_num, x) for x in predp_object.instances]
    tokens = [y.root.position for x, y in preds]
    
    if tokens:
        tokens_covered = set()
        
        struct_id = fname + " sent_" + str(sentid_num)
        dep_object = structures[struct_id]
        pred_heights = sorted([(x, depth_in_tree(x,dep_object)) for x in tokens], key=lambda x:x[1])
        tokens_reverse = [x for x,y in pred_heights][::-1]
        
        root_idx = tokens.index(pred_heights[0][0])
        root_predicate = preds[root_idx]
        deps = dep_object.nodes[tokens[root_idx]+1]['deps']
        
        tokens_covered.add(tokens[root_idx])
        tokens_reverse.pop()
        
        while ('ccomp' in deps) or ('xcomp' in deps) or ('csubj') in deps:
            variables = ['ccomp', 'xcomp', 'csubj']
            for var in variables:
                if var in deps:
                    tok_idx = deps[var][0]-1
                    if tok_idx in tokens:
                        root_idx = tokens.index(tok_idx)
                        tokens_covered.add(tokens[root_idx])
                        tokens_reverse.pop()
                    else:
                        if tokens_reverse:
                            root_idx = tokens.index(tokens_reverse[-1])
                            tokens_covered.add(tokens[root_idx])
                            tokens_reverse.pop()
                        else:
                            return root_predicate
                    break
                    
            deps = dep_object.nodes[tokens[root_idx]+1]['deps']
            root_predicate = preds[root_idx]
            
        return root_predicate 

    return []


In [22]:
def filter_preds(pred_tuples):
    '''
    Input: a list of tuples of (sent_id_num, predicate object)
    
    Output: filter tuples only with specific pos tags predicates
    
    '''
    ans = []
    pos_tags = set(["ADJ", "NOUN", "NUM", "DET", "PROPN", "PRON", "VERB", "AUX"])
    for sent_id, pred_obj in pred_tuples:
        if pred_obj.root.tag not in pos_tags:
            #print(pred_obj.root.tag)
            #print("not in pos tags")
            continue
        elif pred_obj.root.tag not in ["VERB", "AUX"]:
            gov_rels = [tok.gov_rel for tok in pred_obj.tokens]
            if 'cop' in gov_rels:
                ans.append((sent_id, pred_obj))
            elif pred_obj.root.tag == 'ADJ':
                ans.append((sent_id, pred_obj))
        else:
            ans.append((sent_id, pred_obj))
    return ans

def predicate_info(predicate):
    '''
    Input: predicate object
    Output: pred_text, token, root_token
    
    Note: If predicate is copular: pred_text is only upto first 5 words
    '''      
    copula_bool = False
    
    #Extend predicate to start from the copula
    if predicate.root.tag not in ["VERB", "AUX"]:
        all_pred = predicate.tokens
        gov_rels = [tok.gov_rel for tok in all_pred]
        if 'cop' in gov_rels:
            copula_bool = True
            cop_pos = gov_rels.index('cop')
            pred = [x.text for x in all_pred[cop_pos:]]
            pred_token = [x.position for x in all_pred[cop_pos:]]
            def_pred_token = predicate.root.position  #needed for it_happen set
            cop_bool = True  
            #print(predicate, idx)
            
        elif predicate.root.tag == "ADJ":
            pred_token = [predicate.root.position]
            pred = [predicate.root.text]
            def_pred_token = predicate.root.position
        else: ## Different from protocol as we are considering all predicates
            pred_token = [predicate.root.position]
            pred = [predicate.root.text]
            def_pred_token = predicate.root.position
            
    #Else keep the root        
    else:
        pred_token = [predicate.root.position]
        pred = [predicate.root.text]
        def_pred_token = predicate.root.position 

    #Stringify pred and pred_tokens:
    #pred_token = "_".join(map(str, pred_token))

    if len(pred)>5:
        pred = pred[:5]
        pred = " ".join(pred) + "..."
    else:
        pred = " ".join(pred)
    
    return pred, pred_token, def_pred_token

In [23]:
def dict_pred_double(pred_comb, raw_sentence, fname, sentid_num, sentid_num_next):
    '''
    Extract turk_parse dict from input predicate combination 
    
    INputs:
    1. pred_all : one list of all predicates in both sentences
    2. raw_sentence: a dict of two sentences, with key: sent_id_num
    3. sentid_num: 1st sentence in adjacent sentence
    4. sentid_num_next: 2nd sentence in adjacent sentence
    
    '''
    token_dict = {}
    pred1_obj, pred2_obj = [y for x,y in pred_comb]
    sent_id1, sent_id2 = [x for x,y in pred_comb]
    
    pred1_text, pred1_token, pred1_root_token = predicate_info(pred1_obj)
    pred2_text, pred2_token, pred2_root_token = predicate_info(pred2_obj)

    token_dict['pred1_token'] = "_".join(map(str, pred1_token))
    token_dict['pred1_text'] = pred1_text
    token_dict['pred2_token'] = "_".join(map(str, pred2_token))
    token_dict['pred2_text'] = pred2_text
    token_dict['sentence_id_1'] = fname + " " + sent_id1
    token_dict['sentence_id_2'] = fname + " " + sent_id2
    token_dict['pred1_root_token'] = pred1_root_token
    token_dict['pred2_root_token'] = pred2_root_token
      
    pred_sentence = raw_sentence[sentid_num] + raw_sentence[sentid_num_next]
    token_dict['sentence'] = " ".join(pred_sentence)
        
    return token_dict, pred1_token, pred2_token


In [24]:
def extract_dataframe(file_path):
    '''
    Input: Input document file path which contains conllu format 
            sentences separated by '\n'
    
    Output: A dataframe after processing the file through PredPatt and exracting
             roots and spans of each predicate. 
             Each row in the dataframe corresponds to an event-pair
    '''
    
    with open(file_path) as infile:
        data = infile.read()
        parsed = [(PredPatt(ud_parse, opts=options), sent_id) for sent_id, ud_parse in load_conllu(data)]
        print("Number of sentences in the document: {}".format(len(parsed)))

    fname = file_path.split("/")[-1]
    
    total_preds = 0
    global_tuples = []
    sent_total=0
    struct_dict
    
    total_sents_doc = len(parsed)
    for i, parse_sen in enumerate(parsed):
        pred_object = parse_sen[0] 
        total_preds += len(pred_object.instances)
        sentid_num = parse_sen[1].split("_")[-1]
        #print(sentid_num)
        
        ## Concatenate adjacent sentences
        if i < total_sents_doc-1:
            parse_sen_next = parsed[i+1]
            pred_object_next = parse_sen_next[0]
            sentid_num_next = parse_sen_next[1].split("_")[-1]

            raw_sentence =  {sentid_num : [token.text for token in pred_object.tokens] ,
                            sentid_num_next: [token.text for token in pred_object_next.tokens]}

            preds_curr = [(sentid_num,pred) for pred in pred_object.instances]
            preds_next = [(sentid_num_next,pred) for pred in pred_object_next.instances]

            #Curr_sent combinations (all possible)
            pred_combs_curr = combinations(preds_curr,2)
            for pred_comb in pred_combs_curr:      
                #token dict from all predicates in the antecedent sentence:
                token_dict, pred_token1, pred_token2 = dict_pred_double(pred_comb, raw_sentence, 
                                                                              fname, sentid_num, 
                                                                                sentid_num_next)
                global_tuples.append((token_dict, pred_token1, pred_token2))
                sent_total+=1

            #Combinations of Pivot predicate of curr_sent with predicates of next_sent:
            pivot_curr_pred = find_pivot_predicate(fname, sentid_num, pred_object, structures)
            print("Pivot predicate: {}".format(pivot_curr_pred))
            if pivot_curr_pred:
                for tupl in preds_next:
                    pred_comb = [pivot_curr_pred, tupl]
                    token_dict, pred_token1, pred_token2 = dict_pred_double(pred_comb, raw_sentence, 
                                                                                  fname, sentid_num, 
                                                                                    sentid_num_next)
                    global_tuples.append((token_dict, pred_token1, pred_token2))
                    sent_total+=1
       
    ## Create a dataframe from the global tuples dictionary
    dcts = [dct for dct, pred1_span, pred2_span in global_tuples]
    pred1_spans = [pred1_span for dct, pred1_span, pred2_span in global_tuples]
    pred2_spans = [pred2_span for dct, pred1_span, pred2_span in global_tuples]
                    
    df = pd.DataFrame(dcts)
    df['pred1_span'] = np.array(pred1_spans)
    df['pred2_span'] = np.array(pred2_spans)
    
    return df 


In [26]:
df = extract_dataframe("../input_data/sample_document.txt.output")

Number of sentences in the document: 23
Pivot predicate: ('1', Predicate(was/13))
Pivot predicate: ('2', Predicate(hole/4))
Pivot predicate: ('3', Predicate(took/16))
Pivot predicate: ('4', Predicate(construct/18))
Pivot predicate: ('5', Predicate(accessible/12))
Pivot predicate: ('6', Predicate(save/4))
Pivot predicate: ('7', Predicate(progressive/6))
Pivot predicate: ('8', Predicate(presented/4))
Pivot predicate: ('9', Predicate(Swiping/0))
Pivot predicate: ('10', Predicate(is/6))
Pivot predicate: ('11', Predicate(edit/24))
Pivot predicate: ('12', Predicate(frictionless/4))
Pivot predicate: ('13', Predicate(said/1))
Pivot predicate: ('14', Predicate(limited/3))
Pivot predicate: ('15', Predicate(email/7))
Pivot predicate: ('16', Predicate(function/5))
Pivot predicate: ('17', Predicate(expect/5))
Pivot predicate: ('18', Predicate(foresee/4))
Pivot predicate: ('19', Predicate(turn/25))
Pivot predicate: ('20', Predicate(scare/2))
Pivot predicate: ('21', Predicate(are/13))
Pivot predicate

In [27]:
df.head()

Unnamed: 0,pred1_root_token,pred1_text,pred1_token,pred2_root_token,pred2_text,pred2_token,sentence,sentence_id_1,sentence_id_2,pred1_span,pred2_span
0,4,Keep,4,8,launched,8,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[4],[8]
1,4,Keep,4,13,was,13,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[4],[13]
2,8,launched,8,13,was,13,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[8],[13]
3,13,was,13,4,was a glaring hole,1_2_3_4,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 2,[13],"[1, 2, 3, 4]"
4,13,was,13,6,considering,6,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 2,[13],[6]


In [28]:
out_data = df

## Extract Pred2 token positions in the combined sentence

In [29]:
def correct_pred2_root(row, struct_dict):
    if row.sentence_id_1 == row.sentence_id_2:
        return row.pred2_root_token
    else:
        sent_str, num = row.sentence_id_1.split(" ")
        sent_name = sent_str + " " + "sent_" + num
        
        return len(struct_dict[sent_name]) + row.pred2_root_token 
    
def correct_pred2_tokens(row, struct_dict):
    if row.sentence_id_1 == row.sentence_id_2:
        return row.pred2_token
    else:
        sent_str, num = row.sentence_id_1.split(" ")
        sent_name = sent_str + " " + "sent_" + num
        
        curr_posns = [int(x) for x in row.pred2_token.split("_")]
        new_posns = [len(struct_dict[sent_name]) + x for x in curr_posns]
        
        return "_".join([str(x) for x in new_posns])
    
df['pred2_token_mod'] = df.apply(lambda row: correct_pred2_tokens(row, struct_dict), axis=1)
df['pred2_root_token_mod'] = df.apply(lambda row: correct_pred2_root(row, struct_dict), axis=1)

#Convert tokens into list of numbers
df['pred1_token_span'] = df['pred1_token'].map(lambda x: [int(y) for y in x.split("_")])
df['pred2_token_span'] = df['pred2_token_mod'].map(lambda x: [int(y) for y in x.split("_")])

In [30]:
df.head()

Unnamed: 0,pred1_root_token,pred1_text,pred1_token,pred2_root_token,pred2_text,pred2_token,sentence,sentence_id_1,sentence_id_2,pred1_span,pred2_span,pred2_token_mod,pred2_root_token_mod,pred1_token_span,pred2_token_span
0,4,Keep,4,8,launched,8,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[4],[8],8,8,[4],[8]
1,4,Keep,4,13,was,13,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[4],[13],13,13,[4],[13]
2,8,launched,8,13,was,13,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[8],[13],13,13,[8],[13]
3,13,was,13,4,was a glaring hole,1_2_3_4,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 2,[13],"[1, 2, 3, 4]",22_23_24_25,25,[13],"[22, 23, 24, 25]"
4,13,was,13,6,considering,6,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 2,[13],[6],27,27,[13],[27]


In [31]:
def extract_X(data):
    sents = data.sentence.values
    structures = [x.split() for x in sents]
    root_idxs = data[['pred1_root_token', 'pred2_root_token_mod']].values
    span_idxs = data[['pred1_token_span', 'pred2_token_span']].values

    X_data = list(zip(structures, span_idxs, root_idxs))
    print("Data size: {}".format(len(X_data)))
    
    return X_data

In [32]:
X = extract_X(df)

Data size: 139


## Extract X data

In [33]:
print(X[3])

(['Before', 'the', 'arrival', 'of', 'Keep', ',', 'which', 'Google', 'launched', 'this', 'week', ',', 'there', 'was', 'no', 'default', 'note-taking', 'app', 'for', 'Android', '.', 'It', 'was', 'a', 'glaring', 'hole', ',', 'considering', 'that', 'Apple', "'s", 'iPhone', 'has', 'built-in', 'Notes', 'and', 'Reminders', 'apps', 'that', 'can', 'be', 'powered', 'by', 'Siri', '.'], array([list([13]), list([22, 23, 24, 25])], dtype=object), array([13, 25]))


## Predict using the model

### Load the best Model

In [34]:
squashed = True
baseline=False
loss_confidence = True

In [36]:
cuda_device_num = 1
cuda_device_str = "cuda:1"

model_path = "../model/"
def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")

In [37]:
file_path = "model_param_param_param_1_0_128_128_0_0_0_0_0.0_0.5_relu_1.pth"

tokens = file_path.split("_")
eventatt = tokens[1]
duratt = tokens[2]
relatt = tokens[3]
concat_fine_to_dur = str2bool(tokens[-8])
concat_dur_to_fine = str2bool(tokens[-7])
fine_2_dur = str2bool(tokens[-6])
dur_2_fine = str2bool(tokens[-5])
weight = float(tokens[-4])
drop = float(tokens[-3])
activ = tokens[-2]
bino_bool = str2bool(tokens[-1].split(".")[0])
#coarse_size = int(tokens[-1].split(".")[0])

print("Eventatt: {}, Duratt: {}, Relatt: {}, Dropout: {}, Activation: {}, Binomial: {}, concat_fine2dur: {}, concat_dur2fine:{}, fine_to_dur: {}, dur_to_fine: {} \n".format(
                                                                                                                        eventatt,
                                                                                                                        duratt,
                                                                                                                        relatt,
                                                                                                                        drop,
                                                                                                                        activ,
                                                                                                                        bino_bool,
                                                                                                                        concat_fine_to_dur,
                                                                                                                        concat_dur_to_fine,
                                                                                                                        fine_2_dur,
                                                                                                       dur_2_fine))
device = torch.device(cuda_device_str if torch.cuda.is_available() else "cpu")

best_model = TemporalModel(
                            embedding_size=1024, 
                            duration_distr = bino_bool,
                            elmo_class = ElmoEmbedder(options_file, weight_file, cuda_device=cuda_device_num),
                            mlp_dropout = drop,
                            mlp_activation= activ,
                            tune_embed_size=256,
                            event_attention=eventatt, 
                            dur_attention = duratt, 
                            rel_attention = relatt, 
                            concat_fine_to_dur  =concat_fine_to_dur,                      
                            concat_dur_to_fine = concat_dur_to_fine,
                            fine_to_dur = fine_2_dur,
                            dur_to_fine = dur_2_fine,
                            fine_squash = True,
                            baseline=False,
                            dur_MLP_sizes = [128], fine_MLP_sizes = [128],
                            dur_output_size = 11, fine_output_size = 4,
                            device= device)

best_model.load_state_dict(torch.load(model_path + file_path))
best_model.to(device)

Eventatt: param, Duratt: param, Relatt: param, Dropout: 0.5, Activation: relu, Binomial: True, concat_fine2dur: False, concat_dur2fine:False, fine_to_dur: False, dur_to_fine: False 



NameError: name 'ElmoEmbedder' is not defined

In [22]:
def predict_fine_dur_only(data_x, model, predict_batch_size = 80):
    '''
    Predict duration and coarse-grained relations
    '''
    # Turn on evaluation mode which disables dropout.
    model.eval()
    
    with torch.no_grad():  
        bidx_i = 0
        bidx_j = predict_batch_size
        total_obs = len(data_x)
        p1_dur_yhat = torch.zeros(total_obs, 11).to(model.device)
        p2_dur_yhat = torch.zeros(total_obs, 11).to(model.device)
        # coarse_yhat = torch.zeros(total_obs, 13).to(model.device)
        # coarser_yhat = torch.zeros(total_obs, 7).to(model.device)
        fine_yhat = torch.zeros(total_obs, 4).to(model.device)
        rel_yhat = torch.zeros(total_obs, 1280).to(model.device)

        while bidx_j < total_obs:
            words = [p for p,q,r in data_x[bidx_i:bidx_j]]
            spans = [q for p,q,r in data_x[bidx_i:bidx_j]]
            roots = [r for p,q,r in data_x[bidx_i:bidx_j]]
            predicts = model(words,spans, roots) 
            # print(predicts[0].size())
            # print(p1_dur_yhat[bidx_i:bidx_j].size())
            # print("\n")
            p1_dur_yhat[bidx_i:bidx_j] = predicts[0]
            p2_dur_yhat[bidx_i:bidx_j] = predicts[1]
            # coarse_yhat[bidx_i:bidx_j] = predicts[3]
            # coarser_yhat[bidx_i:bidx_j] = predicts[4]
            fine_yhat[bidx_i:bidx_j] = predicts[2]
            rel_yhat[bidx_i:bidx_j] = predicts[3]
            
            bidx_i = bidx_j
            bidx_j = bidx_i + predict_batch_size

            if bidx_j >= total_obs:
                words = [p for p,q,r in data_x[bidx_i:bidx_j]]
                spans = [q for p,q,r in data_x[bidx_i:bidx_j]]
                roots = [r for p,q,r in data_x[bidx_i:bidx_j]]
                predicts = model(words,spans, roots) 
                p1_dur_yhat[bidx_i:bidx_j] = predicts[0]
                p2_dur_yhat[bidx_i:bidx_j] = predicts[1]
                # coarse_yhat[bidx_i:bidx_j] = predicts[3]
                # coarser_yhat[bidx_i:bidx_j] = predicts[4]
                fine_yhat[bidx_i:bidx_j] = predicts[2]
                rel_yhat[bidx_i:bidx_j] = predicts[3]

        p1_dur_yhat = F.softmax(p1_dur_yhat, dim=1)
        p2_dur_yhat = F.softmax(p2_dur_yhat, dim=1)
        # coarse_yhat = F.softmax(coarse_yhat, dim=1)
        # coarser_yhat = F.softmax(coarser_yhat, dim=1)

        _ , p1_dur_yhat =  p1_dur_yhat.max(1)
        _ , p2_dur_yhat =  p2_dur_yhat.max(1)
        # _ , coarse_yhat =  coarse_yhat.max(1)
        # _ , coarser_yhat =  coarser_yhat.max(1)

    return p1_dur_yhat.detach(), p2_dur_yhat.detach(), fine_yhat.detach(), rel_yhat.detach()

In [23]:
p1_dur_yhat,p2_dur_yhat,fine_yhat,rel_yhat = predict_fine_dur_only(X, best_model)

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))


## Create a prediction dataset

In [37]:
df['pred1_duration'] = p1_dur_yhat.cpu().numpy()
df['pred2_duration'] = p2_dur_yhat.cpu().numpy()
df['b1'] = [b1 for b1,d1,b2,d2 in fine_yhat.cpu().numpy()]
df['d1'] = [d1 for b1,d1,b2,d2 in fine_yhat.cpu().numpy()]
df['e1'] = df['b1'] + df['d1']

df['b2'] = [b2 for b1,d1,b2,d2 in fine_yhat.cpu().numpy()]
df['d2'] = [d2 for b1,d1,b2,d2 in fine_yhat.cpu().numpy()]
df['e2'] = df['b2'] + df['d2']

df = df.drop(['d1', 'd2'], axis=1)

In [38]:
df['sent_pred_id1'] = df['sentence_id_1'] + " " + df['pred1_root_token'].map(lambda x: str(x))
df['sent_pred_id2'] = df['sentence_id_2'] + " " + df['pred2_root_token'].map(lambda x: str(x))

In [39]:
df.head()

Unnamed: 0,pred1_root_token,pred1_text,pred1_token,pred2_root_token,pred2_text,pred2_token,sentence,sentence_id_1,sentence_id_2,pred1_span,...,beg1,dur1,beg2,dur2,sent_pred_id1,sent_pred_id2,b1,b2,e1,e2
0,4,Keep,4,8,launched,8,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[4],...,0.0,1.0,0.138038,0.835664,sample_document.txt.output 1 4,sample_document.txt.output 1 8,0.0,0.138038,1.0,0.973703
1,4,Keep,4,13,was,13,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[4],...,0.0,0.772453,0.051323,0.948677,sample_document.txt.output 1 4,sample_document.txt.output 1 13,0.0,0.051323,0.772453,1.0
2,8,launched,8,13,was,13,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 1,[8],...,0.0,0.683801,0.350981,0.649019,sample_document.txt.output 1 8,sample_document.txt.output 1 13,0.0,0.350981,0.683801,1.0
3,13,was,13,4,was a glaring hole,1_2_3_4,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 2,[13],...,0.0,0.713067,0.182615,0.817385,sample_document.txt.output 1 13,sample_document.txt.output 2 4,0.0,0.182615,0.713067,1.0
4,13,was,13,6,considering,6,"Before the arrival of Keep , which Google laun...",sample_document.txt.output 1,sample_document.txt.output 2,[13],...,0.0,0.772952,0.370259,0.629741,sample_document.txt.output 1 13,sample_document.txt.output 2 6,0.0,0.370259,0.772952,1.0


## Document Timeline

In [27]:
class TimelineModel(torch.nn.Module):
    '''
     A class to extract a simple timeline model from a
     given document's predicate-pair data
    '''
    def __init__(self,
                 data = None,
                 num_preds = None, 
                 mlp_activation='relu',
                 mlp_dropout=0.0,
                 optimizer_class = torch.optim.Adam,
                  dur_output_size = 11, fine_output_size = 4,
                device=device,
                **kwargs):
        super().__init__()

        self.device = device
        self.linear_maps = nn.ModuleDict()
        self.mlp_activation = mlp_activation
        self.mlp_dropout =  nn.Dropout(mlp_dropout) 
        self.dur_output_size = dur_output_size
        
        ## Parameters
            # Hidden predicate representations
        self.pred_tensor = torch.nn.Parameter(torch.randn(num_preds,2), requires_grad=True)
            # Binomial parameter
        self.k = torch.nn.Parameter(torch.randn(1), requires_grad=True)
        
        self.params = nn.ParameterList()
        self.params.extend([self.pred_tensor, self.k])
        
        self._optimizer_class = optimizer_class
        
        ## Losses Initialization
        self.fine_loss = L1Loss().to(self.device)
        self.duration_loss = CrossEntropyLoss().to(self.device)

        
    def _init_MLP(self, input_size, hidden_sizes, output_size, param=None):
        '''
        Initialise MLP or regression parameters
        '''
        self.linear_maps[param] = nn.ModuleList()

        for h in hidden_sizes:
            linmap = torch.nn.Linear(input_size, h)
            linmap = linmap.to(self.device)
            self.linear_maps[param].append(linmap)
            input_size = h

        linmap = torch.nn.Linear(input_size, output_size)
        linmap = linmap.to(self.device)
        self.linear_maps[param].append(linmap)
        
    def forward(self, local_data, **kwargs):
        '''
        INput: dataframe with cols:
                b1, e1, b2, e2, pred1_dict_idx, pred2_dict_idx
                
        Output: 
        '''
        t_sq = self.pred_tensor**2 
        num_preds= t_sq.size()[0]
        anchored_tensor = torch.zeros(num_preds,2).to(self.device)
        
        anchored_tensor[:,0] = t_sq[:,0] - t_sq[:,0].min()
        anchored_tensor[:,1] = t_sq[:,1]
        
        #Predicted fine-grained values for the given document
        b1 = anchored_tensor[local_data.pred1_dict_idx.values][:,0]
        dur1 = anchored_tensor[local_data.pred1_dict_idx.values][:,1]
        b2 = anchored_tensor[local_data.pred2_dict_idx.values][:,0]
        dur2 = anchored_tensor[local_data.pred2_dict_idx.values][:,1]
        
        batch_size = b1.size()[0]
        #print(batch_size)
                
        pred1_dur = self._binomial_dist(dur1)
        pred2_dur = self._binomial_dist(dur2)
        
        yhat = (b1, dur1, b2, dur2, pred1_dur, pred2_dur,
                anchored_tensor)
        
        return yhat
    
    def fit(self, local_data, epochs=5000, **kwargs):
        losses = [10000]
        
        print("#### Model Parameters ####")
        for name,param in self.named_parameters():     
            if param.requires_grad:
                print(name, param.shape) 
        print("##########################") 
        parameters = [p for p in self.parameters() if p.requires_grad]
        optimizer = self._optimizer_class(parameters)
        
        #Actual ground truth values
        b1_lst = local_data.b1.values
        e1_lst = local_data.e1.values
        b2_lst = local_data.b2.values
        e2_lst = local_data.e2.values
        durations = [local_data.pred1_duration.values,
                     local_data.pred2_duration.values]

        
        #pbar = tqdm_n(total = total_obs//self.train_batch_size)
        
        for epoch in tqdm_n(range(epochs)):
            preds = self(local_data)
            #zero_grad
            optimizer.zero_grad()
            curr_loss = self._custom_loss(preds,
                                         b1_lst,
                                         e1_lst,
                                         b2_lst,
                                         e2_lst,
                                         durations)
            
            curr_loss.backward()
            optimizer.step()
            
            if epoch==0:
                print("Epoch: {}, Loss: {}".format(epoch+1, curr_loss))
            
            #print("Epoch: {}, Loss: {}".format(epoch+1, curr_loss))
               
            ## Stop training when loss converges
            if abs(curr_loss.detach() - losses[-1]) < 0.00001:
                #print("Epoch: {}, Converging-Loss: {}".format(epoch+1, curr_loss))
                break
                
            #pbar.update(1)
                
            losses.append(curr_loss.detach())
        #pbar.close()
        print("Epoch: {}, Converging-Loss: {}".format(epoch+1, curr_loss))
                
        return self.predict(preds)
        
    def _custom_loss(self, preds, b1_lst, e1_lst, b2_lst,
                            e2_lst,durations):
        ## Predictions
        b1_pred, dur1_pred, b2_pred, dur2_pred = preds[0], preds[1], preds[2], preds[3]
        out_p1_d, out_p2_d, anchored_tensor = preds[4], preds[5], preds[6]
#         out_coarse, out_coarser = preds[7], preds[8]
        
        ## Ground truth values:
        b1_act, e1_act, b2_act, e2_act = self._lsts_to_tensors(b1_lst, e1_lst, b2_lst, e2_lst,
                                        param="float")
        ## Store actual_y into tensors
        pred1_durs, pred2_durs = durations

        pred1_durs, pred2_durs = self._lsts_to_tensors(pred1_durs,pred2_durs)
        
        ## Duration Losses
        L5_p1 = self.duration_loss(out_p1_d, pred1_durs)
        L5_p2 = self.duration_loss(out_p2_d, pred2_durs)
        #print("L5_p1 {},  L5_p2: {}".format(L5_p1, L5_p2))
        
        ## Coarse Loss
        #L7 = self.coarse_loss(out_coarse, time_ml_coarse)
        #print("L7: {}".format(L7))
        
        ## Coarser Loss
        #L8 = self.coarser_loss(out_coarser, time_ml_coarser)
        #print("L8: {}".format(L8))
        
        ## Normalize predicted fine-grained values:
        num_pairs = b1_pred.size()[0]
        t = torch.zeros(num_pairs,4).to(self.device)
        t[:,0] = b1_pred
        t[:,1] = b1_pred + dur1_pred
        t[:,2] = b2_pred
        t[:,3] = b2_pred + dur2_pred
        
    
        t_min, _ = torch.min(t,dim=1)
        t_min = t_min.unsqueeze(1).repeat(1,4)  #add extra dimension
        t_adj = t - t_min
        t_adj_max, _ = torch.max(t_adj,dim=1)
        t_adj_max = t_adj_max.unsqueeze(1).repeat(1,4)
        t_normalized = t_adj/t_adj_max
        
        ## Fine-grained Losses
        l1 = self.fine_loss(t_normalized[:,0]-t_normalized[:,2], b1_act-b2_act)
        l2 = self.fine_loss(t_normalized[:,1]-t_normalized[:,2], e1_act-b2_act)
        l3 = self.fine_loss(t_normalized[:,3]-t_normalized[:,0], e2_act-b1_act)
        l4 = self.fine_loss(t_normalized[:,1]-t_normalized[:,3], e1_act-e2_act)
        
        L1to4 = sum([l1, l2, l3, l4])/4 
           
        #L5_p1, L5_p2 = 0,0 
        
        #print("L1to4: {}".format(L1to4))
        
        dur = (L5_p1+L5_p2)/2
        fine = L1to4
        beta=2.0
        
        total_loss = (sum([dur, beta*fine])/2)
        
        return total_loss
            
    def _lsts_to_tensors(self, *args, param=None):
        '''
        Input: list1, list2,......

        Output: [Tensor(list1), tensor(list2),....]

        '''
        if param=="float":
            return [torch.from_numpy(np.array(arg)).float().to(self.device) for arg in args]
        else:
            return [torch.from_numpy(np.array(arg, dtype="int64")).to(self.device) for arg in args]
        
    def predict(self, preds):
        b1_pred, dur1_pred, b2_pred, dur2_pred = preds[0], preds[1], preds[2], preds[3]
        pred_timeline =  preds[6]
        
        ## Normalize predicted values:
        num_pairs = b1_pred.size()[0]
        t = torch.zeros(num_pairs,4).to(self.device)
        t[:,0] = b1_pred
        t[:,1] = b1_pred + dur1_pred
        t[:,2] = b2_pred
        t[:,3] = b2_pred + dur2_pred
        
        t_min, _ = torch.min(t,dim=1)
        t_min = t_min.unsqueeze(1).repeat(1,4)  #add extra dimension
        t_adj = t - t_min
        t_adj_max, _ = torch.max(t_adj,dim=1)
        t_adj_max = t_adj_max.unsqueeze(1).repeat(1,4)
        t_normalized = t_adj/t_adj_max
        t_normalized = t_normalized.detach().cpu().numpy()
        
        return t_normalized[:,0],t_normalized[:,1], t_normalized[:,2], t_normalized[:,3], pred_timeline.detach().cpu().numpy()
    
    def _binomial_dist(self, pred_dur):
        '''
        *** Vectorized implementation ***
        Input: A tensor with dimension: batch_size x 1
        Output: A tensor with dimension: batch_size x 11 
        Binomial Prob distribution for a given duration value 
        '''
        pred_dur = torch.sigmoid((self.k)*(torch.log(pred_dur)))
    
        bin_class = Binomial(total_count=self.dur_output_size-1, probs=pred_dur)
        durations = torch.tensor(range(self.dur_output_size), dtype=torch.float).to(self.device)
        
        return self._log_prob_vectorized(bin_class, durations)
        
    def _log_prob_vectorized(self, bin_class, value):
        '''
        1. bin_class: Pytorch Binomial distribution class 
        2. Value is a tensor with size: [total_count+1]
        '''
        batch_size = bin_class.total_count.size()[0]

        value = value.repeat(batch_size,1)
        #print(value.size())

        bin_class.logits = bin_class.logits.repeat(11,1).permute(1,0)
        #print(bin_class.logits.size())

        bin_class.total_count = bin_class.total_count.repeat(11,1).permute(1,0)
        #print(bin_class.total_count.size())

        log_factorial_n = torch.lgamma(bin_class.total_count + 1)
        log_factorial_k = torch.lgamma(value + 1)
        log_factorial_nmk = torch.lgamma(bin_class.total_count - value + 1)
        max_val = (-bin_class.logits).clamp(min=0.0)
        # Note that: torch.log1p(-bin_class.probs)) = max_val - torch.log1p((bin_class.logits + 2 * max_val).exp()))

        return (log_factorial_n - log_factorial_k - log_factorial_nmk +
                value * bin_class.logits + bin_class.total_count * max_val -
                bin_class.total_count * torch.log1p((bin_class.logits + 2 * max_val).exp()))

In [47]:
def extract_preds(data):
    '''
    Extracts a dict of predicates for a given docid data
    Key: pred_sent_id
    Value: predicate-index
    '''
    cols = ['sent_pred_id1', 'sent_pred_id2', 'b1', 'e1', 'b2', 'e2', 
            'pred1_duration', 'pred2_duration', 
            'pred1_text', 'pred2_text']
    
    local_data = data[cols]
    preds_arr = local_data[['sent_pred_id1', 'sent_pred_id2']].values
    uniq_preds = np.unique(preds_arr.flatten())
    
    pred_dict = {}
    idx=0
    for pred in uniq_preds:
        pred_dict[pred]=idx
        idx+=1
        
    local_data['pred1_dict_idx'] = local_data['sent_pred_id1'].map(lambda x: pred_dict[x])
    local_data['pred2_dict_idx'] = local_data['sent_pred_id2'].map(lambda x: pred_dict[x])
        
    return pred_dict, idx, local_data

In [48]:
pred_dict, num_preds, local_data = extract_preds(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [49]:
local_data

Unnamed: 0,sent_pred_id1,sent_pred_id2,b1,e1,b2,e2,pred1_duration,pred2_duration,pred1_text,pred2_text,pred1_dict_idx,pred2_dict_idx
0,sample_document.txt.output 1 4,sample_document.txt.output 1 8,0.0,1.000000,0.138038,0.973703,4,3,Keep,launched,1,2
1,sample_document.txt.output 1 4,sample_document.txt.output 1 13,0.0,0.772453,0.051323,1.000000,4,3,Keep,was,1,0
2,sample_document.txt.output 1 8,sample_document.txt.output 1 13,0.0,0.683801,0.350981,1.000000,3,3,launched,was,2,0
3,sample_document.txt.output 1 13,sample_document.txt.output 2 4,0.0,0.713067,0.182615,1.000000,3,4,was,was a glaring hole,0,37
4,sample_document.txt.output 1 13,sample_document.txt.output 2 6,0.0,0.772952,0.370259,1.000000,3,3,was,considering,0,38
5,sample_document.txt.output 1 13,sample_document.txt.output 2 11,0.0,0.700707,0.054357,1.000000,3,5,was,has,0,35
6,sample_document.txt.output 1 13,sample_document.txt.output 2 20,0.0,0.721205,0.134588,1.000000,3,4,was,powered,0,36
7,sample_document.txt.output 2 4,sample_document.txt.output 2 6,0.0,0.929649,0.360196,1.000000,4,2,was a glaring hole,considering,37,38
8,sample_document.txt.output 2 4,sample_document.txt.output 2 11,0.0,0.792010,0.049051,1.000000,4,5,was a glaring hole,has,37,35
9,sample_document.txt.output 2 4,sample_document.txt.output 2 20,0.0,0.859625,0.156823,1.000000,4,4,was a glaring hole,powered,37,36


In [74]:
## Run Timeline Model on current docid's data
model = TimelineModel(data = local_data,
         num_preds = num_preds,
        device=torch.device(type="cpu"))

pred_b1, pred_e1, pred_b2, pred_e2, pred_timeline  = model.fit(local_data, epochs=5000)

#### Model Parameters ####
pred_tensor torch.Size([65, 2])
k torch.Size([1])
##########################


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Epoch: 1, Loss: 9.886517524719238

Epoch: 4630, Converging-Loss: 0.8108711242675781


In [75]:
def extract_pred_text(lst, data):
    '''
    Input: A list of sent_pred tokens
    Output: A list of predicate text
    '''
    ans = []
    for sent_pred in lst:
        try:
            pred_text = data[(data.sent_pred_id1==sent_pred)]['pred1_text'].values[0]
            ans.append(pred_text)
        except:
            pred_text = data[(data.sent_pred_id2==sent_pred)]['pred2_text'].values[0]
            ans.append(pred_text)
            
    return ans

preds_arr = local_data[['sent_pred_id1', 'sent_pred_id2']].values
uniq_preds = np.unique(preds_arr.flatten())
#print(uniq_preds)

preds_text = extract_pred_text(uniq_preds, local_data)

ans_df = pd.DataFrame(data=pred_timeline, 
                     columns=['start_pt', 'duration'])
ans_df['sent_pred_id'] = uniq_preds
ans_df['pred_text'] = preds_text

In [77]:
ans_df.head(40)

Unnamed: 0,start_pt,duration,sent_pred_id,pred_text
0,0.1119858,1.695792,sample_document.txt.output 1 13,was
1,0.02022708,1.381148,sample_document.txt.output 1 4,Keep
2,0.0,1.514682,sample_document.txt.output 1 8,launched
3,3.955912,1.8272,sample_document.txt.output 10 12,serves
4,3.002516,2.072986,sample_document.txt.output 10 6,is
5,3.09087,2.587976,sample_document.txt.output 11 2,viewing
6,4.979735,1.893597,sample_document.txt.output 11 21,allow
7,5.718867,2.573961,sample_document.txt.output 11 24,edit
8,3.845518,2.494445,sample_document.txt.output 11 7,tapping
9,1.6908,1.434886,sample_document.txt.output 12 4,is frictionless


In [None]:
ans_df.to_json()