# Find all explicit connective information

Finding all explicit connectives from the gold data (`relations.json`):

In [59]:
import json
from collections import Counter
from nltk.tree import Tree

## Set correct filenames and paths

In [164]:
# Path to the JSON file with the parses; opened by the lookup functions in this notebook.
parsefile = "conll16st-en-01-12-16-trial/parses.json"
# Path to the gold relations file; read line by line, one JSON object per line.
relationfile = "conll16st-en-01-12-16-trial/relations.json"

In [None]:
def get_part_of_speech(docID, find_word, span):
    """
    Get PoS from the parses.json file by filename and character span.

    Parameters:
        docID: key of the document inside the parses file.
        find_word: surface form of the word. Kept so existing callers
            keep working; the lookup itself only uses ``span``.
        span: pair ``(begin, end)`` of character offsets of the word.

    Returns:
        The "PartOfSpeech" value of the first word whose
        "CharacterOffsetBegin" and "CharacterOffsetEnd" equal ``span``,
        or ``None`` when no word of the document has exactly that span.
    """

    # The span does not change during the search, so unpack it once.
    begin, end = span

    # The file is only needed for loading; release it before searching.
    with open(parsefile, encoding="utf-8") as infile:
        parse_dict = json.load(infile)

    for sentence in parse_dict[docID]["sentences"]:
        for word_data in sentence["words"]:
            # word_data[0] is the token text, word_data[1] its attribute dict.
            word_info = word_data[1]

            if (word_info["CharacterOffsetBegin"] == begin
                    and word_info["CharacterOffsetEnd"] == end):
                return word_info["PartOfSpeech"]

    # No word with this exact span was found.
    return None

In [None]:
def get_phrase_structure(docID, sentenceID):
    """
    Look up the parse tree of one sentence in the parses.json file.

    docID selects the document and sentenceID is the index into that
    document's "sentences" list. The "parsetree" entry of the selected
    sentence is returned unchanged.
    """

    with open(parsefile, encoding="utf-8") as parse_handle:
        all_parses = json.load(parse_handle)

    return all_parses[docID]["sentences"][sentenceID]["parsetree"]

In [162]:
def get_connective_dependency(docID, sentenceID, part_of_speech, connective):
    """
    Return heading and attached dependencies of connectives.

    Parameters:
        docID: key of the document inside the parses file.
        sentenceID: index of the sentence in the document's "sentences" list.
        part_of_speech: not used by the lookup; kept so existing callers
            keep working.
        connective: token text to look for in the sentence's dependencies.

    Returns:
        A pair ``(dependency_heading, dependency_attached)``:
        the head entry of a dependency whose dependent token equals
        ``connective``, and the dependent entry of a dependency whose head
        token equals ``connective``. Each is "_" when no such dependency
        exists. If several dependencies match, the last one wins.
    """

    dependency_heading = "_"
    dependency_attached = "_"

    with open(parsefile, encoding="utf-8") as infile:
        parse_dict = json.load(infile)

    dependencylist = parse_dict[docID]["sentences"][sentenceID]["dependencies"]

    # Each dependency is a (label, head, dependent) triple; the label is not needed.
    for _, heading, attached in dependencylist:
        # Head and dependent are written as "token-index". Split on the LAST
        # hyphen only, so hyphenated tokens such as "so-called-4" keep their
        # full text instead of being cut down to "so".
        heading_token = heading.rsplit("-", 1)[0]
        attached_token = attached.rsplit("-", 1)[0]

        if attached_token == connective:
            dependency_heading = heading

        if heading_token == connective:
            dependency_attached = attached

    return dependency_heading, dependency_attached


Since we only focus on Arg1 arguments that occur either in the sentence immediately preceding the connective or in the same sentence as the connective, this function returns `PS` (previous sentence) or `SS` (same sentence), and `None` in every other case.

In [156]:
def PS_or_SS(arg1_sentenceID, sentenceID):
    """
    Classify Arg1 by where its sentence lies relative to the connective.

    Gives "SS" when both sentence IDs are equal, "PS" when Arg1 is in the
    sentence directly before the connective's sentence, and None otherwise.
    """

    in_same_sentence = arg1_sentenceID == sentenceID
    if in_same_sentence:
        return "SS"

    in_previous_sentence = arg1_sentenceID == sentenceID - 1
    return "PS" if in_previous_sentence else None
    
    

In [171]:
# Collect the lower-cased text of every explicit connective in this list
# (it is counted in a later cell) and print one row of features per connective.
connectivelist = []

# Iterate through the relations file; each line is parsed as one JSON object.
with open(relationfile, encoding="utf-8") as infile:

    # NOTE(review): relationID is the zero-based line number from enumerate(),
    # not a value read from the relation itself -- confirm this is intended.
    for relationID, json_line in enumerate(infile):
        relation = json.loads(json_line)
        
        connective = relation["Connective"]["RawText"]
        connective_type = relation["Type"]
        
        # Only explicit connectives are processed; all other types are skipped.
        if connective_type != "Explicit":
            continue
            
        docID = relation["DocID"]       
        
        tokenlist = relation["Connective"]["TokenList"]
        # Only the first listed sense is used.
        sense = relation["Sense"][0]
        
        # Only the first character span and the first token of the connective
        # are used, even if the connective has more than one.
        connective_extent = relation["Connective"]["CharacterSpanList"][0]
        # Going by the variable names, positions 2, 3 and 4 of a token entry
        # hold the token ID, the sentence ID and the position in the sentence
        # -- TODO confirm against the data format.
        tokenID = tokenlist[0][2]
        sentenceID = tokenlist[0][3]
        sentence_position = tokenlist[0][4]
        
        # Arg1: first character span and the sentence ID of its first token.
        arg1 = relation["Arg1"]    
        arg1_extent = arg1["CharacterSpanList"][0]
        arg1_sentenceID = arg1["TokenList"][0][3]
        arg1_type = PS_or_SS(arg1_sentenceID, sentenceID) # "SS", "PS" or None
        
        # If Arg1 is of type 'PS', fetch the phrase structure tree of its sentence.
        # The value is only consumed by the commented-out print argument below.
        if arg1_type == "PS":
            arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
        else:
            arg1_phrase_structure = None
        
        # Arg2: first character span and the sentence ID of its first token.
        arg2 = relation["Arg2"]
        arg2_extent = arg2["CharacterSpanList"][0]
        # NOTE(review): arg2_sentenceID is assigned but not used in this cell.
        arg2_sentenceID = arg2["TokenList"][0][3]
        
        
        # Look up the PoS tag of the word with exactly the connective's span.
        part_of_speech = get_part_of_speech(docID, connective, connective_extent)
        
        # Only consumed by the commented-out print argument below.
        phrase_structure = get_phrase_structure(docID, sentenceID)
        
        dependency_heading, dependency_attached = get_connective_dependency(docID, sentenceID, part_of_speech, connective)
        
        # Lower-case so that e.g. "But" and "but" are counted together.
        connectivelist.append(connective.lower())
        
        
        # OUTPUT: one space-separated row per explicit connective;
        # this somewhat resembles the format in the gdocs file.
        print(
            docID, #filename
            tokenID, #unique token ID
            sentenceID, #sentenceID
            sentence_position, #position in sentence
            connective, #token
            connective_extent, #extent connective
            part_of_speech, #PoS
#           lemmas
            dependency_heading, #what is the head of the connective
            dependency_attached, #what is attached to the connective
#           phrase_structure, #phrase structure
            arg1_extent, #extent arg1
            arg1_type, #PS or SS
#           arg1_phrase_structure, #arg1 phrase structure
            arg2_extent, #extent arg2
            sense, #meaning
            relationID #discourse relation ID
             )

wsj_1000 160 5 0 But [879, 882] CC moved-13 _ [783, 877] PS [883, 957] Comparison.Contrast 1
wsj_1000 412 13 7 and [2227, 2230] CC work-6 _ [2197, 2225] SS [2231, 2265] Expansion.Conjunction 7
wsj_1000 421 14 0 While [2269, 2274] IN explained-4 _ [2389, 2445] SS [2275, 2386] Comparison.Concession 8
wsj_1000 471 15 14 because [2518, 2525] IN ruining-18 _ [2493, 2517] SS [2526, 2552] Contingency.Cause.Reason 10
wsj_1000 486 16 8 so [2577, 2579] IN said-2 _ [2554, 2573] SS [2580, 2636] Contingency.Cause.Result 12
wsj_1000 502 17 1 also [2648, 2652] RB blasted-3 _ [9, 179] None [2641, 2647] Expansion.Conjunction 13
wsj_1000 558 20 3 also [2939, 2943] RB has-5 _ [2878, 2923] PS [2925, 2938] Expansion.Conjunction 16
wsj_1000 632 22 20 but [3332, 3335] CC is-7 _ [3282, 3330] SS [3336, 3367] Comparison.Contrast 18
wsj_1000 677 24 9 but [3545, 3548] CC disturbs-7 _ [3507, 3543] SS [3549, 3654] Comparison.Contrast 20
wsj_1000 765 28 6 when [4004, 4008] WRB is-10 _ [3974, 4003] SS [4009, 4035] Co

## List of explicit connectives and their frequencies

In [168]:
# Count how often each lower-cased connective text occurs in connectivelist.
connectives = Counter(connectivelist)

# Bare expression as the last line of the cell, so the notebook displays the counter.
connectives

Counter({'also': 2,
         'and': 2,
         'because': 1,
         'but': 4,
         'if then': 1,
         'so': 1,
         'when': 1,
         'while': 1})