# Find all explicit connective information

Finding all explicit connectives from the gold data (`relations.json`):

In [25]:
import json
from collections import Counter
import sklearn
import csv

## Set correct filenames and paths

In [70]:
parsefile = "conll16st-en-01-12-16-dev/parses.json"
relationfile = "conll16st-en-01-12-16-dev/relations.json"

In [53]:
def get_part_of_speech(docID, find_word, span):
    """
    Get PoS from the parses.json file by filename, word and span.
    """
    
    with open(parsefile, encoding="utf-8") as infile:
        parse_dict = json.load(infile)
        
        #enumerate to get sentenceID
        sentencelist = enumerate(parse_dict[docID]["sentences"])
        
        for sentenceID, sentence in sentencelist:
            for word_data in sentence["words"]:
                
                word = word_data[0]                
                off_begin = word_data[1]["CharacterOffsetBegin"]
                off_end = word_data[1]["CharacterOffsetEnd"]
                part_of_speech = word_data[1]["PartOfSpeech"]
                
                begin, end = span
                
                if off_begin == begin and off_end == end:
                    return part_of_speech

In [54]:
def get_phrase_structure(docID, sentenceID):
    """
    Retrieve phrase_structure from the parses.json file by filename and sentenceID.
    """
    
    with open(parsefile, encoding="utf-8") as infile:
        parse_dict = json.load(infile)
        
    sentencelist = parse_dict[docID]["sentences"]
    
    phrase_structure = sentencelist[sentenceID]["parsetree"]
    
    return phrase_structure

In [55]:
def get_connective_dependency(docID, sentenceID, part_of_speech, connective):
    """
    Return heading and attached dependencies of connectives. 
    """

    dependency_heading = "_"
    dependency_attached = "_"
    
    with open(parsefile, encoding="utf-8") as infile:
        parse_dict = json.load(infile)
        
    sentencelist = parse_dict[docID]["sentences"]
    
    dependencylist = sentencelist[sentenceID]["dependencies"]
    
    for dependency in dependencylist:
        pos, heading, attached = dependency
        
        heading_token, headingID = heading.split("-")[:2]
        attached_token, attachedID = attached.split("-")[:2]
        
        if attached_token == connective:
            dependency_heading = heading
            
        if heading_token == connective:
            dependency_attached = attached
            
    return dependency_heading, dependency_attached      


Since we are only focussing on Arg1-arguments in the immediate previous sentence from the connective or the same sentence as the connective, this function returns `PS` or `SS` and `None` if the argument is further away.

In [56]:
def PS_or_SS(arg1_sentenceID, sentenceID):
    """
    Returns Arg1 type (PS or SS) based on sentenceIDs.
    """

    if arg1_sentenceID == sentenceID:
        return "SS"
    elif arg1_sentenceID == sentenceID-1:
        return "PS"
    
    

## Only for building a list of connectives from the `relations.json` data

In [57]:
#To find all the connectives and store them in a list
connectivelist = []
tuplelist = []

#iterate through the file:
with open(relationfile, encoding="utf-8") as infile:

    for relationID, json_line in enumerate(infile):
        relation = json.loads(json_line)
        
        connective = relation["Connective"]["RawText"]
        connective_type = relation["Type"]
        
        #only for explicit connectives
        if connective_type != "Explicit":
            continue
        
        connectivelist.append(connective.lower())
        

        

## The same, but more

In [None]:
#To find all the connectives and store them in a list
connectivelist = []
tuplelist = []

#iterate through the file:
with open(relationfile, encoding="utf-8") as infile:

    for relationID, json_line in enumerate(infile):
        relation = json.loads(json_line)
        
        connective = relation["Connective"]["RawText"]
        connective_type = relation["Type"]
        
        #only for explicit connectives
        if connective_type != "Explicit":
            continue
            
        docID = relation["DocID"]       
        
        tokenlist = relation["Connective"]["TokenList"]
        sense = relation["Sense"][0]
        
        connective_extent = relation["Connective"]["CharacterSpanList"][0]
        tokenID = tokenlist[0][2]
        sentenceID = tokenlist[0][3]
        sentence_position = tokenlist[0][4]
        
        arg1 = relation["Arg1"]    
        arg1_extent = arg1["CharacterSpanList"][0]
        arg1_sentenceID = arg1["TokenList"][0][3]
        arg1_type = PS_or_SS(arg1_sentenceID, sentenceID) #get arg1_type
        
        # if Arg1 is of type 'PS', give the phrase structure tree of its sentenceID
        if arg1_type == "PS":
            arg1_phrase_structure = get_phrase_structure(docID, arg1_sentenceID)
        else:
            arg1_phrase_structure = None
        
        arg2 = relation["Arg2"]
        arg2_extent = arg2["CharacterSpanList"][0]
        arg2_sentenceID = arg2["TokenList"][0][3]
        
        
        part_of_speech = get_part_of_speech(docID, connective, connective_extent)
        
        phrase_structure = get_phrase_structure(docID, sentenceID)
        
        dependency_heading, dependency_attached = get_connective_dependency(docID, sentenceID, part_of_speech, connective)
        
        connectivelist.append(connective.lower())
        
        
        #OUTPUT, this somewhat resembles the format in the gdocs file
        print(
            docID, #filename
            tokenID, #unique token ID
            sentenceID, #sentenceID
            sentence_position, #position in sentence
            connective, #token
            connective_extent, #extent connective
            part_of_speech, #PoS
            dependency_heading, #what is the head of the connective
            dependency_attached, #what is attached to the connective
#           phrase_structure, #phrase structure
            arg1_extent, #extent arg1
            arg1_type, #PS or SS
#           arg1_phrase_structure, #arg1 phrase structure
            arg2_extent, #extent arg2
            sense, #meaning
            relationID #discourse relation ID
             )

        tuplelist.append(({'connective': connective, 'pos': part_of_speech, 'position': sentence_position}, 1))
        

## List of explicit connectives and their frequencies

In [66]:
# connectives and frequencies
connectives = Counter(connectivelist).most_common(50)

#connectives

list_of_surface_forms = [x for x,y in connectives]

print(list_of_surface_forms)

['but', 'and', 'also', 'if', 'when', 'while', 'because', 'as', 'after', 'however', 'although', 'then', 'before', 'so', 'though', 'meanwhile', 'for example', 'still', 'since', 'in addition', 'until', 'instead', 'thus', 'yet', 'moreover', 'indeed', 'unless', 'even though', 'for instance', 'later', 'once', 'or', 'even if', 'in fact', 'as a result', 'separately', 'previously', 'if then', 'finally', 'nevertheless', 'on the other hand', 'in turn', 'nor', 'by contrast', 'otherwise', 'nonetheless', 'so that', 'therefore', 'as long as', 'now that']


In [46]:
tuplelist

[({'connective': 'unless', 'pos': 'IN', 'position': 12}, 1),
 ({'connective': 'But', 'pos': 'CC', 'position': 0}, 1),
 ({'connective': 'also', 'pos': 'RB', 'position': 3}, 1),
 ({'connective': 'until', 'pos': 'IN', 'position': 20}, 1),
 ({'connective': 'as', 'pos': 'IN', 'position': 6}, 1),
 ({'connective': 'and', 'pos': 'CC', 'position': 14}, 1),
 ({'connective': 'until', 'pos': 'IN', 'position': 18}, 1),
 ({'connective': 'until', 'pos': 'IN', 'position': 6}, 1),
 ({'connective': 'before', 'pos': 'IN', 'position': 13}, 1),
 ({'connective': 'Moreover', 'pos': 'RB', 'position': 0}, 1),
 ({'connective': 'but', 'pos': 'CC', 'position': 16}, 1),
 ({'connective': 'after', 'pos': 'IN', 'position': 21}, 1),
 ({'connective': 'But', 'pos': 'CC', 'position': 0}, 1),
 ({'connective': 'and', 'pos': 'CC', 'position': 20}, 1),
 ({'connective': 'if', 'pos': 'IN', 'position': 17}, 1),
 ({'connective': 'when', 'pos': 'WRB', 'position': 15}, 1),
 ({'connective': 'for example', 'pos': None, 'position': 2

# NLTK

In [68]:
from nltk.classify import maxent

# train = [
#     ({'a': 1, 'b': 1, 'c': 1}, 'y'),
#     ({'a': 5, 'b': 5, 'c': 5}, 'x'),
#     ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'),
#     ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'),
#     ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'),
#     ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x')
# ]

train = [({'connective': 'unless', 'pos': 'IN', 'position': 12, 'bla': [1,2,3]}, 1),
 ({'connective': 'But', 'pos': 'CC', 'position': 0, 'bla': [1,3]}, 1),
 ({'connective': 'also', 'pos': 'RB', 'position': 3, 'bla': [1,2,3]}, 1),
 ({'connective': 'until', 'pos': 'IN', 'position': 20, 'bla': [1,2,3]}, 1),
 ({'connective': 'as', 'pos': 'IN', 'position': 6, 'bla': [1,2,3]}, 1),
 ({'connective': 'and', 'pos': 'CC', 'position': 14, 'bla': [1,2,3]}, 1),
 ({'connective': 'until', 'pos': 'IN', 'position': 18, 'bla': [1,2,3]}, 1),
 ({'connective': 'until', 'pos': 'IN', 'position': 6, 'bla': [1,2,3]}, 1),
 ({'connective': 'before', 'pos': 'IN', 'position': 13, 'bla': [1,2,3]}, 1),
 ({'connective': 'Moreover', 'pos': 'RB', 'position': 0, 'bla': [1,2,3]}, 1),
 ({'connective': 'Test', 'pos': 'XX', 'position': 90, 'bla': [1,2,3,6,7]}, 0)
        ]

# test = [
#     {'a': 5.2, 'b': 5.1, 'c': 5},
#     {'a': 1, 'b': 0.8, 'c': 1.2},
#     {'a': 5.2, 'b': 5.1, 'c': 5}
#     ]

test =  [
    {'connective': 'until', 'pos': 'IN', 'position': 18},
    {'connective': 'until', 'pos': 'IN', 'position': 6},
    {'connective': 'before', 'pos': 'IN', 'position': 13},
    {'connective': 'Moreover', 'pos': 'RB', 'position': 0},
    {'connective': 'but', 'pos': 'CC', 'position': 16},
    {'connective': 'after', 'pos': 'IN', 'position': 21},
    {'connective': 'Testing', 'pos': 'XXX', 'position': 90}
  ]

encoding = maxent.TypedMaxentFeatureEncoding.train(
    train, count_cutoff=3, alwayson_features=True)

classifier = maxent.MaxentClassifier.train(
    train, bernoulli=False, encoding=encoding, trace=0)

classifier.classify_many(test)

TypeError: unhashable type: 'list'

In [64]:
# Getting the data for the connectives classifier

list_of_surface_forms = ['but', 'and', 'also', 'if', 'when', 'while', 'because', 'as', 'after', 'however', 'although', 'then', 'before', 'so', 'though', 'meanwhile', 'for example', 'still', 'since', 'in addition', 'until', 'instead', 'thus', 'yet', 'moreover', 'indeed', 'unless', 'even though', 'for instance', 'later', 'once', 'or', 'even if', 'in fact', 'as a result', 'separately', 'previously', 'if then', 'finally', 'nevertheless', 'on the other hand', 'in turn', 'nor', 'by contrast', 'otherwise', 'nonetheless', 'so that', 'therefore', 'as long as', 'now that']

