# DataPreprocessor

This notebook prepares the input data for our project. We received the sense predictions for each parser of the ConLL challenge of 2016 from Te Rutherford (special thanks to him). Unfortunately, the arg1 and arg2 spans of all parsers are the same, so we had to randomize them to generate a "real" parser output. Therefore, we changed in some of the predicted relations connectives, types, arg1 or/and arg2 spans. Moreover, we added additional attributes that identifies the changes and parser names.

In [5]:
import json
import random
from read_write_files import read_json,save_json,get_parser_paths

In [7]:
def add_compare_attr(relation,parser_name="None"):
    # To compare each parser, we add new attributes to the relation dictionaries

    # param relation   Relation of the parser
    # param parser_name Name of the Parser
    
    # return Relation with Connective RawText if not present, 
    # "Change" (if we create a randomized relation we can track which changes were made to the relation) 
    # and "Parser" (parser_name) attribute
    
    new_rel = relation.copy()
    conn = new_rel["Connective"]
    if not conn.has_key("RawText"):
        new_rel["Connective"]["RawText"] = ""
        
    new_rel["Change"] = "None"
    new_rel["Parser"] = parser_name
    
    return new_rel

In [11]:
class Preprocessor:
    # Class for preprocessing the relations e.g by adding additional attributes or randomize relations to 
    # create simulated parser outputs (because we only had access to the parser outputs for the sense challenge)
    
    
    def __init__(self,parsers,gold_path):
        
        self.parser_preds = []
        for name,path in parsers:
            self.parser_preds += [{
                "parser":name,
                "relations": [add_compare_attr(rel,parser_name=name) for rel in read_json(path)]
            }]
        
        self.gold_rel = [add_compare_attr(rel,parser_name="Gold") for rel in read_json(gold_path)]
        self.get_unique_attr(self.gold_rel)
    
    
    def get_unique_attr(self,relations):
        # Get the unique values for connectives, parser_names and types of relations

        # param relations   Relations of the parser
        
        # return class variables with the unique values
        
        self.unique_connect = set()
        self.unique_sense = set()
        self.unique_type = set()
        
        for rel_dic in relations:
            self.unique_connect.update([rel_dic["Connective"]["RawText"]])
            self.unique_sense.update(rel_dic["Sense"])
            self.unique_type.update([rel_dic["Type"]])
            
        self.unique_connect = self.unique_connect.difference([""])
        
    def randomize_rel(self,relation,change):
        # Change a specific value of the relation 

        # param relation   Relation of a parser
        # param change   which attribute should be changed
        
        # return relation with one changed attribute
        relation["Change"] = change
        
        if change == "Arg1" or change == "Arg2":
            numb_add_tok = random.randint(0,10)
            last_tok = relation[change]["TokenList"][-1]
            add_tok = [i for i in range(last_tok+1,last_tok+numb_add_tok)]
            relation[change]["TokenList"] += add_tok
        elif change == "Args":
            relation = self.randomize_rel(relation,"Arg1")
            relation = self.randomize_rel(relation,"Arg2")
        elif change == "Type":
            if relation[change] == "Explicit":
                types = self.unique_type.copy()
                types = types.difference(["Explicit"])
                relation[change] = random.sample(types,1)[0]
                relation["Connective"]["RawText"] = ""
                relation["Connective"]["TokenList"] = []
            else:
                types = self.unique_type.copy()
                types = types.difference(["Explicit"])
                relation[change] = random.sample(types,1)[0]
                if relation[change] == "Explicit":
                    relation = self.change_connective(relation)
                        
        elif change == "Connective":
            relation = self.change_connective(relation)
        
        return relation
    
    def change_connective(self,relation):
        arg1 = relation["Arg1"]["TokenList"]
        arg2 = relation["Arg2"]["TokenList"]
        args = arg1+arg2
        relation["Connective"]["RawText"] = random.sample(self.unique_connect,1)[0]
        relation["Connective"]["TokenList"] = [random.randint(min(args),max(args))]
        
        return relation
    
    def create_randomized_parser_output(self,parser_name):
        # Create a randomized parser output, that should behave like a real parser prediction

        # param parser_name   name of a parser
        
        # return class variables with the unique values
        parser_rel = [
            parser_pred["relations"] 
            for parser_pred in self.parser_preds 
            if parser_pred["parser"] == parser_name][0]
        
        select_len = int(len(parser_rel)*(random.randrange(80,100,5))/100)
        selected_rel = random.sample(parser_rel,select_len)[:]
            
        
        change_len = int(select_len*0.6)
        change_rel_ind = random.sample(range(change_len),change_len)
        
        changes = ["Arg1","Arg2","Args","Type","Connective"] 
        for ind,rel in enumerate(selected_rel):
            if ind in change_rel_ind:
                change = random.sample(changes,1)[0]
                rel = self.randomize_rel(rel,change)
                
        return selected_rel
        

## Test of Randomization

This shows the process of radnomization, which we use to generate real parser outputs due to the absence of real data for arg1/arg2 and connectives. Nevertheless, the senses are originally from the conll2016 challenge.

In [100]:
changes = ["Arg1","Arg2","Args","Type","Connective"] 
changes = ["Type","Connective"] 
change = random.sample(changes,1)[0]
print(preprocessor.parser_rel[0])
print(change)
after = randomizer.randomize_rel(randomizer.parser_rel[0].copy(),change)
print(after)

{'Arg2TokenList': [42, 43, 44, 45, 46, 47, 48, 49, 50, 51], 'DocID': u'wikinews_101184', 'Parser': 'Pred', 'Connective': {u'RawText': '', u'TokenList': []}, 'Arg1TokenList': [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'Sense': [u'Temporal.Asynchronous.Succession'], 'Type': u'Explicit'}
Connective
Connective
{'Arg2TokenList': [42, 43, 44, 45, 46, 47, 48, 49, 50, 51], 'Arg1TokenList': [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'DocID': u'wikinews_101184', 'Sense': [u'Temporal.Asynchronous.Succession'], 'Parser': 'Pred', 'Type': u'Explicit', 'Connective': {u'RawText': u'As', u'TokenList': [36]}}


## Save (randomized) Training Sets

We tokk

In [2]:
gold_path = "data/gold_standard/test/relations.json"

In [3]:
path = "data/submissions/sense_only/test/"
parsers = get_parser_paths(path)

In [13]:
preprocessor = Preprocessor(parsers,gold_path)
for name,pars_path in parsers: 
    rels = preprocessor.create_randomized_parser_output(name)
    save_json(rels,"data/submissions/randomized/test/"+name+".json")

In [136]:
save_parser_rel(preprocessor.gold_rel,"data/gold_standard/test/gold.json")

## Save Test Sets

In [21]:
gold_test_path = "data/gold_standard/blind/relations.json"

In [22]:
path = "data/submissions/sense_only/blind/"
parsers = get_parser_paths(path)

In [23]:
preprocessor = Preprocessor(parsers,gold_test_path)
for name,pars_path in parsers: 
    rels = preprocessor.create_randomized_parser_output(name)
    save_json(rels,"data/submissions/randomized/blind/"+name+".json")

In [24]:
save_json(preprocessor.gold_rel,"data/gold_standard/blind/gold.json")