# Добавление ролей в исходный набор данных

In [None]:
import csv

# Convert the comma-separated training set into a tab-separated copy.
# newline='' hands line-ending control to the csv module (no spurious blank
# rows on Windows, embedded newlines in quoted fields survive), and the
# explicit UTF-8 encoding matches how train.tsv is read back later in this file.
with open('train.csv', 'r', encoding='utf-8', newline='') as csvin, \
        open('train.tsv', 'w', encoding='utf-8', newline='') as tsvout:
    reader = csv.reader(csvin)
    writer = csv.writer(tsvout, delimiter='\t')
    writer.writerows(reader)

In [None]:
# Convert the comma-separated dev set into a tab-separated copy.
# newline='' hands line-ending control to the csv module (no spurious blank
# rows on Windows, embedded newlines in quoted fields survive), and the
# explicit UTF-8 encoding matches how dev.tsv is read back later in this file.
with open('dev.csv', 'r', encoding='utf-8', newline='') as csvin, \
        open('dev.tsv', 'w', encoding='utf-8', newline='') as tsvout:
    reader = csv.reader(csvin)
    writer = csv.writer(tsvout, delimiter='\t')
    writer.writerows(reader)

In [1]:
import numpy as np
# Load the train/dev arrays that the cells below iterate record by record;
# each record is indexed with the keys 'context' and 'question'.
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary Python
# objects, which can execute code -- only load .npy files from a trusted source.
train_framebank = np.load('train_framebank.npy', allow_pickle=True)
dev_framebank = np.load('dev_framebank.npy', allow_pickle=True)

In [4]:
def print_roles(lemma, role_annot, f):
    """Write the predicate/argument lemmas of every annotated event to ``f``.

    Args:
        lemma: Per-sentence lemma sequences; ``lemma[i][j]`` is the lemma of
            token ``j`` in sentence ``i``.
        role_annot: Per-sentence iterables of events.  Each event exposes
            ``pred`` (its first item indexes the predicate token) and ``args``
            (items with ``tag`` and ``begin`` attributes).
        f: Writable text stream.

    For each event one ``=====Pred: <lemma>`` line is written, followed by one
    ``Arg(<tag>): <lemma>`` line per argument.
    """
    for sentence_idx, events in enumerate(role_annot):
        for event in events:
            predicate = lemma[sentence_idx][event.pred[0]]
            f.write(f'=====Pred: {predicate}\n')
            for argument in event.args:
                head = lemma[sentence_idx][argument.begin]
                f.write(f'Arg({argument.tag}): {head}\n')

In [6]:
from tqdm import tqdm

# Dump every training record as two sections (context, then question).
# Each section is: raw text, space-separated lemmas, the role lines produced
# by print_roles, and a terminating blank line -- the layout that
# create_roles_dictionary parses.
#
# The `with` block guarantees the file is flushed and closed before it is
# parsed later; the explicit encoding matches the utf8 used when reading it.
with open('train_roles.txt', 'w', encoding='utf-8') as f:
    for temp in tqdm(train_framebank):
        context = temp['context']
        question = temp['question']
        for part in (context, question):
            f.write(part['text'] + '\n')
            for sentence in part['lemma']:
                for token in sentence:
                    # A trailing space after every lemma is part of the format.
                    f.write(token + ' ')
            f.write('\n')
            print_roles(part['lemma'], part['srl'], f)
            f.write('\n')

100%|██████████| 45328/45328 [00:02<00:00, 15582.48it/s]


In [None]:
# Dump every dev record as two sections (context, then question).
# Each section is: raw text, space-separated lemmas, the role lines produced
# by print_roles, and a terminating blank line -- the layout that
# create_roles_dictionary parses.
#
# The `with` block guarantees the file is flushed and closed before it is
# parsed later; the explicit encoding matches the utf8 used when reading it.
with open('dev_roles.txt', 'w', encoding='utf-8') as f:
    for temp in tqdm(dev_framebank):
        context = temp['context']
        question = temp['question']
        for part in (context, question):
            f.write(part['text'] + '\n')
            for sentence in part['lemma']:
                for token in sentence:
                    # A trailing space after every lemma is part of the format.
                    f.write(token + ' ')
            f.write('\n')
            print_roles(part['lemma'], part['srl'], f)
            f.write('\n')

In [7]:
import re
import numpy
import csv

# Roles dump parsed by default; switch to the commented value for the dev split.
framebank_file = 'train_roles.txt'
# framebank_file = 'dev_roles.txt'


def substring_after(s, delim):
    """Return the part of ``s`` after the first ``delim`` ('' if it is absent)."""
    return s.partition(delim)[2]


def create_roles_dictionary(framebank_file=framebank_file):
    """Parse a roles dump into per-phrase role and lemma dictionaries.

    The file is a sequence of sections, each laid out as::

        <phrase text>
        <space-separated lemmas>
        =====Pred: <lemma>        (zero or more predicate frames,
        Arg(<role>): <lemma>       each followed by its arguments)
        <blank line>

    Args:
        framebank_file: Path of the UTF-8 encoded dump to parse.

    Returns:
        A ``(phrases_roles, phrases_tokenized_ph)`` tuple:

        * ``phrases_roles`` maps phrase text to ``{frame_number: {lemma: role}}``
          where the predicate lemma carries the role ``'<pred>'`` and frames
          are numbered from 1.  A phrase without predicates maps to ``{0: {}}``.
        * ``phrases_tokenized_ph`` maps phrase text to its lemma line with the
          newline removed.

        A phrase that occurs several times keeps the data of its last section.
    """
    # Strip leading/trailing whitespace and line breaks from an extracted
    # pred/arg lemma.  Raw string: "\s" is not a valid str escape.
    strip_pattern = re.compile(r"^\s+|\n|\r|\s+$")
    pred_delimeter = '=====Pred: '
    arg_prefix = 'Arg('
    arg_delim = ': '
    arg_role_end_char = ')'

    phrases_roles = {}
    phrases_tokenized_ph = {}

    # Line number at which the next phrase/pred/arg/blank line is expected.
    current_line = 1
    # Line number of the lemma line belonging to current_phrase.
    tokenized_phr_line = 2
    # Text of the phrase whose section is being parsed.
    current_phrase = ''
    # Roles of the frame being read, and all finished frames of the phrase.
    roles_in_phrase_map = {}
    roles_in = {}
    pred_num = 0

    with open(framebank_file, encoding="utf8") as fbFile:
        for num, line in enumerate(fbFile, 1):
            if num == current_line:
                # Markers are matched at the start of the line so that phrase
                # text which merely contains "Arg(" is not taken for a role line.
                if line.startswith(pred_delimeter):
                    if pred_num > 0:
                        # A new frame starts: file the previous one.
                        roles_in[pred_num] = roles_in_phrase_map
                        roles_in_phrase_map = {}

                    pred = strip_pattern.sub('', substring_after(line, pred_delimeter))
                    roles_in_phrase_map[pred] = '<pred>'
                    pred_num += 1
                    current_line += 1
                    continue
                elif line.startswith(arg_prefix):
                    arg = strip_pattern.sub('', substring_after(line, arg_delim))
                    arg_role = line[len(arg_prefix):line.find(arg_role_end_char)]
                    roles_in_phrase_map[arg] = arg_role
                    current_line += 1
                    continue
                elif line == '\n':
                    # End of section: store the last frame and reset the state.
                    roles_in[pred_num] = roles_in_phrase_map
                    phrases_roles[current_phrase] = roles_in
                    roles_in_phrase_map = {}
                    roles_in = {}
                    pred_num = 0
                    current_line += 1
                    continue

                # Anything else is the text line that opens a new section; the
                # lemma line follows it and role lines start two lines below.
                tokenized_phr_line = current_line + 1
                current_line += 2
                current_phrase = line.replace("\n", "")

            if num == tokenized_phr_line:
                phrases_tokenized_ph[current_phrase] = line.replace("\n", "")

    return phrases_roles, phrases_tokenized_ph

In [8]:
# Parse the roles dump once; read_tsv_and_find_roles below reads both results
# as module-level globals.  Called without arguments, so the file parsed is
# whatever framebank_file named when create_roles_dictionary was defined.
roles_dictionary, tokenized_ph_dictionary = create_roles_dictionary()

def align_tokens_lenght(tokenized_phrase, phrase_roles):
    """Produce one role label per whitespace-separated token of a phrase.

    Args:
        tokenized_phrase: Space-separated tokens (lemmas) of the phrase.
        phrase_roles: Mapping ``{frame_key: {token: role}}`` for the phrase.
            It is NOT modified, so the same mapping can be reused for every
            row that refers to the same phrase.

    Returns:
        A list with one entry per token: the role found in the first frame
        (in mapping order) that still has an unused entry for that token, or
        ``'<unk>'`` when no frame has one.  Each frame entry is consumed at
        most once per call, so a repeated token does not receive the same
        frame's role twice.
    """
    # Work on per-call copies: consuming entries from the caller's mapping
    # would leave nothing for later rows that share the same phrase.
    remaining = [dict(frame) for frame in phrase_roles.values()]
    roles_tokens = []

    for token in tokenized_phrase.split():
        for frame in remaining:
            if token in frame:
                roles_tokens.append(frame.pop(token))
                break
        else:
            roles_tokens.append('<unk>')
    return roles_tokens


def _find_phrase_annotation(phrase):
    """Return ``(tokenized_phrase, roles)`` for ``phrase`` or ``None``.

    The phrase is looked up verbatim first and then, if it ends with a space
    or a tab, without that last character.  The same key is used for both
    module-level dictionaries so tokens and roles always belong together.
    """
    if phrase is None:
        # Short row: the column is missing altogether.
        return None
    candidates = [phrase]
    if phrase.endswith((' ', '\t')):
        candidates.append(phrase[:-1])
    for key in candidates:
        if key in tokenized_ph_dictionary and key in roles_dictionary:
            return tokenized_ph_dictionary[key], roles_dictionary[key]
    return None


def read_tsv_and_find_roles(file, writer):
    """Annotate every row of a TSV file with per-token role labels.

    Args:
        file: Open text file with a header row and the tab-separated columns
            ``context``, ``question``, ``answer`` and ``answer_start``.
        writer: Object with ``writerow(dict)`` (e.g. ``csv.DictWriter``) that
            receives the keys ``context``, ``context_roles``, ``question``,
            ``question_roles``, ``answer`` and ``answer_start``.

    Rows whose context or question is not present in the module-level
    ``tokenized_ph_dictionary`` / ``roles_dictionary`` are skipped: the
    zero-based row number is printed for an unknown context and the repr of
    the text for an unknown question.

    NOTE(review): the reader uses QUOTE_NONE while the CSV->TSV conversion at
    the top of this file writes with the csv default quoting; fields that were
    quoted on output will not match here -- confirm this is intended.
    """
    file_reader = csv.DictReader(file, delimiter='\t', quoting=csv.QUOTE_NONE)

    for num, row in enumerate(file_reader):
        context_hit = _find_phrase_annotation(row["context"])
        if context_hit is None:
            print(num)
            continue

        question_hit = _find_phrase_annotation(row["question"])
        if question_hit is None:
            print(repr(row["question"]))
            continue

        tokenized_ph1, context_roles = context_hit
        tokenized_ph2, question_roles = question_hit
        roles1 = align_tokens_lenght(tokenized_ph1, context_roles)
        roles2 = align_tokens_lenght(tokenized_ph2, question_roles)

        # Role labels are stored as one comma-separated string per phrase.
        writer.writerow({
            'context': tokenized_ph1,
            'context_roles': ','.join(roles1),
            'question': tokenized_ph2,
            'question_roles': ','.join(roles2),
            'answer': row["answer"],
            'answer_start': row["answer_start"],
        })

In [9]:
test_file = 'train.tsv'
output_file = 'train_roles.tsv'


def convert_tsv_data_to_with_role(train_file=test_file, output_file=output_file):
    """Write a copy of ``train_file`` enriched with role-label columns.

    Args:
        train_file: Path of the UTF-8 tab-separated input file (with header).
        output_file: Path of the tab-separated file to create.  It gets the
            columns ``context``, ``context_roles``, ``question``,
            ``question_roles``, ``answer`` and ``answer_start``.

    Fields are written without any quoting or escaping, so ``csv.Error`` is
    raised if a value contains a tab or a line break.
    """
    fieldnames = ['context', 'context_roles', 'question', 'question_roles',
                  'answer', 'answer_start']

    with open(output_file, mode='w', encoding='utf-8', newline='') as out_handle:
        # quotechar/escapechar must be None, not '': empty strings are
        # rejected with TypeError since Python 3.11.
        output_writer = csv.DictWriter(
            out_handle,
            fieldnames=fieldnames,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,
            quotechar=None,
            escapechar=None,
        )
        output_writer.writeheader()

        with open(train_file, mode='r', encoding='utf-8', newline='') as tsv_file:
            read_tsv_and_find_roles(tsv_file, output_writer)


convert_tsv_data_to_with_role()

2479
3874
22051
27159
28149
33954
35228
37003


In [None]:
# NOTE(review): read_tsv_and_find_roles uses the module-level roles_dictionary;
# presumably it has to be rebuilt from 'dev_roles.txt' before this cell is run
# -- confirm.
test_file = 'dev.tsv'
output_file = 'dev_roles.tsv'


def convert_tsv_data_to_with_role(train_file=test_file, output_file=output_file):
    """Write a copy of ``train_file`` enriched with role-label columns.

    Args:
        train_file: Path of the UTF-8 tab-separated input file (with header).
        output_file: Path of the tab-separated file to create.  It gets the
            columns ``context``, ``context_roles``, ``question``,
            ``question_roles``, ``answer`` and ``answer_start``.

    Fields are written without any quoting or escaping, so ``csv.Error`` is
    raised if a value contains a tab or a line break.
    """
    fieldnames = ['context', 'context_roles', 'question', 'question_roles',
                  'answer', 'answer_start']

    with open(output_file, mode='w', encoding='utf-8', newline='') as out_handle:
        # quotechar/escapechar must be None, not '': empty strings are
        # rejected with TypeError since Python 3.11.
        output_writer = csv.DictWriter(
            out_handle,
            fieldnames=fieldnames,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,
            quotechar=None,
            escapechar=None,
        )
        output_writer.writeheader()

        with open(train_file, mode='r', encoding='utf-8', newline='') as tsv_file:
            read_tsv_and_find_roles(tsv_file, output_writer)


convert_tsv_data_to_with_role()