In [7]:
import json
import os

# Read the dataset split: a JSON Lines file with one meeting object per line.
# Please set `data_path` to the location of your data.
split = 'test'
data_path = 'ALL/jsonl/' + split + '.jsonl'
data = []
# Decode explicitly as UTF-8 (the standard JSON encoding) rather than relying
# on the platform default: the transcripts contain non-ASCII characters.
with open(data_path, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads
        # would otherwise reject with a JSONDecodeError.
        if not line.strip():
            continue
        data.append(json.loads(line))
n_meetings = len(data)
print('Total {} meetings in the {} set.'.format(n_meetings, split))

Total 35 meetings in the test set.


In [2]:
from nltk import word_tokenize
# tokenize a sentence
def tokenize(sent):
    """Lower-case ``sent``, split it with ``word_tokenize`` and return the
    resulting tokens joined by single spaces as one string."""
    lowered = sent.lower()
    pieces = word_tokenize(lowered)
    return ' '.join(pieces)

In [3]:
# remove transcription noise markers and normalise spelled-out acronyms
def clean_data(text):
    """Return ``text`` after applying a fixed sequence of literal substitutions.

    Brace-delimited markers (vocalsound, disfmarker, pause, nonvocalsound,
    gap) are removed together with the single space that follows them, and
    underscore-spelled acronyms are collapsed (e.g. ``a_m_i_`` -> ``ami``).
    The substitutions are applied one after another in the order listed.
    """
    substitutions = (
        ('{ vocalsound } ', ''),
        ('{ disfmarker } ', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{ pause } ', ''),
        ('{ nonvocalsound } ', ''),
        ('{ gap } ', ''),
    )
    cleaned = text
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned

In [8]:
# process data for BART
# the input of the model here is the entire content of the meeting
bart_data = []
for meeting in data:
    # Render each turn as "<turn_seperator>speaker: tokenized content".
    # The marker's spelling is part of the emitted data and is kept unchanged.
    turns = [
        '<turn_seperator>' + turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    ]
    src = ' '.join(turns)
    # General queries first, then specific ones; both kinds are paired with
    # the full meeting transcript, so one loop handles the two lists.
    for list_key in ('general_query_list', 'specific_query_list'):
        for item in meeting[list_key]:
            query = tokenize(item['query'])
            bart_data.append({
                'src': clean_data('<s> ' + query + ' </s> ' + src + ' </s>'),
                'tgt': tokenize(item['answer']),
            })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data), split))
print(bart_data[2])
out_path = 'processed_data_turn_split/' + split + '.jsonl'
# Create the output directory if needed so a fresh run does not fail with
# FileNotFoundError when opening the file for writing.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, 'w') as f:
    # One JSON object per line (JSON Lines).
    for pair in bart_data:
        print(json.dumps(pair), file=f)

Total 281 query-summary pairs in the test set
{'src': "<s> what did barry hughes think about the legal framework when talking about the efficacy of the law ? </s> <turn_seperator>lynne neagle am: good afternoon , everyone . welcome to the children , young people and education committee . we 've received apologies for absence from hefin david and jack sargeant . vikki howells is substituting for jack sargeant . so , vikki , welcome ; it 's good to see you in the committee . item 2 this afternoon is our eleventh evidence session on the children ( abolition of defence of reasonable punishment ) ( wales ) bill . i 'm very pleased to welcome barry hughes , who is chief crown prosecutor for wales ; kwame biney , who is senior policy advisor , cps ; and iwan jenkins , who is head of the complex casework unit , crown prosecution service cymru wales . so thank you all for attending this afternoon . we 're really looking forward to hearing your views on the bill . if you 're happy , we 'll go st

In [22]:
# process data for BART
# the input of the model here is the gold span corresponding to each query
# (general queries carry no span here and are paired with the entire meeting)
bart_data_gold = []
for meeting in data:
    # Tokenize every turn once per meeting as "speaker: tokenized content";
    # the list is reused for the full transcript and for every gold span
    # instead of re-tokenizing the same turns for each query.
    turns = [
        turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    ]
    entire_src = ' '.join(turns)
    for item in meeting['general_query_list']:
        query = tokenize(item['query'])
        bart_data_gold.append({
            'src': clean_data('<s> ' + query + ' </s> ' + entire_src + ' </s>'),
            'tgt': tokenize(item['answer']),
        })
    for item in meeting['specific_query_list']:
        query = tokenize(item['query'])
        src = []
        # get the content in the gold span for each query
        for span in item['relevant_text_span']:
            assert len(span) == 2
            # Span bounds are inclusive turn indices.
            st, ed = int(span[0]), int(span[1])
            # Index turn by turn (not by slice) so an out-of-range span still
            # raises IndexError instead of being silently truncated.
            src.extend(turns[k] for k in range(st, ed + 1))
        src = ' '.join(src)
        bart_data_gold.append({
            'src': clean_data('<s> ' + query + ' </s> ' + src + ' </s>'),
            'tgt': tokenize(item['answer']),
        })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data_gold), split))
print(bart_data_gold[2])
# NOTE(review): this yields e.g. "test._gold.jsonl" (dot before the underscore),
# which looks unintended; kept unchanged in case downstream code expects it.
out_path = 'bart_gold/' + split + '._gold.jsonl'
# Create the output directory if needed so a fresh run does not fail with
# FileNotFoundError when opening the file for writing.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, 'w') as f:
    # One JSON object per line (JSON Lines).
    for pair in bart_data_gold:
        print(json.dumps(pair), file=f)

Total 281 query-summary pairs in the test set
{'src': "<s> what did barry hughes think about the legal framework when talking about the efficacy of the law ? </s> barry hughes: perfectly happy . sian gwenllian am: thank you very much . i would like to start just by looking in general at how the law currently stands , and how do you think the law as it currently stands today , and specifically in terms of reasonable punishment—how does that protect children . barry hughes: sorry , can i just be clear ? how does the law as it presently stands protect children ? sian gwenllian am: yes . barry hughes: we have a range of offences created by the criminal law , going back to the offences against the person act 1861 in the middle of the century before last , which provide for offences of assault against a variety of people , including , in particular , acts such as the children and young persons act 1933 , which provides for offences that are specific to children . but the more general crimina