In [20]:
import json
import os
import pandas as pd
import pickle as pkl

In [39]:
DEEPBANK_PATH = './deepbank1.1/'
PENN_TREEBANK_PATH = './ptb/nw/wsj'
SEMLINK_FILE = './semlink/other_resources/1.2.2c.okay'
OUT_MAPPINGS_FILE = './sl_mappings.csv'
OUT_SENTENCES_FILE = './sentences.csv'
OUT_TREE_FILE = './trees.csv'

In [3]:
with open(SEMLINK_FILE) as f:
    file = f.readlines()
len(file)

74977

In [34]:
mappings = []
with open(SEMLINK_FILE) as f:
    file = f.readlines()
for line in file:
    contents = line.split()
    sem_map = {}
    sem_map['section_id'] = str(int(contents[0].split('/')[-2]))
    sem_map['doc_id'] = str(int(contents[0].split('/')[-1].split('_')[1].split('.')[0]))
    sem_map['sentence_id'] = str(int(contents[1]))
    sem_map['token_id'] = str(int(contents[2]))
    sem_map['is_gold'] = contents[3]
    sem_map['vb_form'] = contents[4]
    sem_map['vn_class_index'] = contents[5]
    sem_map['fn_frame'] = contents[6]
    sem_map['pb_sense'] = contents[7]
    sem_map['not_sure_0'] = contents[8]
    sem_map['not_sure_1'] = contents[9]
    sem_map['augmentations'] = contents[10:]
    
    mappings.append(sem_map)

In [35]:
pd.DataFrame(mappings).sort_values(by=['section_id', 'doc_id', 'sentence_id']).reset_index(drop=True).to_csv(OUT_MAPPINGS_FILE, index=False)

In [2]:
pb_vn_mappings = json.load(open("semlink/instances/pb-vn2.json"))
vn_fn_mappings = json.load(open("semlink/instances/vn-fn2.json"))

In [4]:
len(pb_vn_mappings)

4177

In [5]:
len(vn_fn_mappings)

1681

In [17]:
vn_fn_mappings['26.1-1-make']

['Cooking_creation',
 'Intentionally_create',
 'Building',
 'Cause_change',
 'Manufacturing']

In [19]:
pb_vn_mappings['recognize.01']

KeyError: 'recognize.01'

In [1]:
with open('./deepbank1.1/wsj00a/20001001') as f:
    file = f.read()

In [32]:
file.split('\n\n')[8]

'{\n  proper_q<0:28> BV named<7:14>(Vinken)  \n  compound<0:14> ARG1 named<7:14>(Vinken)  \n  compound<0:14> ARG2 named<0:6>(Pierre)  \n  proper_q<0:6> BV named<0:6>(Pierre)  \n  measure<15:23> ARG1 _old_a_1<24:28>  \n  measure<15:23> ARG2 _year_n_1<18:23>  \n  udef_q<15:23> BV _year_n_1<18:23>  \n  card<15:17>(61) ARG1 _year_n_1<18:23>  \n  _old_a_1<24:28> ARG1 named<7:14>(Vinken)  \n  _join_v_1<34:38> ARG1 named<7:14>(Vinken)  \n  _join_v_1<34:38> ARG2 _board_n_of<43:48>  \n  _the_q<39:42> BV _board_n_of<43:48>  \n  _as_p<49:51> ARG1 _join_v_1<34:38>  \n  _as_p<49:51> ARG2 _director_n_of<67:75>  \n  _a_q<52:53> BV _director_n_of<67:75>  \n  _nonexecutive/jj_u_unknown<54:66> ARG1 _director_n_of<67:75>  \n  loc_nonsp<76:84> ARG1 _join_v_1<34:38>  \n  loc_nonsp<76:84> ARG2 dofm<81:84>(29)  \n  def_explicit_q<76:80> BV dofm<81:84>(29)  \n  of_p<76:80> ARG1 dofm<81:84>(29)  \n  of_p<76:80> ARG2 mofy<76:80>(Nov)  \n  def_implicit_q<76:80> BV mofy<76:80>(Nov)  \n  compound<0:14> SF prop  \n

In [11]:
os.listdir('./deepbank1.1/')

<function list.sort(*, key=None, reverse=False)>

In [36]:
# sentence id start with 1 here, 0 in okay
docs = []

for folder_name in os.listdir(DEEPBANK_PATH):
    if folder_name[0] != '.':
        for file_name in os.listdir(os.path.join(DEEPBANK_PATH, folder_name)):
            if file_name[0] != '.':
                # print(file_name)
                with open(os.path.join(DEEPBANK_PATH, folder_name, file_name), 'r') as f: # open in readonly mode
                    contents = f.read().split('\n\n')
                    doc = {}
                    doc['section_id'] = str(int(folder_name[3:5]))
                    doc['doc_id'] = str(int(file_name[1:5]))
                    doc['sentence_id'] = str(int(file_name[5:]))
                    doc['sentence'] = contents[1].split('`')[1].split('\'')[0]
                    doc['constituency'] = contents[5]
                    doc['eds'] = contents[7]
                    doc['dependency'] = contents[8]
                    
                    docs.append(doc)

In [37]:
pd.DataFrame(docs).sort_values(by=['section_id', 'doc_id', 'sentence_id']).reset_index(drop=True).to_csv(OUT_SENTENCES_FILE, index=False)

In [30]:
trees = []

for section_id in os.listdir(PENN_TREEBANK_PATH):
    for file_name in os.listdir(os.path.join(PENN_TREEBANK_PATH, section_id)):
        with open(os.path.join(PENN_TREEBANK_PATH, section_id, file_name), 'r') as f: # open in readonly mode
            contents = f.read().split('\n\n')
            for t, i in zip(contents, range(len(contents))):
                my_dict = {}
                my_dict['section_id'] = str(int(section_id))
                my_dict['doc_id'] = str(int(file_name.split('.')[0][-2:]))
                my_dict['sentence_id'] = str(int(i))
                my_dict['tree'] = t
                
                trees.append(my_dict)


In [41]:
pd.DataFrame(trees).sort_values(by=['section_id', 'doc_id', 'sentence_id']).reset_index(drop=True).to_csv(OUT_TREE_FILE, index=False)