In [1]:
from pathlib import Path
import csv
import re

In [2]:
def normalize_pred_multi_value_feats(feats: str) -> str:
    """Expand comma-joined two-value features into repeated ``name=value`` pairs.

    A feature written as ``gen=F,M`` becomes ``gen=F|gen=M``. Only pairs of
    single-character values separated by one comma are expanded; everything
    else in the string is returned untouched.

    Args:
        feats: ``|``-separated feature string, e.g. ``'gen=F,M|num=S'``.

    Returns:
        The feature string with every ``name=a,b`` rewritten as
        ``name=a|name=b``.
    """
    # The feature name must not be allowed to span a '|' separator. Without
    # that restriction the captured prefix swallows the preceding feature (or
    # the separator itself) and the substitution duplicates it, e.g.
    # 'num=S|gen=F,M' -> 'num=S|gen=F|S|gen=M'.
    p = re.compile(r'([^=|]+=)(.),(.)')
    return p.sub(r'\1\2|\1\3', feats)

In [3]:
def normalize_pred_toinfinitive(tag: str, feats: str) -> (str, str):
    """Move a ``tense=TOINFINITIVE`` feature out of the features and into the tag.

    When ``feats`` contains ``tense=TOINFINITIVE`` the tag gets a
    ``_TOINFINITIVE`` suffix and the whole feature string is replaced by
    ``'_'`` (any other features present are dropped as well). Otherwise both
    values are returned unchanged.

    Args:
        tag: Predicted tag.
        feats: ``|``-separated feature string, or ``'_'``.

    Returns:
        A ``(tag, feats)`` tuple.
    """
    # A plain substring test is equivalent to the former regex substitution
    # (the result was always '_' on a hit) and avoids the invalid '\|' escape
    # that the non-raw pattern string contained.
    if 'tense=TOINFINITIVE' in feats:
        return f'{tag}_TOINFINITIVE', '_'
    return tag, feats

In [4]:
def normalize_pred_noun_suffix(tag: str, feats: str) -> (str, str):
    """Append ``_S_PP`` to the tag when the features mention a ``suf_`` entry.

    Args:
        tag: Predicted tag.
        feats: Feature string; it is returned as-is in every case.

    Returns:
        A ``(tag, feats)`` tuple, where the tag carries the ``_S_PP`` suffix
        iff ``'suf_'`` occurs anywhere in ``feats``.
    """
    if 'suf_' not in feats:
        return tag, feats
    return f'{tag}_S_PP', feats

In [5]:
def format_pred_row(entry: dict, cur_sent_id: int, normalize: bool) -> (list, int):
    """Turn one prediction CSV record into a lattice row.

    Args:
        entry: Record with the keys ``sent_id``, ``from_node_id``,
            ``to_node_id``, ``form``, ``lemma``, ``tag``, ``feats`` and
            ``token_id``.
        cur_sent_id: Id of the sentence currently being emitted.
        normalize: When true, run the prediction normalizers on tag/feats.

    Returns:
        ``(row, sent_id)`` where ``sent_id`` is the record's own sentence id.
        ``row`` is an empty list when that id differs from ``cur_sent_id``
        (the record itself is not formatted in that case); otherwise it is
        ``[from_id, to_id, form, lemma, ctag, tag, feats, token_id]``.
    """
    row_sent_id = int(entry['sent_id'])
    if row_sent_id != cur_sent_id:
        return [], row_sent_id

    # Both tag columns (ctag and tag) are filled from the single 'tag' field.
    columns = ('from_node_id', 'to_node_id', 'form', 'lemma', 'tag', 'tag', 'feats', 'token_id')
    lattice_row = [entry[column] for column in columns]

    if normalize:
        tag, feats = lattice_row[5], lattice_row[6]
        feats = normalize_pred_multi_value_feats(feats)
        tag, feats = normalize_pred_toinfinitive(tag, feats)
        tag, feats = normalize_pred_noun_suffix(tag, feats)
        # Only the fine-grained tag and the features are rewritten; ctag keeps
        # the raw predicted tag.
        lattice_row[5], lattice_row[6] = tag, feats
    return lattice_row, row_sent_id

In [6]:
def pred_to_lattice(file_path: Path, normalize: bool) -> list:
    """Read a prediction CSV and convert it into a list of lattice rows.

    Sentences are separated by an empty list: whenever a record's ``sent_id``
    differs from the running sentence id (which starts at 1), an empty row is
    appended first and the record is then formatted under its own id.

    Args:
        file_path: Path of the prediction CSV (with a header line).
        normalize: Forwarded to ``format_pred_row``.

    Returns:
        List of lattice rows (lists of strings), with ``[]`` between sentences.
    """
    # Read everything inside a context manager so the handle is closed
    # deterministically; newline='' is what the csv module expects of files
    # handed to its readers.
    with open(str(file_path), newline='') as f:
        csv_rows = list(csv.DictReader(f))
    sent_id = 1
    lattice_rows = []
    for row in csv_rows:
        lattice, sent_id = format_pred_row(row, sent_id, normalize)
        lattice_rows.append(lattice)
        if not lattice:
            # Sentence boundary: the empty list just appended is the
            # separator; sent_id now holds the new sentence id, so formatting
            # the same record again yields its actual row.
            lattice, sent_id = format_pred_row(row, sent_id, normalize)
            lattice_rows.append(lattice)
    return lattice_rows

In [7]:
def save_pred_lattices(input_file_path: Path, output_file_path: Path, normalize: bool):
    """Convert a prediction CSV to lattice format and write it to disk.

    Each lattice row becomes one tab-separated line; empty rows (sentence
    separators) become blank lines.

    Args:
        input_file_path: Prediction CSV to read.
        output_file_path: Destination lattice file (overwritten).
        normalize: Forwarded to ``pred_to_lattice``.
    """
    lattice_rows = pred_to_lattice(input_file_path, normalize)
    with open(output_file_path, 'w') as out_file:
        out_file.writelines('\t'.join(fields) + '\n' for fields in lattice_rows)

In [8]:
def save_pred_dir(dir_path: Path, norm_type: str):
    """Write lattice files for the test and dev predictions found in a directory.

    For each of ``test`` and ``dev`` (in that order), reads
    ``<split>_samples.csv`` from ``dir_path`` and writes
    ``<split>-hebtb-pred-<norm_type>.lattices`` next to it.

    Args:
        dir_path: Directory holding the prediction CSV files.
        norm_type: Label used in the output file name; normalization is
            applied only when it equals ``'dep'``.
    """
    apply_normalization = (norm_type == 'dep')
    for split in ('test', 'dev'):
        samples_path = dir_path / f'{split}_samples.csv'
        lattices_path = dir_path / f'{split}-hebtb-pred-{norm_type}.lattices'
        save_pred_lattices(samples_path, lattices_path, normalize=apply_normalization)

In [9]:
def normalize_gold_multi_value_feats(feats: str, feat_name: str) -> str:
    """Collapse an adjacent repeated feature into one comma-joined feature.

    ``gen=M|gen=F`` becomes ``gen=F,M`` (the two single-character values are
    sorted). Strings without such an adjacent pair are returned unchanged.

    Args:
        feats: ``|``-separated feature string.
        feat_name: Name of the feature to collapse, e.g. ``'gen'`` or ``'num'``.

    Returns:
        The feature string with each ``name=a|name=b`` pair merged.
    """
    # Capture the two values with groups instead of indexing the matched text
    # at a fixed offset, which was only right for 3-character feature names.
    name = re.escape(feat_name)
    p = re.compile(rf'{name}=(.)\|{name}=(.)')

    def merge(m):
        # Each match is merged from its own values.
        return f'{feat_name}=' + ','.join(sorted(m.groups()))

    return p.sub(merge, feats)

In [10]:
def normalize_gold_toinfinitive(tag: str, feats: str) -> (str, str):
    """Move a ``_TOINFINITIVE`` tag marker into the features as a tense value.

    Args:
        tag: Gold tag, possibly carrying the ``_TOINFINITIVE`` marker.
        feats: ``|``-separated feature string, or ``'_'`` for none.

    Returns:
        ``(tag, feats)``. When the marker is present, the tag is shortened by
        the marker's length and ``tense=TOINFINITIVE`` is added to the
        features (replacing a bare ``'_'``). Otherwise both are unchanged.
    """
    marker = '_TOINFINITIVE'
    if marker not in tag:
        return tag, feats

    # NOTE(review): the check is a substring test, but the slice always drops
    # the last len(marker) characters — this presumes the marker is a suffix.
    base_tag = tag[:-len(marker)]
    tense = 'tense=TOINFINITIVE'
    merged_feats = tense if feats == '_' else f'{feats}|{tense}'
    return base_tag, merged_feats

In [11]:
def normalize_gold_noun_suffix(tag: str, feats: str) -> (str, str):
    """Drop the ``_S_PP`` marker from a gold tag.

    Args:
        tag: Gold tag, possibly carrying the ``_S_PP`` marker.
        feats: Feature string; returned as-is.

    Returns:
        ``(tag, feats)`` with the tag shortened by the marker's length when
        the marker occurs in it.
    """
    marker = '_S_PP'
    # NOTE(review): substring test combined with a suffix-length slice —
    # presumes the marker sits at the end of the tag.
    stripped_tag = tag[:-len(marker)] if marker in tag else tag
    return stripped_tag, feats

In [12]:
def format_gold_row(row: list, normalize: bool) -> list:
    """Rebuild one gold lattice row, optionally normalizing tag and features.

    Args:
        row: Exactly eight fields:
            ``from_id, to_id, form, lemma, ctag, tag, feats, token_id``.
        normalize: When true, merge repeated ``gen``/``num`` features and move
            the ``_TOINFINITIVE`` / ``_S_PP`` tag markers out of the tag.

    Returns:
        A new eight-element list in the same column order.

    Raises:
        ValueError: If ``row`` does not unpack into eight fields.
    """
    from_id, to_id, form, lemma, ctag, tag, feats, token_id = row
    if normalize:
        for feat_name in ('gen', 'num'):
            feats = normalize_gold_multi_value_feats(feats, feat_name)
        # The tag normalizers run in order: to-infinitive first, then noun suffix.
        for normalize_tag in (normalize_gold_toinfinitive, normalize_gold_noun_suffix):
            tag, feats = normalize_tag(tag, feats)
    return [from_id, to_id, form, lemma, ctag, tag, feats, token_id]

In [13]:
def gold_to_lattice(file_path: Path, normalize: bool) -> list:
    """Read a tab-separated gold lattice file into a list of rows.

    Blank lines in the input are kept as empty lists, so sentence boundaries
    survive the round trip; every other row goes through ``format_gold_row``.

    Args:
        file_path: Path of the gold lattice file.
        normalize: Forwarded to ``format_gold_row``.

    Returns:
        List of rows (lists of strings), with ``[]`` for each blank line.
    """
    # Use a context manager so the handle is closed deterministically;
    # newline='' is what the csv module expects of files handed to its readers.
    with open(str(file_path), newline='') as f:
        # quotechar is '|' as in the original reader configuration.
        tsv_reader = csv.reader(f, delimiter='\t', quotechar='|')
        return [format_gold_row(row, normalize) if row else row for row in tsv_reader]

In [14]:
def save_gold_lattices(input_file_path: Path, output_file_path: Path, normalize: bool):
    """Rewrite a gold lattice file, optionally normalizing its rows.

    Each row becomes one tab-separated line; empty rows (blank input lines)
    become blank lines.

    Args:
        input_file_path: Gold lattice file to read.
        output_file_path: Destination lattice file (overwritten).
        normalize: Forwarded to ``gold_to_lattice``.
    """
    lattice_rows = gold_to_lattice(input_file_path, normalize)
    with open(output_file_path, 'w') as out_file:
        out_file.writelines('\t'.join(fields) + '\n' for fields in lattice_rows)

In [15]:
def save_norm_gold_dir(dir_path: Path):
    """Write normalized gold lattice files for the test and dev splits.

    For each of ``test`` and ``dev`` (in that order), reads
    ``<split>.hebtb.lgold.lattices`` from ``dir_path`` and writes
    ``<split>-hebtb-gold-morph.lattices`` next to it with normalization on.

    Args:
        dir_path: Directory holding the gold lattice files.
    """
    for split in ('test', 'dev'):
        gold_file_path = dir_path / f'{split}.hebtb.lgold.lattices'
        gold_lattice_file_path = dir_path / f'{split}-hebtb-gold-morph.lattices'
        save_gold_lattices(gold_file_path, gold_lattice_file_path, normalize=True)

In [16]:
# Run configuration: selects which model's prediction directories get converted.
# The commented-out assignments are alternative settings kept for switching by hand.
# bert_type = 'basic'
bert_type = 'small'
# bert_model = f'bert-{bert_type}-wordpiece-owt-52000-10'
bert_model = f'bert-{bert_type}-wordpiece-oscar-52000-10'
# bert_model = 'mbert'
# bert_model = 'hebert'
treebank = 'HebrewTreebank'
tb = 'hebtb'
# Relative path: resolved against the kernel's current working directory.
root_dir = Path('experiments/morph')
# Every prediction type gets un-normalized 'morph' lattices; only
# 'seg_tag_feats' additionally gets the normalized 'dep' lattices
# (save_pred_dir normalizes only when norm_type == 'dep').
for pred_type in ['seg_only', 'seg_tag', 'seg_tag_feats']:
    pred_dir_path = root_dir / f'morph_{pred_type}' / 'bert' / bert_type / 'wordpiece' / bert_model / treebank / tb
    save_pred_dir(pred_dir_path, norm_type='morph')
    if pred_type == 'seg_tag_feats':
        save_pred_dir(pred_dir_path, norm_type='dep')

In [17]:
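# Disabled: regenerates the normalized gold lattices. To run it, uncomment the two
# lines below and point gold_dir_path at the directory that holds the
# *.hebtb.lgold.lattices files (the path shown is machine-specific).
# gold_dir_path = Path('/Users/Amit/dev/onlplab/yapproj/data/hebtb')
# save_norm_gold_dir(gold_dir_path)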