In [None]:
!tar -xvf ./data/hebtb.tar.gz

In [None]:
!pwd

In [1]:
import pandas as pd
import numpy as np
import csv
from tqdm import trange

## Files 

In [2]:
filepath_spmrl_dev = './data/spmrl-treebank/dev_hebtb-gold.conll'
filepath_spmrl_train = './data/spmrl-treebank/train_hebtb-gold.conll'
filepath_spmrl_test = './data/spmrl-treebank/test_hebtb-gold.conll'

filepath_ud_dev = './data/ud-treebank/he_htb-ud-dev.conllu'
filepath_ud_train = './data/ud-treebank/he_htb-ud-train.conllu'
filepath_ud_test = './data/ud-treebank/he_htb-ud-test.conllu'


## DF Preparation

In [None]:
def suit_for_pandas(filepath):
    treebank = []
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, names=columns, na_filter=False, quoting=csv.QUOTE_NONE)
    except:
        with open(filepath, 'r') as source:
            for line in source.readlines():
                if len(line.split('\t')) == 10:
                    treebank.append(tuple(line.strip().split('\t')))
                elif len(line.split('\t')) == 1:
                    treebank.append((line.strip(), '', '', '', '', '', '', '', '', ''))
            df = pd.DataFrame(data=treebank, columns=columns)
    return df


In [5]:
def create_df_from_conll_file(filepath):
    treebank = []
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, names=columns, na_filter=False, quoting=csv.QUOTE_NONE)
    except:
        with open(filepath, 'r') as source:
            for line in source.readlines():
                if len(line.split('\t')) == 10:
                    treebank.append(tuple(line.strip().split('\t')))
                elif len(line.split('\t')) == 1:
                    treebank.append((line.strip(), '', '', '', '', '', '', '', '', ''))
            df = pd.DataFrame(data=treebank, columns=columns)
    return df

In [None]:
ud_dev = suit_for_pandas(filepath_ud_dev)
spmrl_dev = suit_for_pandas(filepath_spmrl_dev)

In [6]:
ud_dev = create_df_from_conll_file(filepath_ud_dev)
spmrl_dev = create_df_from_conll_file(filepath_spmrl_dev)

In [8]:
spmrl_dev

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
0,1,עשרות,עשר,CDT,CDT,gen=F|num=P,2,num,_,_
1,2,אנשים,איש,NN,NN,gen=M|num=P,3,subj,_,_
2,3,מגיעים,הגיע,BN,BN,gen=M|num=P|per=A,0,ROOT,_,_
3,4,מ,מ,PREPOSITION,PREPOSITION,_,3,prepmod,_,_
4,5,תאילנד,תאילנד,NNP,NNP,_,4,pobj,_,_
5,6,ל,ל,PREPOSITION,PREPOSITION,_,3,prepmod,_,_
6,7,ישראל,ישראל,NNP,NNP,_,6,pobj,_,_
7,8,כש,כש,TEMP,TEMP,_,3,comp,_,_
8,9,הם,הוא,PRP,PRP,gen=M|num=P|per=3,10,subj,_,_
9,10,נרשמים,נרשם,BN,BN,gen=M|num=P|per=A,14,conj,_,_


In [None]:
ud_train = suit_for_pandas(filepath_ud_train)
spmrl_train = suit_for_pandas(filepath_spmrl_train)

In [None]:
ud_test = suit_for_pandas(filepath_ud_test)
spmrl_test = suit_for_pandas(filepath_spmrl_test)

In [9]:
ud_dev['sent_id'] = ''
spmrl_dev['sent_id'] = ''
spmrl_dev['ID'] = spmrl_dev['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)


In [None]:
ud_train['sent_id'] = ''
spmrl_train['sent_id'] = ''
spmrl_train['ID'] = spmrl_train['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)


In [None]:
ud_test['sent_id'] = ''
spmrl_test['sent_id'] = ''
spmrl_test['ID'] = spmrl_test['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)

In [None]:
spmrl_treebank = pd.concat([spmrl_dev, spmrl_train, spmrl_test], ignore_index=True)
spmrl_treebank['sent_id'] = ''
spmrl_treebank['ID'] = spmrl_treebank['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)

In [None]:
spmrl_train[(spmrl_train['DEPREL'] == 'posspmod') & (spmrl_train['XPOS'] != 'POS')]

In [None]:
spmrl_treebank

In [25]:
def sentence_id(df, tb):
    if tb=='ud':
        sent_id = 0
        for i, row in df.iterrows():
            if '# sent_id' in row['ID']:
                sent_id += 1
            else:
                df.at[i, 'sent_id'] = sent_id
    elif tb=='spmrl':
        sent_id = 1
        for i, row in df.iterrows():
            if row['ID'] == 0:
                df.at[i, 'sent_id'] = 0 
            try:
                if df.loc[i]['ID'] > df.loc[i-1]['ID']:
                    df.at[i, 'sent_id'] = sent_id
                elif type(df.loc[i-1]['ID']) == str:
                    continue
                else:
                    sent_id += 1
                    df.at[i, 'sent_id'] = sent_id
            except KeyError as e:
                df.at[i, 'sent_id'] = 1
            except TypeError as e:
                print(df.loc[i]['ID'],df.loc[i-1]['ID'], e)
            except ValueError as e:
                print(df.loc[i]['ID'] > df.loc[i-1]['ID'], e)
                
sentence_id(spmrl_dev, 'spmrl')
sentence_id(ud_dev, 'ud')

In [None]:
sentence_id(spmrl_treebank, 'spmrl')

In [None]:
spmrl_treebank

In [None]:
sentence_id(spmrl_train, 'spmrl')
sentence_id(ud_train, 'ud')

## Inspection

In [None]:
ud_dev#[ud_dev['FORM'].str.contains('כל')]

In [None]:
spmrl_dev[spmrl_dev['sent_id'] == 3]#['FEATS'].unique()

In [None]:
spmrl_dev#[spmrl_dev['sent_id'] == 4539].head(60)#['FEATS'].unique()

In [None]:
spmrl_dev[spmrl_dev['XPOS']== "S_PRN"]['FEATS'].unique()

In [None]:
spmrl_train[(spmrl_train['XPOS'] == 'DTT') & ~(spmrl_train['FEATS'].str.contains('suf_gen', na=False) )]#['FEATS'].unique()

In [None]:
spmrl_train[spmrl_train['sent_id'] == 106]

In [None]:
ud_train[(ud_train['LEMMA'] == 'או') ]#['XPOS'].unique()

In [None]:
ud_dev = suit_for_pandas('./data/he_htb-ud-dev.conllu')
ud_dev.columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']

In [None]:
spmrl_dev[spmrl_dev['XPOS'].str.contains('NN_S_PP', na=False)]

In [None]:
ud_dev[(ud_dev['XPOS'].str.contains('BN', na=False)) & (ud_dev['FEATS'].str.contains('HIFIL', na=False))]

In [None]:
spmrl_dev.to_csv('./data/spmrl-treebank/numbered_dev.csv')

## Segmentation

In [None]:
segmented_spmrl_df = pd.DataFrame(columns=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id'])

In [11]:
pronouns = {
     'suf_gen=F|suf_gen=M|suf_num=P|suf_per=1': '_אנחנו',
     'suf_gen=F|suf_gen=M|suf_num=S|suf_per=1': '_אני',
     'suf_gen=M|suf_num=S|suf_per=1': '_אני', # this is a mistake in sentence 899
     'suf_gen=M|suf_num=S|suf_per=2': '_אתה',
     'suf_gen=F|suf_num=S|suf_per=2': '_את',
     'suf_gen=M|suf_num=P|suf_per=2': '_אתם',
     'suf_gen=F|suf_num=P|suf_per=2': '_אתן',
     'suf_gen=F|suf_num=P|suf_per=3': '_הן',
     'suf_gen=F|suf_num=S|suf_per=3': '_היא',
     'suf_gen=M|suf_num=P|suf_per=3': '_הם',
     'suf_gen=M|suf_num=S|suf_per=3': '_הוא',
    'suf_gen=M|suf_num=S|per=3': '_הוא' # mistake at sentence 2348
}


In [36]:
def segment_df(unsegmented_df):
    output_df = pd.DataFrame(columns=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id'])
    for i, row in unsegmented_df.iterrows():
        try:
            suffix_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' in x])
            noun_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' not in x])
            clean_suffix_feats = "|".join([x.replace("suf_", "") for x in row['FEATS'].split("|") if 'suf' in x])
            if 'suf_' in row['FEATS'] and row['UPOS'] == 'NN':
        #     if row['XPOS'] == 'NN_S_PP' or row['XPOS'] == 'S_PP':
                output_df = output_df.append({'ID': row['ID'], 'FORM': row['LEMMA'] + '_', 'LEMMA': row['LEMMA'],  'UPOS': 'NOUN', 
                                                                'XPOS': 'NOUN','FEATS': 'Definite=Def|' + noun_feats, 'HEAD': row['HEAD'], 
                                                                'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)

                output_df = output_df.append({'ID': 0, 'FORM': '_של_', 'LEMMA': 'של',  'UPOS': 'ADP', 
                                                                'XPOS': 'ADP','FEATS': '_', 'HEAD': int(row['ID']) + 2, 
                                                                'DEPREL': 'case:gen', 'DEPS': row['DEPS'], 'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)

                output_df = output_df.append({'ID': 0, 'FORM': pronouns[suffix_feats], 'LEMMA': 'הוא',  'UPOS': 'PRON', 
                                                                'XPOS': 'PRON','FEATS': "Case=Gen|" + clean_suffix_feats + "|PronType=Prs", 'HEAD': int(row['ID']) + 2, 
                                                                    'DEPREL': 'nmod:poss', 'DEPS': row['DEPS'], 'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)
            elif row['XPOS'] == 'S_PRN':
                output_df.at[i-1, 'XPOS'] = 'ADP'
                output_df.at[i-1, 'FORM'] +=  '_'
                output_df.at[i-1, 'FEATS'] = 'Case=Gen'

                prev_feats = output_df.loc[i-1]['FEATS'] + '|'
                if prev_feats == '_|':
                    prev_feats = ''
                output_df = output_df.append({'ID': row['ID'], 'FORM': row['LEMMA'] + '_' ,'LEMMA': row['LEMMA'],  'UPOS': 'PRON', 
                                    'XPOS': 'PRON', 'FEATS': prev_feats + 'PronType=Prs', 'HEAD': row['HEAD'], 
                                    'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)

            elif row['XPOS'] == 'DTT' or row['XPOS'] == 'DT':
                if 'suf_' in row['FEATS']:
                    output_df = output_df.append({'ID': row['ID'], 'FORM': row['FORM'], 'LEMMA': row['LEMMA'],  'UPOS': 'NOUN', 
                                                                'XPOS': 'NOUN','FEATS': row['FEATS'], 'HEAD': row['HEAD'], 
                                                                'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)

                    output_df = output_df.append({'ID': 0, 'FORM': "_" + pronouns[suffix_feats], 'LEMMA': 'הוא',  'UPOS': 'PRON', 
                                                                'XPOS': 'PRON','FEATS': "Case=Gen|"+clean_suffix_feats + "|PronType=Prs", 'HEAD': int(row['ID']) + 1, 
                                                                'DEPREL': 'nmod:poss', 'DEPS': row['DEPS'], 'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)
                else:
                    output_df = output_df.append(row, ignore_index=True)            
            elif row['XPOS'] == 'S_PRP':
                output_df = output_df.append({'ID': row['ID'], 'FORM': row['FORM'], 'LEMMA': row['LEMMA'],  'UPOS': row['UPOS'], 
                                                                'XPOS': row['XPOS'],'FEATS': row['FEATS'] + "|PronType=Prs|Reflex=Yes", 'HEAD': row['HEAD'], 
                                                                'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)
            else:
                output_df = output_df.append(row, ignore_index=True)
        except KeyError as e:
            print(row)
    return output_df

In [None]:
seg_spmrl_train = pd.DataFrame(columns=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id'])
for i, row in spmrl_train.iterrows():
    segement_df(row, seg_spmrl_train)

In [47]:
seg_spmrl_df = segment_df(spmrl_dev)

In [None]:
seg_spmrl_treebank = segment_df(spmrl_treebank)

## Conversion Inspection

In [None]:
seg_spmrl_df[seg_spmrl_df['FEATS'].str.contains('suf', na=False)]['XPOS'].unique()

In [51]:
seg_spmrl_df[seg_spmrl_df['sent_id'] == 13]#['FEATS'].unique()

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id
352,1,"ח""ך","ח""ך",NOUN,NN,Gender=Masc|Number=Sing,7,subj,_,_,13
353,2,רן,רן,PROPN,NNP,_,1,nn,_,_,13
354,3,כהן,כהן,PROPN,NNP,_,2,nn,_,_,13
355,4,(,_,PUNCT,yyLRB,_,1,punct,_,_,13
356,5,רץ,רצ,PROPN,NNP,_,1,appos,_,_,13
357,6,),_,PUNCT,yyRRB,_,1,punct,_,_,13
358,7,אמר,אמר,VERB,VB,Gender=Masc|Number=Sing|Person=3|Tense=Past,0,ROOT,_,_,13
359,8,כי,כי,CC,CC,_,7,comp,_,_,13
360,9,על,על,ADP,IN,_,12,ccomp,_,_(prd in clause),13
361,10,ה,ה,DET,DEF,PronType=Art,11,def,_,_,13


In [None]:
spmrl_train[spmrl_train['sent_id'] == 3029]#['XPOS'].unique()

In [None]:
seg_spmrl_df[(seg_spmrl_df['DEPREL'] == 'posspmod') & (seg_spmrl_df['XPOS'] != 'POS')]#['DEPREL'].unique()

In [None]:
spmrl_train[spmrl_train['FORM'].str.contains('לזרז')]

In [22]:
seg_spmrl_df[seg_spmrl_df['FEATS'].str.contains('_\|') ]

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id


## Conversion

In [48]:
# gender
def simple_features_conversion(column, conversions):
    for old,new in conversions.items():
        column = column.replace(old,new)

    return column
    
basic_features = {'gen=F|gen=M': 'Gender=Fem,Masc', 'gen=F': 'Gender=Fem', 'gen=M':'Gender=Masc',
               'num=S':'Number=Sing',  'num=P': 'Number=Plur',
                'per=A': 'Person=1,2,3', 'per=': 'Person=', 
                'tense=BEINONI': 'VerbForm=Part', 'tense=TOINFINITIVE': 'VerbForm=Inf', 'tense=IMPERATIVE': 'Mood=Imp',
                'tense=PAST': 'Tense=Past', 'tense=FUTURE': 'Tense=Fut'
               }

seg_spmrl_df.loc[:, 'FEATS']  = seg_spmrl_df['FEATS'].apply(lambda x: simple_features_conversion(x, basic_features))

In [49]:
def pos_conversion(column, conversions):
    if column in conversions:
        return conversions[column]
    else:
        return column

basic_pos = {
         'IN': 'ADP', 'NNP': 'PROPN', 'JJ':'ADJ', 'NN': 'NOUN', 'VB': 'VERB', 'RB': 'ADV', 'NCD': 'NUM','NEG': 'ADV',
        'PREPOSITION': 'ADP', 'REL': 'SCONJ', 'COM': 'SCONJ', 'CONJ': 'CCONJ','POS': 'ADP', 'PRP': 'PRON',
        'yyCLN': 'PUNCT', 'yyCM': 'PUNCT', 'yyDASH': 'PUNCT', 'yyDOT': 'PUNCT', 'yyELPS': 'PUNCT', 'yyEXCL': 'PUNCT',
        'yyLRB': 'PUNCT', 'yyQM': 'PUNCT', 'yyQUOT': 'PUNCT', 'yyRRB': 'PUNCT', 'yySCLN': 'PUNCT', 'ZVL': 'X'
}
seg_spmrl_df.loc[:, 'UPOS']  = seg_spmrl_df['UPOS'].apply(lambda x: pos_conversion(x, basic_pos))    

In [50]:
def pos_convert_entire_line(row, conversions):
    xpos = row['XPOS']
    form = row['FORM']
#     if xpos in conversions:
    if xpos in conversions:
        if 'concat' in xpos:
            if xpos['concat'] == 'before':
                form = '_' + form
            elif xpos['concat'] == 'after':
                form += '_'
            else:
                form = '_' + form + '_'
        upos = conversions[xpos]['pos']
        if conversions[xpos]['deprel'] == 'deprel':
            deprel = row['DEPREL']
        else:
            deprel = conversions[xpos]['deprel']
        if conversions[xpos]['feats'] == 'feats':
            feats = row['FEATS']
        elif conversions[xpos]['feats']['old'] == '_':
            feats = conversions[xpos]['feats']['new']
        elif conversions[xpos]['feats']['old'] == 'feats+':
            if len(row['FEATS']) > 2:
                feats = row['FEATS'] + conversions[xpos]['feats']['new']
            else:
                feats = conversions[xpos]['feats']['new'][1:]
        elif conversions[xpos]['feats']['old'] == '+feats':
            feats = conversions[xpos]['feats']['new'] + row['FEATS']
        elif conversions[xpos]['feats']['old'] == '+feats+':
            feats = conversions[xpos]['feats']['new'][0] + row['FEATS'] + conversions[xpos]['feats']['new'][1]
        return pd.Series([upos, deprel, feats, form])
    else:
        return pd.Series([row['UPOS'], row['DEPREL'], row['FEATS'], row['FORM']])
    

entire_line_pos_conversion = {
    'AT': {'pos': 'ADP', 'deprel': 'case:acc', 'feats': {'old': '_', 'new': 'Case=Acc'}},
    'BN': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbForm=Part"}},
    'BNT': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '+feats+', 'new': ['Definite=Cons|', '|VerbForm=Part']}},
    'CD': {'pos': 'NUM', 'deprel': 'deprel', 'feats': 'feats'},
    'CDT': {'pos': 'NUM', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': "Definite=Cons|"}},
    'NNT': {'pos': 'NOUN', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': "Definite=Cons|"}},
    'COP': {'pos': 'AUX', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbType=Cop|VerbForm=Part"}},
    'DEF': {'pos': 'DET', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'PronType=Art'}, 'concat': 'after'},
    'EX': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'HebExistential=True'}},
    'P': {'pos': 'ADV', 'deprel': 'compound:affix', 'feats': {'old': '_', 'new': 'Prefix=True'}},
    'DUMMY_AT': {'pos': 'ADP', 'deprel': 'case:acc', 'feats': {'old': '_', 'new': 'Case=Acc'}},
    'JJT': {'pos': 'ADJ', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': 'Definite=Cons|'}},
    'MD': {'pos': 'AUX', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': '|VerbType=Mod'}},
    'QW': {'pos': 'ADV', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'PronType=Int'}},
    'TEMP': {'pos': 'SCONJ', 'deprel': 'mark', 'feats': {'old': '_', 'new': 'Case=Tem'}},
    'DTT': {'pos': 'DET', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'Definite=Cons'}},
    'S_ANP': {'pos': 'PRON', 'deprel': 'deprel', 'feats': {'old': '+feats+', 'new': ['Case=Acc|', '|PronType=Prs']}} 

}

seg_spmrl_df[['UPOS', 'DEPREL', 'FEATS', 'FORM']] = seg_spmrl_df.apply(lambda x: pos_convert_entire_line(x, entire_line_pos_conversion), axis=1)

missing:
ADVERB - lexical?
CC - lexical decision?
DT/DTT - Already handled in segmentation
NN_S_PP -  Already handled in segmentation
S_PP - Already handled in segmentation
S_PRP - Already handled in segmentation
S_PRN - Already handled in segmentation
DEf - typo in spmrl tb
INTJ - doesn't change

NEG
S_ANP - Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs


POS - interaction DEPREL/POS
PRP - interaction DEPREL/POS

==========================================
changed:
AT, BN, BNT, CD, CDT, NNT, COP, DEF, EX, P, DUMMY_AT, JJT. MD. QW. TEMP
'IN', 'NNP', 'JJ', 'NN', 'VB'
'PREPOSITION', 'REL', 'COM', 'CONJ','POS', 'PRP'
'yyCLN', 'yyCM': 'yyDASH' 'yyDOT' 'yyELPS', 'yyEXCL', 'yyLRB', 'yyQM' 'ZVL'

===========================================

'CDT', 'NN', 'BN', 'PREPOSITION', 'NNP', 'TEMP', 'PRP', 'yyCM',
       'CC', 'RB', 'JJ', 'yyDOT', 'VB', 'NNT', 'DEF', 'CONJ', 'POS',
       'REL', 'yyLRB', 'yyRRB', 'yyQUOT', 'AT', 'NN_S_PP', 'CD', 'IN',
       'QW', 'S_PRN', 'BNT', 'P', 'yyDASH', 'MD', 'DTT', 'COP', 'JJT',
       'yyCLN', 'yySCLN', 'yyQM', 'yyEXCL', 'EX', 'yyELPS', 'DUMMY_AT',
       'ADVERB', '', 'INTJ', 'ZVL', 'S_PRP', 'NEG', 'NCD', 'DEf', 'S_ANP',
       'S_PP',

missing in conversion of form:
1. when the article is silent (e.g. l+h+memshala), the h needs to be prefixed by _ (though I'm not sure why not _h_ and also why the l isn't suffixed).
2. spaceAfter needs to be added. Record logic
