In [None]:
!tar -xvf ./data/hebtb.tar.gz

In [None]:
!pwd

In [None]:
import pandas as pd
import numpy as np
import csv
from tqdm import trange

## Files 

In [None]:
# SPMRL treebank splits (gold CoNLL files): dev, train, test.
filepath_spmrl_dev, filepath_spmrl_train, filepath_spmrl_test = (
    './data/spmrl-treebank/dev_hebtb-gold.conll',
    './data/spmrl-treebank/train_hebtb-gold.conll',
    './data/spmrl-treebank/test_hebtb-gold.conll',
)

# UD treebank splits (CoNLL-U files): dev, train, test.
filepath_ud_dev, filepath_ud_train, filepath_ud_test = (
    './data/ud-treebank/he_htb-ud-dev.conllu',
    './data/ud-treebank/he_htb-ud-train.conllu',
    './data/ud-treebank/he_htb-ud-test.conllu',
)


## DF Preparation

In [None]:
def suit_for_pandas(filepath):
    """Load a tab-separated CoNLL-style file into a ten-column DataFrame.

    The file is first read with ``pandas.read_csv`` (no header, no NA
    filtering, no quote handling). If pandas cannot parse it, the file is
    re-read line by line: lines with exactly ten tab-separated fields become
    rows, lines with no tab at all (comment lines, blank lines) become rows
    whose ``ID`` holds the stripped text and whose other columns are empty
    strings, and lines with any other field count are dropped.

    Note: ``create_df_from_conll_file`` in this notebook implements the same
    contract.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    try:
        return pd.read_csv(filepath, sep='\t', header=None, names=columns,
                           na_filter=False, quoting=csv.QUOTE_NONE)
    except (pd.errors.ParserError, ValueError):
        # Only parsing/decoding problems trigger the manual fallback; a bare
        # ``except`` would also have swallowed KeyboardInterrupt.
        pass

    padding = ('',) * (len(columns) - 1)
    treebank = []
    # Explicit UTF-8: the treebanks contain Hebrew text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(filepath, 'r', encoding='utf-8') as source:
        for line in source:
            # Strip only the line terminator before splitting, so an empty
            # trailing field (line ending in a tab) still counts as a column.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) == len(columns):
                fields[0] = fields[0].lstrip()
                fields[-1] = fields[-1].rstrip()
                treebank.append(tuple(fields))
            elif len(fields) == 1:
                treebank.append((fields[0].strip(),) + padding)
    return pd.DataFrame(data=treebank, columns=columns)


In [None]:
def create_df_from_conll_file(filepath):
    """Read a tab-separated CoNLL-style file and return it as a DataFrame.

    ``pandas.read_csv`` is tried first (no header row, NA filtering off,
    quoting disabled). When pandas fails to parse the file, a manual reader
    takes over: every line with exactly ten tab-separated fields is kept as a
    row; every line without a tab (comments, blank lines) is kept with its
    stripped text in ``ID`` and empty strings elsewhere; all other lines are
    skipped.

    Note: ``suit_for_pandas`` in this notebook implements the same contract.

    Parameters
    ----------
    filepath : str
        Location of the treebank file.

    Returns
    -------
    pandas.DataFrame
        Columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    n_columns = len(columns)
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, names=columns,
                         na_filter=False, quoting=csv.QUOTE_NONE)
    except (pd.errors.ParserError, ValueError):
        # Narrowed from a bare ``except`` so interrupts and unrelated errors
        # are no longer hidden behind the fallback.
        treebank = []
        # The treebanks hold Hebrew text, so the encoding is pinned to UTF-8
        # instead of relying on the platform default.
        with open(filepath, 'r', encoding='utf-8') as source:
            for raw_line in source:
                # Remove only the newline: stripping all whitespace would also
                # remove a trailing tab and silently drop an empty last field.
                cells = raw_line.rstrip('\r\n').split('\t')
                if len(cells) == n_columns:
                    cells[0] = cells[0].lstrip()
                    cells[-1] = cells[-1].rstrip()
                    treebank.append(tuple(cells))
                elif len(cells) == 1:
                    treebank.append((cells[0].strip(),) + ('',) * (n_columns - 1))
        df = pd.DataFrame(data=treebank, columns=columns)
    return df

In [None]:
# Load the dev split of both treebanks with the same reader.
ud_dev, spmrl_dev = (
    suit_for_pandas(filepath=filepath_ud_dev),
    suit_for_pandas(filepath=filepath_spmrl_dev),
)

In [None]:
# Load the dev split of both treebanks again, this time via create_df_from_conll_file.
ud_dev, spmrl_dev = (
    create_df_from_conll_file(filepath=filepath_ud_dev),
    create_df_from_conll_file(filepath=filepath_spmrl_dev),
)

In [None]:
# Display the SPMRL dev frame to eyeball the parsed columns.
spmrl_dev

In [None]:
# Load the train split of both treebanks.
ud_train, spmrl_train = (
    suit_for_pandas(filepath=filepath_ud_train),
    suit_for_pandas(filepath=filepath_spmrl_train),
)

In [None]:
# Load the test split of both treebanks.
ud_test, spmrl_test = (
    suit_for_pandas(filepath=filepath_ud_test),
    suit_for_pandas(filepath=filepath_spmrl_test),
)

In [None]:
# SPMRL dev IDs become integers; any value that is not a plain digit string becomes 0.
spmrl_dev['ID'] = spmrl_dev['ID'].apply(
    lambda raw_id: 0 if not str(raw_id).isdigit() else int(raw_id)
)
# Both dev frames get an empty sent_id column.
spmrl_dev['sent_id'] = ''
ud_dev['sent_id'] = ''


In [None]:
# SPMRL train IDs become integers; any value that is not a plain digit string becomes 0.
spmrl_train['ID'] = spmrl_train['ID'].apply(
    lambda raw_id: 0 if not str(raw_id).isdigit() else int(raw_id)
)
# Both train frames get an empty sent_id column.
spmrl_train['sent_id'] = ''
ud_train['sent_id'] = ''


In [None]:
# SPMRL test IDs become integers; any value that is not a plain digit string becomes 0.
spmrl_test['ID'] = spmrl_test['ID'].apply(
    lambda raw_id: 0 if not str(raw_id).isdigit() else int(raw_id)
)
# Both test frames get an empty sent_id column.
spmrl_test['sent_id'] = ''
ud_test['sent_id'] = ''

In [None]:
# Stack dev, train and test (in that order) into one frame with a fresh 0..n-1 index.
spmrl_treebank = pd.concat(
    objs=[spmrl_dev, spmrl_train, spmrl_test],
    ignore_index=True,
)
# Same ID coercion as for the individual splits: non-digit values become 0.
spmrl_treebank['ID'] = spmrl_treebank['ID'].apply(
    lambda raw_id: 0 if not str(raw_id).isdigit() else int(raw_id)
)
# Start from an empty sent_id column for the combined frame.
spmrl_treebank['sent_id'] = ''

In [None]:
# Inspect train rows whose DEPREL is 'posspmod' although their XPOS is not 'POS'.
spmrl_train[(spmrl_train['DEPREL'] == 'posspmod') & (spmrl_train['XPOS'] != 'POS')]

In [None]:
# Display the combined SPMRL frame (dev + train + test).
spmrl_treebank

In [None]:
def sentence_id(df, tb):
    """Fill ``df['sent_id']`` in place with a running sentence number.

    Rows are processed in their positional order, so the frame may carry any
    index (it no longer has to be a gap-free 0..n-1 range).

    ``tb == 'ud'``
        A row whose ``ID`` is a string containing ``'# sent_id'`` starts a new
        sentence and keeps its existing ``sent_id`` value; every other row
        receives the current counter (which is 0 before the first marker).

    ``tb == 'spmrl'``
        The first row gets 1. Afterwards a row whose ``ID`` is greater than
        the previous row's ``ID`` stays in the current sentence; otherwise the
        counter is incremented and the row opens the next sentence. IDs are
        expected to be integers here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ID`` column. An existing ``sent_id`` column is used
        as the starting point; if it is missing, untouched rows get ``''``.
    tb : str
        ``'ud'`` or ``'spmrl'``.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    ValueError
        If ``tb`` is neither ``'ud'`` nor ``'spmrl'`` (previously a silent no-op).
    """
    if tb not in ('ud', 'spmrl'):
        raise ValueError("tb must be 'ud' or 'spmrl', got {!r}".format(tb))

    ids = df['ID'].tolist()
    if 'sent_id' in df.columns:
        sent_ids = df['sent_id'].tolist()
    else:
        sent_ids = [''] * len(ids)

    if tb == 'ud':
        current = 0
        for pos, token_id in enumerate(ids):
            if isinstance(token_id, str) and '# sent_id' in token_id:
                current += 1
            else:
                sent_ids[pos] = current
    else:
        current = 1
        for pos, token_id in enumerate(ids):
            if pos == 0:
                # No predecessor to compare with: the first row opens sentence 1.
                sent_ids[pos] = 1
                continue
            previous_id = ids[pos - 1]
            if not isinstance(token_id, str) and token_id == 0:
                # Provisional value; it only survives when the comparison
                # below cannot be made (mixed str/int IDs).
                sent_ids[pos] = 0
            try:
                ascending = token_id > previous_id
            except TypeError as e:
                # Mixed str/int IDs: report the pair and leave the row as is.
                print(token_id, previous_id, e)
                continue
            if ascending:
                sent_ids[pos] = current
            elif isinstance(previous_id, str):
                # String IDs that do not ascend are left untouched.
                continue
            else:
                current += 1
                sent_ids[pos] = current

    # Object dtype keeps the column able to hold both '' and integers.
    df['sent_id'] = pd.Series(sent_ids, index=df.index, dtype=object)
                
# Number the sentences of both dev frames in place.
sentence_id(df=spmrl_dev, tb='spmrl')
sentence_id(df=ud_dev, tb='ud')

In [None]:
# Number the sentences of the combined SPMRL frame in place.
sentence_id(df=spmrl_treebank, tb='spmrl')

In [None]:
# Display the combined SPMRL frame again to check the sent_id column.
spmrl_treebank

In [None]:
# Number the sentences of both train frames in place.
sentence_id(df=spmrl_train, tb='spmrl')
sentence_id(df=ud_train, tb='ud')

### Inspection

### Segmentation

In [None]:
# Empty frame with the ten CoNLL columns plus sent_id.
segmented_spmrl_df = pd.DataFrame(
    columns=[
        'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
        'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
        'sent_id',
    ]
)

In [None]:
# Lookup from a suffix-feature string (the 'suf_*' parts of FEATS joined by '|')
# to the pronoun form, written with a leading underscore.
pronouns = {
    # first person
    'suf_gen=F|suf_gen=M|suf_num=P|suf_per=1': '_אנחנו',
    'suf_gen=F|suf_gen=M|suf_num=S|suf_per=1': '_אני',
    # irregular key kept on purpose: annotation mistake in sentence 899
    'suf_gen=M|suf_num=S|suf_per=1': '_אני',
    # second person
    'suf_gen=M|suf_num=S|suf_per=2': '_אתה',
    'suf_gen=F|suf_num=S|suf_per=2': '_את',
    'suf_gen=M|suf_num=P|suf_per=2': '_אתם',
    'suf_gen=F|suf_num=P|suf_per=2': '_אתן',
    # third person
    'suf_gen=F|suf_num=P|suf_per=3': '_הן',
    'suf_gen=F|suf_num=S|suf_per=3': '_היא',
    'suf_gen=M|suf_num=P|suf_per=3': '_הם',
    'suf_gen=M|suf_num=S|suf_per=3': '_הוא',
    # irregular key kept on purpose: annotation mistake at sentence 2348
    'suf_gen=M|suf_num=S|per=3': '_הוא',
}


In [None]:
def re_segment_df(df):
    """Unfinished one-argument draft of ``re_segment_df``.

    The per-row logic was never written; the loop is kept as a placeholder so
    the cell is valid Python. A two-argument function with the same name is
    defined further down in this notebook and replaces this one as soon as
    its cell is run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to iterate over.

    Returns
    -------
    None
    """
    for i, row in df.iterrows():
        pass  # TODO: per-row re-segmentation was never implemented

In [None]:
def segment_df(unsegmented_df):
    """Split rows that carry pronominal-suffix features into separate rows.

    For every input row (FEATS is split on ``'|'``; parts containing ``'suf'``
    are the suffix features):

    * FEATS contains ``'suf_'`` and UPOS is ``'NN'``: three rows are emitted -
      the noun (FORM = LEMMA + ``'_'``, FEATS = ``'Definite=Def|'`` + the
      non-suffix features + ``'|xx_UD=Seg'``), the particle ``'_של_'`` and the
      pronoun looked up in the module-level ``pronouns`` dict. The two added
      rows get ID 0 and HEAD = ``int(ID) + 2``.
    * XPOS is ``'DTT'`` or ``'DT'`` and FEATS contains ``'suf_'``: two rows
      are emitted - the original token retagged as NOUN and a pronoun row
      with ID 0 and HEAD = ``int(ID) + 1``.
    * anything else: the row is copied unchanged.

    A ``KeyError`` while handling a row (for example a suffix-feature string
    missing from ``pronouns``) prints the row and moves on; records already
    emitted for that row are kept, as before.

    The output is assembled from a list of records in one step, because
    ``DataFrame.append`` no longer exists in pandas 2.x and appending row by
    row is quadratic.

    Parameters
    ----------
    unsegmented_df : pandas.DataFrame
        Frame with the ten CoNLL columns plus ``sent_id``.

    Returns
    -------
    pandas.DataFrame
        New object-dtype frame with a fresh 0..n-1 index; the eleven standard
        columns come first, followed by any extra columns of the input.
    """
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
               'sent_id']
    records = []
    for _, row in unsegmented_df.iterrows():
        try:
            feature_parts = row['FEATS'].split("|")
            suffix_parts = [x for x in feature_parts if 'suf' in x]
            suffix_feats = "|".join(suffix_parts)
            noun_feats = "|".join(x for x in feature_parts if 'suf' not in x)
            clean_suffix_feats = "|".join(x.replace("suf_", "") for x in suffix_parts)
            has_suffix = 'suf_' in row['FEATS']

            if has_suffix and row['UPOS'] == 'NN':
                records.append({'ID': row['ID'], 'FORM': row['LEMMA'] + '_', 'LEMMA': row['LEMMA'],
                                'UPOS': 'NOUN', 'XPOS': 'NOUN',
                                'FEATS': 'Definite=Def|' + noun_feats + '|xx_UD=Seg',
                                'HEAD': row['HEAD'], 'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'],
                                'MISC': row['MISC'], 'sent_id': row['sent_id']})

                records.append({'ID': 0, 'FORM': '_של_', 'LEMMA': 'של', 'UPOS': 'ADP',
                                'XPOS': 'ADP', 'FEATS': '_' + '|xx_UD=Seg',
                                'HEAD': int(row['ID']) + 2, 'DEPREL': 'case:gen',
                                'DEPS': row['DEPS'], 'MISC': row['MISC'],
                                'sent_id': row['sent_id']})

                records.append({'ID': 0, 'FORM': pronouns[suffix_feats], 'LEMMA': 'הוא',
                                'UPOS': 'PRON', 'XPOS': 'PRON',
                                'FEATS': "Case=Gen|" + clean_suffix_feats + "|PronType=Prs" + '|xx_UD=Seg',
                                'HEAD': int(row['ID']) + 2, 'DEPREL': 'nmod:poss',
                                'DEPS': row['DEPS'], 'MISC': row['MISC'],
                                'sent_id': row['sent_id']})

            elif row['XPOS'] == 'DTT' or row['XPOS'] == 'DT':
                if has_suffix:
                    records.append({'ID': row['ID'], 'FORM': row['FORM'], 'LEMMA': row['LEMMA'],
                                    'UPOS': 'NOUN', 'XPOS': 'NOUN', 'FEATS': row['FEATS'],
                                    'HEAD': row['HEAD'], 'DEPREL': row['DEPREL'],
                                    'DEPS': row['DEPS'], 'MISC': row['MISC'],
                                    'sent_id': row['sent_id']})

                    # NOTE(review): the values of `pronouns` already start with
                    # '_', so this FORM begins with two underscores - confirm
                    # whether that is intended.
                    records.append({'ID': 0, 'FORM': "_" + pronouns[suffix_feats], 'LEMMA': 'הוא',
                                    'UPOS': 'PRON', 'XPOS': 'PRON',
                                    'FEATS': "Case=Gen|" + clean_suffix_feats + "|PronType=Prs",
                                    'HEAD': int(row['ID']) + 1, 'DEPREL': 'nmod:poss',
                                    'DEPS': row['DEPS'], 'MISC': row['MISC'],
                                    'sent_id': row['sent_id']})
                else:
                    records.append(row.to_dict())
            else:
                records.append(row.to_dict())
        except KeyError as e:
            print(row)

    if not records:
        return pd.DataFrame(columns=columns)

    # dtype=object mirrors what appending to an empty frame used to produce.
    output_df = pd.DataFrame(records, dtype=object)
    extra_columns = [name for name in output_df.columns if name not in columns]
    return output_df.reindex(columns=columns + extra_columns)

In [None]:
def re_segment_df(df, segmentations):
    """Prototype: insert a fixed placeholder row after every row whose XPOS is 'NN'.

    The row is written under the fractional label ``i + 0.5`` so that it
    sorts directly after row ``i``; afterwards all labels are shifted by 1
    and the frame is sorted by index.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place: rows are added and its index is shifted by 1.
        The inserted list has eleven values, so ``df`` presumably has the
        ten CoNLL columns plus ``sent_id`` - TODO confirm.
    segmentations
        Not used by the current body.

    Returns
    -------
    pandas.DataFrame
        The index-sorted result of ``df.sort_index()``.
    """
    for i, row in df.iterrows():
        # NOTE(review): the three feature strings below are computed but never used.
        suffix_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' in x])
        noun_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' not in x])
        clean_suffix_feats = "|".join([x.replace("suf_", "") for x in row['FEATS'].split("|") if 'suf' in x])
        if row['XPOS'] == 'NN':
            # Hard-coded placeholder values; the label i+0.5 sorts between i and i+1.
            df.loc[i+0.5] =  ['0', 'שובל', 'שובל', 'NNP', 'NNP', 'gen=M|num=P|per=A', '10', 'obj', '_', '_', 'from file']
    # Shifts every label by one on the caller's frame before sorting.
    df.index = df.index + 1
    df = df.sort_index()
    return df


#             suffix_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' in x])
#             noun_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' not in x])
#             clean_suffix_feats = "|".join([x.replace("suf_", "") for x in row['FEATS'].split("|") if 'suf' in x])
#             if 'suf_' in row['FEATS'] and row['UPOS'] == 'NN':
#         #     if row['XPOS'] == 'NN_S_PP' or row['XPOS'] == 'S_PP':
#                 output_df = output_df.append({'ID': row['ID'], 'FORM': row['LEMMA'] + '_', 'LEMMA': row['LEMMA'],  
#                                               'UPOS': 'NOUN', 'XPOS': 'NOUN','FEATS': 'Definite=Def|' + noun_feats + '|xx_UD=Seg', 
#                                               'HEAD': row['HEAD'], 'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 
#                                               'MISC': row['MISC'], 'sent_id': row['sent_id']}, ignore_index=True)

#                 output_df = output_df.append({'ID': 0, 'FORM': '_של_', 'LEMMA': 'של',  'UPOS': 'ADP', 
#                                               'XPOS': 'ADP','FEATS': '_'  + '|xx_UD=Seg', 'HEAD': int(row['ID']) + 2, 
#                                               'DEPREL': 'case:gen', 'DEPS': row['DEPS'], 'MISC': row['MISC'],
#                                               'sent_id': row['sent_id']}, ignore_index=True)

#                 output_df = output_df.append({'ID': 0, 'FORM': pronouns[suffix_feats], 'LEMMA': 'הוא',  'UPOS': 'PRON', 
#                                               'XPOS': 'PRON','FEATS': "Case=Gen|" + clean_suffix_feats + "|PronType=Prs"+'|xx_UD=Seg, 
#                                               'HEAD': int(row['ID']) + 2, 'DEPREL': 'nmod:poss', 'DEPS': row['DEPS'], 
#                                               'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)


In [None]:
# Segment the SPMRL train split. The previous loop called the undefined name
# `segement_df` once per row; segment_df takes the whole frame and returns the
# segmented frame, so a single call is all that is needed.
seg_spmrl_train = segment_df(spmrl_train)

In [None]:
# Segment the SPMRL dev split; the cells below inspect and convert this frame.
seg_spmrl_df = segment_df(unsegmented_df=spmrl_dev)

In [None]:
# Segment the combined SPMRL frame (dev + train + test).
seg_spmrl_treebank = segment_df(unsegmented_df=spmrl_treebank)

## Conversion Inspection

In [None]:
# XPOS tags of rows whose FEATS still contain 'suf' after segmentation.
seg_spmrl_df[seg_spmrl_df['FEATS'].str.contains('suf', na=False)]['XPOS'].unique()

In [None]:
# Spot-check: all rows of sentence 13 in the segmented dev frame.
seg_spmrl_df[seg_spmrl_df['sent_id'] == 13]#['FEATS'].unique()

In [None]:
# Spot-check: all rows of sentence 3029 in the (unsegmented) train frame.
spmrl_train[spmrl_train['sent_id'] == 3029]#['XPOS'].unique()

In [None]:
# Segmented dev rows whose DEPREL is 'posspmod' although their XPOS is not 'POS'.
seg_spmrl_df[(seg_spmrl_df['DEPREL'] == 'posspmod') & (seg_spmrl_df['XPOS'] != 'POS')]#['DEPREL'].unique()

In [None]:
# Look up train rows whose FORM contains the string 'לזרז'.
spmrl_train[spmrl_train['FORM'].str.contains('לזרז')]

In [None]:
# Rows whose FEATS contain the literal text '_|', i.e. an empty-value marker
# glued to further features. The pattern is a raw string because '\|' is not a
# valid escape in a normal string literal; na=False matches the 'suf' check
# earlier in this section and keeps the mask boolean if FEATS has missing values.
seg_spmrl_df[seg_spmrl_df['FEATS'].str.contains(r'_\|', na=False)]

## Conversion

In [None]:
def change_previous_row(row):
    """Return the FEATS value for ``row``, retagging the row before it if needed.

    Intended for ``spmrl_dev.apply(..., axis=1)``. When ``row['XPOS']`` is
    ``'PREPOSITION'`` and a row labelled ``row.name - 1`` exists in the global
    ``spmrl_dev``:

    * that previous row is changed in ``spmrl_dev``: XPOS becomes ``'ADP'``
      and FEATS becomes ``'Case=Gen'``;
    * the returned value is the previous row's old FEATS + ``'|PronType=Prs'``.

    In every other case the row's own FEATS is returned unchanged.

    NOTE(review): when the result of ``apply`` is assigned back to
    ``spmrl_dev['FEATS']``, the ``'Case=Gen'`` written to the previous row is
    presumably replaced by that row's own return value - confirm whether the
    FEATS side effect is meant to survive.

    Parameters
    ----------
    row : pandas.Series
        One row of ``spmrl_dev``; ``row.name`` must be its index label.

    Returns
    -------
    The FEATS value to store for this row.
    """
    feats = row['FEATS']
    try:
        if row['XPOS'] == 'PREPOSITION':
            prev = row.name - 1
            # Raises KeyError when there is no previous row (e.g. first row),
            # before anything has been modified.
            prev_feats = spmrl_dev.at[prev, 'FEATS']
            spmrl_dev.at[prev, 'XPOS'] = 'ADP'
            spmrl_dev.at[prev, 'FEATS'] = 'Case=Gen'
            feats = prev_feats + '|PronType=Prs'
    except (KeyError, TypeError):
        # Missing predecessor / non-numeric label / non-string FEATS: keep the
        # row's own value. Narrowed from a bare ``except`` so that interrupts
        # and unrelated errors are no longer hidden.
        pass
    return feats
    
# Recompute FEATS row by row; change_previous_row also edits spmrl_dev as a side effect.
spmrl_dev['FEATS'] = spmrl_dev.apply(change_previous_row, axis=1)

In [None]:
# Substring-based feature renaming (gender, number, person, tense, ...).
def simple_features_conversion(column, conversions):
    """Apply every ``old -> new`` substring replacement to one feature string.

    Replacements run in the iteration order of ``conversions`` and each one
    works on the result of the previous one, so the order of the mapping
    matters.

    Parameters
    ----------
    column : str
        A single FEATS value (despite the name, not a whole column).
    conversions : dict
        Maps the substring to replace to its replacement.

    Returns
    -------
    str
        The converted feature string.
    """
    converted = column
    for pattern in conversions:
        converted = converted.replace(pattern, conversions[pattern])
    return converted
    
# Substring replacements from SPMRL feature spellings to UD ones. Order matters:
# 'suf_' is removed first, the combined gender comes before the single genders,
# and 'per=A' comes before the generic 'per='.
basic_features = {
    'suf_': '',
    'gen=F|gen=M': 'Gender=Fem,Masc',
    'gen=F': 'Gender=Fem',
    'gen=M': 'Gender=Masc',
    'num=S': 'Number=Sing',
    'num=P': 'Number=Plur',
    'per=A': 'Person=1,2,3',
    'per=': 'Person=',
    'tense=BEINONI': 'VerbForm=Part',
    'tense=TOINFINITIVE': 'VerbForm=Inf',
    'tense=IMPERATIVE': 'Mood=Imp',
    'tense=PAST': 'Tense=Past',
    'tense=FUTURE': 'Tense=Fut',
}

# Rewrite every FEATS value of the segmented dev frame with the basic_features replacements.
seg_spmrl_df.loc[:, 'FEATS'] = seg_spmrl_df['FEATS'].apply(
    simple_features_conversion, conversions=basic_features
)

In [None]:
def pos_conversion(column, conversions):
    """Map one tag through ``conversions``; unknown tags are returned as they are.

    Parameters
    ----------
    column : str
        A single tag value (despite the name, not a whole column).
    conversions : dict
        Maps a source tag to its replacement tag.

    Returns
    -------
    The replacement tag, or ``column`` itself when there is no entry for it.
    """
    return conversions.get(column, column)

# One-to-one tag replacements used with pos_conversion; tags without an entry stay unchanged.
basic_pos = {
    # open- and closed-class words
    'IN': 'ADP',
    'NNP': 'PROPN',
    'JJ': 'ADJ',
    'NN': 'NOUN',
    'VB': 'VERB',
    'RB': 'ADV',
    'NCD': 'NUM',
    'NEG': 'ADV',
    'PREPOSITION': 'ADP',
    'REL': 'SCONJ',
    'COM': 'SCONJ',
    'CONJ': 'CCONJ',
    'POS': 'ADP',
    'PRP': 'PRON',
    # punctuation tags (yy*)
    'yyCLN': 'PUNCT',
    'yyCM': 'PUNCT',
    'yyDASH': 'PUNCT',
    'yyDOT': 'PUNCT',
    'yyELPS': 'PUNCT',
    'yyEXCL': 'PUNCT',
    'yyLRB': 'PUNCT',
    'yyQM': 'PUNCT',
    'yyQUOT': 'PUNCT',
    'yyRRB': 'PUNCT',
    'yySCLN': 'PUNCT',
    # other
    'ZVL': 'X',
}
# Replace the UPOS tags of the segmented dev frame that have an entry in basic_pos.
seg_spmrl_df.loc[:, 'UPOS'] = seg_spmrl_df['UPOS'].apply(
    pos_conversion, conversions=basic_pos
)

In [None]:
def pos_convert_entire_line(row, conversions):
    """Convert FORM, UPOS, DEPREL and FEATS of one row according to its XPOS.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'XPOS', 'FORM', 'UPOS', 'DEPREL' and 'FEATS'.
    conversions : dict
        Maps an XPOS tag to a spec dict with these keys:

        * ``'pos'``    - the new UPOS;
        * ``'deprel'`` - the new DEPREL, or the string ``'deprel'`` to keep
          the row's own value;
        * ``'feats'``  - the string ``'feats'`` to keep the row's FEATS, or a
          dict ``{'old': mode, 'new': value}`` where mode is ``'_'`` (replace
          with value), ``'feats+'`` (append value), ``'+feats'`` (prepend
          value) or ``'+feats+'`` (value is a ``[prefix, suffix]`` pair);
        * ``'concat'`` (optional) - ``'before'`` prefixes FORM with ``'_'``,
          ``'after'`` suffixes it, any other value does both.

        In the append/prepend modes a FEATS value of at most two characters
        (e.g. ``'_'``) counts as empty: only the new value is used and the
        ``'|'`` separator next to the missing features is dropped.

    Returns
    -------
    pandas.Series
        ``[FORM, UPOS, DEPREL, FEATS]`` - the same order for converted and
        untouched rows, matching the columns the caller assigns to.

    Raises
    ------
    ValueError
        If a feats spec uses an unknown ``'old'`` mode.
    """
    xpos = row['XPOS']
    form = row['FORM']
    if xpos not in conversions:
        return pd.Series([form, row['UPOS'], row['DEPREL'], row['FEATS']])

    spec = conversions[xpos]

    # The concat marker lives in the spec dict, not in the tag string.
    if 'concat' in spec:
        if spec['concat'] == 'before':
            form = '_' + form
        elif spec['concat'] == 'after':
            form += '_'
        else:
            form = '_' + form + '_'

    upos = spec['pos']
    deprel = row['DEPREL'] if spec['deprel'] == 'deprel' else spec['deprel']

    feats_spec = spec['feats']
    current_feats = row['FEATS']
    if feats_spec == 'feats':
        feats = current_feats
    else:
        mode = feats_spec['old']
        new = feats_spec['new']
        has_feats = len(current_feats) > 2
        if mode == '_':
            feats = new
        elif mode == 'feats+':
            feats = current_feats + new if has_feats else new.lstrip('|')
        elif mode == '+feats':
            feats = new + current_feats if has_feats else new.rstrip('|')
        elif mode == '+feats+':
            if has_feats:
                feats = new[0] + current_feats + new[1]
            else:
                feats = new[0] + new[1].lstrip('|')
        else:
            raise ValueError("unknown feats mode {!r} for XPOS {!r}".format(mode, xpos))

    return pd.Series([form, upos, deprel, feats])
    

# Per-XPOS conversion specs consumed by pos_convert_entire_line:
#   'pos'    -> new UPOS
#   'deprel' -> new DEPREL, or 'deprel' to keep the row's value
#   'feats'  -> 'feats' to keep FEATS, or {'old': mode, 'new': value} with mode
#               '_' (replace), 'feats+' (append), '+feats' (prepend),
#               '+feats+' (value is a [prefix, suffix] pair)
#   'concat' -> optional: where to attach '_' to FORM
entire_line_pos_conversion = {
    'AT': {'pos': 'ADP', 'deprel': 'case:acc', 'feats': {'old': '_', 'new': 'Case=Acc'}},
    'BN': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbForm=Part"}},
    'BNT': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '+feats+', 'new': ['Definite=Cons|', '|VerbForm=Part']}},
    'CD': {'pos': 'NUM', 'deprel': 'deprel', 'feats': 'feats'},
    'CDT': {'pos': 'NUM', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': "Definite=Cons|"}},
    'NNT': {'pos': 'NOUN', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': "Definite=Cons|"}},
    'COP': {'pos': 'AUX', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbType=Cop|VerbForm=Part"}},
    'DEF': {'pos': 'DET', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'PronType=Art'}, 'concat': 'after'},
    'EX': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'HebExistential=True'}},
    'P': {'pos': 'ADV', 'deprel': 'compound:affix', 'feats': {'old': '_', 'new': 'Prefix=True'}},
    'DUMMY_AT': {'pos': 'ADP', 'deprel': 'case:acc', 'feats': {'old': '_', 'new': 'Case=Acc'}},
    'JJT': {'pos': 'ADJ', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': 'Definite=Cons|'}},
    'MD': {'pos': 'AUX', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': '|VerbType=Mod'}},
    'QW': {'pos': 'ADV', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'PronType=Int'}},
    'TEMP': {'pos': 'SCONJ', 'deprel': 'mark', 'feats': {'old': '_', 'new': 'Case=Tem'}},
    'DTT': {'pos': 'DET', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'Definite=Cons'}},
    # The comma after this entry was missing, which made the whole cell a SyntaxError.
    'S_ANP': {'pos': 'PRON', 'deprel': 'deprel', 'feats': {'old': '+feats+', 'new': ['Case=Acc|', '|PronType=Prs']}},
    'S_PRP': {'pos': 'PRON', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': '|PronType=Prs|Reflex=Yes'}},
}

# Convert FORM, UPOS, DEPREL and FEATS of every row whose XPOS has a spec.
# NOTE(review): the four values returned per row are written to the four
# listed columns, so pos_convert_entire_line must return them in this order.
seg_spmrl_df[['FORM', 'UPOS', 'DEPREL', 'FEATS']] = seg_spmrl_df.apply(
    pos_convert_entire_line,
    axis=1,
    conversions=entire_line_pos_conversion,
)

missing:
ADVERB - lexical?
CC - lexical decision?
DT/DTT - Already handled in segmentation
NN_S_PP -  Already handled in segmentation
S_PP - Already handled in segmentation
S_PRP - Already handled in segmentation
S_PRN - Already handled in segmentation
DEf - typo for DEF in the SPMRL treebank
INTJ - doesn't change

NEG
S_ANP - Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs


POS - interaction DEPREL/POS
PRP - interaction DEPREL/POS

==========================================
changed:
AT, BN, BNT, CD, CDT, NNT, COP, DEF, EX, P, DUMMY_AT, JJT, MD, QW, TEMP
'IN', 'NNP', 'JJ', 'NN', 'VB'
'PREPOSITION', 'REL', 'COM', 'CONJ','POS', 'PRP'
'yyCLN', 'yyCM', 'yyDASH', 'yyDOT', 'yyELPS', 'yyEXCL', 'yyLRB', 'yyQM', 'ZVL'

===========================================

'CDT', 'NN', 'BN', 'PREPOSITION', 'NNP', 'TEMP', 'PRP', 'yyCM',
       'CC', 'RB', 'JJ', 'yyDOT', 'VB', 'NNT', 'DEF', 'CONJ', 'POS',
       'REL', 'yyLRB', 'yyRRB', 'yyQUOT', 'AT', 'NN_S_PP', 'CD', 'IN',
       'QW', 'S_PRN', 'BNT', 'P', 'yyDASH', 'MD', 'DTT', 'COP', 'JJT',
       'yyCLN', 'yySCLN', 'yyQM', 'yyEXCL', 'EX', 'yyELPS', 'DUMMY_AT',
       'ADVERB', '', 'INTJ', 'ZVL', 'S_PRP', 'NEG', 'NCD', 'DEf', 'S_ANP',
       'S_PP',

missing in conversion of form:
1. When the article is silent (e.g. l+h+memshala), the h needs to be prefixed by _ (though I'm not sure why it is not _h_, and why the l isn't suffixed with _).
2. SpaceAfter needs to be added. TODO: record the logic.
