In [None]:
!tar -xvf ./data/hebtb.tar.gz

In [None]:
!pwd

In [None]:
import pandas as pd
import numpy as np
import csv

## Files 

In [None]:
# Gold CoNLL files of the SPMRL Hebrew treebank, one per split (dev / train / test).
filepath_spmrl_dev = './data/spmrl-treebank/dev_hebtb-gold.conll'
filepath_spmrl_train = './data/spmrl-treebank/train_hebtb-gold.conll'
filepath_spmrl_test = './data/spmrl-treebank/test_hebtb-gold.conll'

# CoNLL-U files of the UD Hebrew (HTB) treebank, one per split (dev / train / test).
filepath_ud_dev = './data/ud-treebank/he_htb-ud-dev.conllu'
filepath_ud_train = './data/ud-treebank/he_htb-ud-train.conllu'
filepath_ud_test = './data/ud-treebank/he_htb-ud-test.conllu'


## DF Preparation

In [None]:
def suit_for_pandas(filepath):
    """Load a tab-separated CoNLL / CoNLL-U file into a DataFrame.

    The frame always has the ten columns ID, FORM, LEMMA, UPOS, XPOS, FEATS,
    HEAD, DEPREL, DEPS and MISC. The file is first read with ``pd.read_csv``
    (no quoting, no NaN filtering). If that fails, for example because a line
    has more than ten fields, the file is parsed by hand instead.

    In the manual fallback:
      * lines with exactly ten tab-separated fields become one row each;
      * lines with a single field (comments, and also blank lines) become a
        row holding the stripped text in ``ID`` and empty strings elsewhere;
      * lines with any other field count are dropped.

    Parameters
    ----------
    filepath : str
        Path of the treebank file.

    Returns
    -------
    pandas.DataFrame
        One row per retained line.
    """
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, names=columns, na_filter=False, quoting=csv.QUOTE_NONE)
    except Exception:
        # `Exception` rather than a bare `except`, so Ctrl-C / SystemExit still propagate.
        treebank = []
        # read_csv decodes as UTF-8 by default; be explicit here so the fallback
        # does not depend on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as source:
            for line in source:
                # Strip only the line terminator: a full strip() would also remove
                # trailing tabs and so drop trailing empty fields.
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) == 10:
                    treebank.append(tuple(fields))
                elif len(fields) == 1:
                    treebank.append((line.strip(), '', '', '', '', '', '', '', '', ''))
        df = pd.DataFrame(data=treebank, columns=columns)
    return df


In [None]:
# Load the dev split of both treebanks into DataFrames with the ten CoNLL columns.
ud_dev = suit_for_pandas(filepath_ud_dev)
spmrl_dev = suit_for_pandas(filepath_spmrl_dev)

In [None]:
# Load the train split of both treebanks into DataFrames with the ten CoNLL columns.
ud_train = suit_for_pandas(filepath_ud_train)
spmrl_train = suit_for_pandas(filepath_spmrl_train)

In [None]:
# Add an empty 'sent_id' column to both dev frames; sentence_id() fills it in later.
ud_dev['sent_id'] = ''
spmrl_dev['sent_id'] = ''
# Make SPMRL token IDs integers; any value that is not purely digits becomes 0.
spmrl_dev['ID'] = spmrl_dev['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)


In [None]:
# Add an empty 'sent_id' column to both train frames; sentence_id() fills it in later.
ud_train['sent_id'] = ''
spmrl_train['sent_id'] = ''
# Make SPMRL token IDs integers; any value that is not purely digits becomes 0.
spmrl_train['ID'] = spmrl_train['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)


In [None]:
def sentence_id(df, tb):
    """Number the sentences of a treebank frame in place via its 'sent_id' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Treebank frame with an 'ID' column. It is modified in place.
    tb : str
        Treebank flavour, which selects how sentence boundaries are detected:

        * ``'ud'``: every row whose ID contains ``'# sent_id'`` starts a new
          sentence. That marker row keeps its current 'sent_id'; every other
          row gets the running sentence number.
        * ``'spmrl'``: rows are compared with the row before them. An ID greater
          than the previous ID continues the sentence; otherwise a new sentence
          starts. The first row always belongs to sentence 1.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``tb`` is neither ``'ud'`` nor ``'spmrl'``.
    """
    if tb == 'ud':
        counter = 0
        for label, token_id in zip(df.index, df['ID']):
            if '# sent_id' in token_id:
                counter += 1
            else:
                df.at[label, 'sent_id'] = counter
    elif tb == 'spmrl':
        counter = 1
        previous_id = None
        is_first_row = True
        # Track the previous row while iterating instead of looking it up as
        # df.loc[i-1]: the lookup only works when the index is a contiguous
        # 0..n-1 range.
        for label, token_id in zip(df.index, df['ID']):
            if token_id == 0:
                # Provisional value; overwritten below unless the row is skipped.
                df.at[label, 'sent_id'] = 0
            if is_first_row:
                df.at[label, 'sent_id'] = 1
                is_first_row = False
            else:
                try:
                    if token_id > previous_id:
                        df.at[label, 'sent_id'] = counter
                    elif isinstance(previous_id, str):
                        # IDs left as strings: leave the row unlabelled.
                        pass
                    else:
                        counter += 1
                        df.at[label, 'sent_id'] = counter
                except TypeError as e:
                    # Mixed str/int IDs cannot be ordered; report and carry on.
                    print(token_id, previous_id, e)
            previous_id = token_id
    else:
        raise ValueError("tb must be 'ud' or 'spmrl', got %r" % (tb,))
                
# Number the dev sentences in place (fills the 'sent_id' column of each frame).
sentence_id(spmrl_dev, 'spmrl')
sentence_id(ud_dev, 'ud')

In [None]:
# Number the train sentences in place (fills the 'sent_id' column of each frame).
sentence_id(spmrl_train, 'spmrl')
sentence_id(ud_train, 'ud')

## Inspection

In [None]:
ud_dev#[ud_dev['FORM'].str.contains('כל')]

In [None]:
spmrl_dev[spmrl_dev['sent_id'] == 3]#['FEATS'].unique()

In [None]:
spmrl_dev#[spmrl_dev['sent_id'] == 4539].head(60)#['FEATS'].unique()

In [None]:
spmrl_dev[spmrl_dev['XPOS']== "S_PRN"]['FEATS'].unique()

In [None]:
spmrl_train[(spmrl_train['XPOS'] == 'DTT') & ~(spmrl_train['FEATS'].str.contains('suf_gen', na=False) )]#['FEATS'].unique()

In [None]:
spmrl_train[spmrl_train['sent_id'] == 106]

In [None]:
ud_train[(ud_train['LEMMA'] == 'או') ]#['XPOS'].unique()

In [None]:
# NOTE(review): this rebinds ud_dev to a freshly loaded frame, so the 'sent_id'
# column added and filled in earlier cells is lost; later cells that filter on
# ud_dev['sent_id'] will fail unless the numbering is redone. The path also differs
# from filepath_ud_dev ('./data/ud-treebank/...') - confirm which file is intended.
ud_dev = suit_for_pandas('./data/he_htb-ud-dev.conllu')
# Redundant: suit_for_pandas already returns exactly these ten column names.
ud_dev.columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']

In [None]:
spmrl_dev[spmrl_dev['XPOS'].str.contains('NN_S_PP', na=False)]

In [None]:
ud_dev[(ud_dev['XPOS'].str.contains('BN', na=False)) & (ud_dev['FEATS'].str.contains('HIFIL', na=False))]

In [None]:
# Persist the SPMRL dev frame (including its 'sent_id' column) as CSV; the
# DataFrame index is written as the first column.
spmrl_dev.to_csv('./data/spmrl-treebank/numbered_dev.csv')

## Segmentation

In [None]:
# Start from an empty frame with the ten CoNLL columns plus 'sent_id'.
segmented_spmrl_df = pd.DataFrame(columns=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id'])

In [None]:
# Maps the '|'-joined suffix features of a token (its 'suf_*' entries, in the
# order they appear in FEATS) to the surface form of the matching pronoun.
# Every form carries a leading '_'. The segmentation code looks keys up by exact
# string, so a feature combination or ordering missing here raises KeyError.
pronouns = {
     'suf_gen=F|suf_gen=M|suf_num=P|suf_per=1': '_אנחנו',
     'suf_gen=F|suf_gen=M|suf_num=S|suf_per=1': '_אני',
     'suf_gen=M|suf_num=S|suf_per=2': '_אתה',
     'suf_gen=F|suf_num=S|suf_per=2': '_את',
     'suf_gen=M|suf_num=P|suf_per=2': '_אתם',
     'suf_gen=F|suf_num=P|suf_per=2': '_אתן',
     'suf_gen=F|suf_num=P|suf_per=3': '_הן',
     'suf_gen=F|suf_num=S|suf_per=3': '_היא',
     'suf_gen=M|suf_num=P|suf_per=3': '_הם',
     'suf_gen=M|suf_num=S|suf_per=3': '_הוא'
}

pronouns

In [None]:
# Segment the SPMRL dev frame into `segmented_spmrl_df`:
#   * a noun (UPOS 'NN') with suffix features becomes three rows: noun + 'של' + pronoun;
#   * an 'S_PRN' row turns the row emitted just before it into an ADP with Case=Gen
#     and becomes a PRON that inherits that row's former features;
#   * a 'DTT'/'DT' row with suffix features becomes NOUN + pronoun;
#   * an 'S_PRP' row gets reflexive-pronoun features appended;
#   * every other row is copied unchanged.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append no longer exists in pandas >= 2.0 and re-copies the frame on
# every call. Rebuilding from scratch also makes the cell safe to re-run.
_segmented_columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id']
_segmented_records = []


def _derived_spmrl_token(source_row, **overrides):
    """Copy the CoNLL fields of `source_row` and replace the given ones."""
    token = {col: source_row[col] for col in _segmented_columns}
    token.update(overrides)
    return token


for _, row in spmrl_dev.iterrows():
    feats = row['FEATS'].split("|")
    suffix_feats = "|".join(x for x in feats if 'suf' in x)
    noun_feats = "|".join(x for x in feats if 'suf' not in x)
    clean_suffix_feats = "|".join(x.replace("suf_", "") for x in feats if 'suf' in x)

    if 'suf_' in row['FEATS'] and row['UPOS'] == 'NN':
        _segmented_records.append(_derived_spmrl_token(
            row, FORM=row['LEMMA'] + '_', UPOS='NOUN', XPOS='NOUN',
            FEATS='Definite=Def|' + noun_feats))
        _segmented_records.append(_derived_spmrl_token(
            row, ID=0, FORM='_של_', LEMMA='של', UPOS='ADP', XPOS='ADP',
            FEATS='_', HEAD=int(row['ID']) + 2, DEPREL='case:gen'))
        _segmented_records.append(_derived_spmrl_token(
            row, ID=0, FORM=pronouns[suffix_feats], LEMMA='הוא', UPOS='PRON', XPOS='PRON',
            FEATS="Case=Gen|" + clean_suffix_feats + "|PronType=Prs",
            HEAD=int(row['ID']) + 2, DEPREL='nmod:poss'))
    elif row['XPOS'] == 'S_PRN':
        # The row to rewrite is the one emitted just before this one. Using the
        # source index minus one hits the wrong output row as soon as any
        # earlier token has been split into several rows.
        inherited_feats = 'PronType=Prs'
        if _segmented_records:
            _previous = _segmented_records[-1]
            _previous['XPOS'] = 'ADP'
            inherited_feats = _previous['FEATS'] + '|PronType=Prs'
            _previous['FEATS'] = 'Case=Gen'
        _segmented_records.append(_derived_spmrl_token(
            row, FORM=row['LEMMA'] + '_', UPOS='PRON', XPOS='PRON', FEATS=inherited_feats))
    elif row['XPOS'] == 'DTT' or row['XPOS'] == 'DT':
        if 'suf_' in row['FEATS']:
            _segmented_records.append(_derived_spmrl_token(row, UPOS='NOUN', XPOS='NOUN'))
            # NOTE(review): the pronoun forms already start with '_', so this FORM
            # begins with '__'; the noun branch above adds no extra '_' - confirm.
            _segmented_records.append(_derived_spmrl_token(
                row, ID=0, FORM="_" + pronouns[suffix_feats], LEMMA='הוא', UPOS='PRON', XPOS='PRON',
                FEATS="Case=Gen|" + clean_suffix_feats + "|PronType=Prs",
                HEAD=int(row['ID']) + 1, DEPREL='nmod:poss'))
        else:
            _segmented_records.append(row.to_dict())
    elif row['XPOS'] == 'S_PRP':
        _segmented_records.append(_derived_spmrl_token(
            row, FEATS=row['FEATS'] + "|PronType=Prs|Reflex=Yes"))
    else:
        _segmented_records.append(row.to_dict())

segmented_spmrl_df = pd.DataFrame(_segmented_records)
# Standard columns first; keep any extra source columns after them.
segmented_spmrl_df = segmented_spmrl_df.reindex(
    columns=_segmented_columns + [c for c in segmented_spmrl_df.columns if c not in _segmented_columns])

In [None]:
def segement_df(unsegmented_df):
    """Return a segmented copy of an SPMRL treebank frame.

    Rows are processed in order and rewritten as follows:

    * a noun (UPOS ``'NN'``) with ``suf_*`` features becomes three rows: the
      noun (FORM = lemma + ``'_'``, definite), the genitive marker ``'של'`` and
      a pronoun looked up in the module-level ``pronouns`` table;
    * an ``'S_PRN'`` row turns the row emitted just before it into an ADP with
      ``Case=Gen`` and becomes a PRON that inherits that row's former FEATS;
    * a ``'DTT'``/``'DT'`` row with ``suf_*`` features becomes a NOUN row plus
      a pronoun row; without suffix features it is copied unchanged;
    * an ``'S_PRP'`` row gets ``|PronType=Prs|Reflex=Yes`` appended to FEATS;
    * any other row is copied unchanged.

    Inserted rows have ID 0.

    Parameters
    ----------
    unsegmented_df : pandas.DataFrame
        Frame with the ten CoNLL columns plus 'sent_id'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a suffix-feature combination is missing from ``pronouns``.
    """
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id']

    def derived(source_row, **overrides):
        """Copy the CoNLL fields of `source_row` and replace the given ones."""
        token = {col: source_row[col] for col in columns}
        token.update(overrides)
        return token

    # Collect rows in a list and build the frame once: DataFrame.append no longer
    # exists in pandas >= 2.0 and re-copies the whole frame on every call.
    records = []
    for _, row in unsegmented_df.iterrows():
        feats = row['FEATS'].split("|")
        suffix_feats = "|".join(x for x in feats if 'suf' in x)
        noun_feats = "|".join(x for x in feats if 'suf' not in x)
        clean_suffix_feats = "|".join(x.replace("suf_", "") for x in feats if 'suf' in x)

        if 'suf_' in row['FEATS'] and row['UPOS'] == 'NN':
            records.append(derived(
                row, FORM=row['LEMMA'] + '_', UPOS='NOUN', XPOS='NOUN',
                FEATS='Definite=Def|' + noun_feats))
            records.append(derived(
                row, ID=0, FORM='_של_', LEMMA='של', UPOS='ADP', XPOS='ADP',
                FEATS='_', HEAD=int(row['ID']) + 2, DEPREL='case:gen'))
            records.append(derived(
                row, ID=0, FORM=pronouns[suffix_feats], LEMMA='הוא', UPOS='PRON', XPOS='PRON',
                FEATS="Case=Gen|" + clean_suffix_feats + "|PronType=Prs",
                HEAD=int(row['ID']) + 2, DEPREL='nmod:poss'))
        elif row['XPOS'] == 'S_PRN':
            # The row to rewrite is the one emitted just before this one. Using
            # the source index minus one hits the wrong output row as soon as
            # any earlier token has been split, or when the source index is not
            # a plain 0..n-1 range.
            inherited_feats = 'PronType=Prs'
            if records:
                previous = records[-1]
                previous['XPOS'] = 'ADP'
                inherited_feats = previous['FEATS'] + '|PronType=Prs'
                previous['FEATS'] = 'Case=Gen'
            records.append(derived(
                row, FORM=row['LEMMA'] + '_', UPOS='PRON', XPOS='PRON', FEATS=inherited_feats))
        elif row['XPOS'] == 'DTT' or row['XPOS'] == 'DT':
            if 'suf_' in row['FEATS']:
                records.append(derived(row, UPOS='NOUN', XPOS='NOUN'))
                # NOTE(review): the pronoun forms already start with '_', so this
                # FORM begins with '__'; the noun branch adds no extra '_' - confirm.
                records.append(derived(
                    row, ID=0, FORM="_" + pronouns[suffix_feats], LEMMA='הוא', UPOS='PRON', XPOS='PRON',
                    FEATS="Case=Gen|" + clean_suffix_feats + "|PronType=Prs",
                    HEAD=int(row['ID']) + 1, DEPREL='nmod:poss'))
            else:
                records.append(row.to_dict())
        elif row['XPOS'] == 'S_PRP':
            records.append(derived(row, FEATS=row['FEATS'] + "|PronType=Prs|Reflex=Yes"))
        else:
            records.append(row.to_dict())

    output_df = pd.DataFrame(records)
    # Standard columns first; keep any extra source columns after them.
    extra_columns = [c for c in output_df.columns if c not in columns]
    return output_df.reindex(columns=columns + extra_columns)

In [None]:
# Segment the SPMRL train split. segement_df takes a whole frame and returns a
# new one, so it is called once on the frame rather than once per row with an
# accumulator argument (which raised TypeError and discarded the result).
seg_spmrl_train = segement_df(spmrl_train)

In [None]:
# Segmented copy of the SPMRL dev split; this is the frame the conversion cells operate on.
seg_spmrl_df = segement_df(spmrl_dev)

In [None]:
seg_spmrl_df

In [None]:
segmented_spmrl_df[segmented_spmrl_df['sent_id'] == 5]#['FEATS'].unique()

In [None]:
# NOTE(review): ud_dev_no_comment is only assigned in a later cell (under
# "Alignment"), so this cell raises NameError on a fresh Restart & Run All.
len(ud_dev_no_comment)

In [None]:
len(segmented_spmrl_df)

## Alignment

In [None]:
ud_dev[ud_dev['ID'].str.contains("#")]

In [None]:
segmented_spmrl_df

In [None]:
# Keep only rows whose ID contains neither '#' nor '-', then renumber the index
# from 0. This removes comment rows and, presumably, multi-word token range rows
# such as '1-2' (confirm against the UD file).
ud_dev_no_comment = ud_dev[~ud_dev['ID'].str.contains('#|-')].reset_index(drop=True)

In [None]:
# Walk the segmented SPMRL frame and the comment-free UD frame position by
# position and print every pair whose surface forms differ, ignoring UD forms
# that contain '_'.
# The test reads segmented_spmrl_df, the frame that is printed, rather than
# unsegmented spmrl_dev, whose rows stop lining up with UD after the first split
# token. Only the common prefix is compared, so a length mismatch no longer
# raises KeyError.
_n_aligned = min(len(segmented_spmrl_df), len(ud_dev_no_comment))
for i in range(_n_aligned):
    _spmrl_row = segmented_spmrl_df.iloc[i]
    _ud_row = ud_dev_no_comment.iloc[i]
    if _spmrl_row['FORM'] != _ud_row['FORM'] and '_' not in _ud_row['FORM']:
        print(_spmrl_row[['ID', 'FORM', 'sent_id']], _ud_row[['ID', 'FORM', 'sent_id']])

In [None]:
ud_dev_no_comment

In [None]:
ud_dev[ud_dev['sent_id'] == 4]

In [None]:
segmented_spmrl_df[(segmented_spmrl_df['ID'] == 0) & (segmented_spmrl_df['MISC'] == '_')]

## Conversion Inspection

In [None]:
segmented_spmrl_df[segmented_spmrl_df['FEATS'].str.contains('gen=', na=False)]['XPOS'].unique()

## Conversion

In [None]:
def simple_features_conversion(column, conversions):
    """Rewrite a feature string by plain substring replacement.

    Each key of ``conversions`` is replaced by its value, one key after the
    other in the mapping's iteration order. A later replacement therefore sees
    the result of the earlier ones.

    Parameters
    ----------
    column : str
        The feature string to convert (a single FEATS cell).
    conversions : dict
        Maps an old substring to its replacement.

    Returns
    -------
    str
        The converted feature string.
    """
    converted = column
    for old in conversions:
        converted = converted.replace(old, conversions[old])
    return converted
    
# SPMRL feature fragment -> UD feature fragment.
# Order matters because replacements are applied one after another as plain
# substring replacements: the combined 'gen=F|gen=M' must come before the single
# genders, and 'per=A' before the generic 'per=' prefix.
# Being substring-based, the mapping also rewrites prefixed features, e.g.
# 'suf_gen=F' becomes 'suf_Gender=Fem'.
basic_features = {'gen=F|gen=M': 'Gender=Fem,Masc', 'gen=F': 'Gender=Fem', 'gen=M':'Gender=Masc',
               'num=S':'Number=Sing',  'num=P': 'Number=Plur',
                'per=A': 'Person=1,2,3', 'per=': 'Person=', 
                'tense=BEINONI': 'VerbForm=Part', 'tense=TOINFINITIVE': 'VerbForm=Inf', 'tense=IMPERATIVE': 'Mood=Imp',
                'tense=PAST': 'Tense=Past', 'tense=FUTURE': 'Tense=Fut'
               }

# Convert the FEATS column of the segmented dev frame in place.
seg_spmrl_df.loc[:, 'FEATS']  = seg_spmrl_df['FEATS'].apply(lambda x: simple_features_conversion(x, basic_features))

In [None]:
seg_spmrl_df#[seg_spmrl_df['FORM'] == 'היה']#['FEATS'].unique()

In [None]:
ud_train[(ud_train['FEATS'].str.contains('HebExistential=True'))]# & (ud_train['XPOS'].str.contains('VERB'))]['FEATS'].unique()

In [None]:
def pos_conversion(column, conversions):
    """Map a single POS tag through a conversion table.

    Parameters
    ----------
    column : str
        The tag to convert (a single XPOS cell).
    conversions : dict
        Maps an old tag to its new tag.

    Returns
    -------
    str
        The mapped tag, or ``column`` unchanged when it has no entry.
    """
    return conversions.get(column, column)

# One-to-one tag replacements; tags not listed here are passed through unchanged.
basic_pos = {
        'REL': 'SCONJ', 'COM': 'SCONJ', 'CONJ': 'CCONJ', 'IN': 'ADP', 'NNP': 'PROPN', 'PREPOSITION': 'ADP',
        'yyCLN':'PUNCT', 'yyCM':'PUNCT', 'yyDASH':'PUNCT', 'yyDOT':'PUNCT', 'yyELPS':'PUNCT', 
        'yyEXCL':'PUNCT', 'yyLRB':'PUNCT', 'yyQM':'PUNCT' 
}
# UPOS is derived from the XPOS column, overwriting whatever UPOS held before.
seg_spmrl_df.loc[:, 'UPOS']  = seg_spmrl_df['XPOS'].apply(lambda x: pos_conversion(x, basic_pos))    

In [None]:
def pos_convert_entire_line(row, conversions):
    """Convert UPOS, DEPREL and FEATS of one token from a rule keyed by its XPOS.

    Each rule in ``conversions`` is a dict with three entries:

    * ``'pos'``: the new UPOS;
    * ``'deprel'``: the new DEPREL, or the literal ``'deprel'`` to keep the
      row's own DEPREL;
    * ``'feats'``: the literal ``'feats'`` to keep the row's own FEATS, or a
      dict ``{'old': mode, 'new': value}`` where ``mode`` is

      - ``'_'``: replace FEATS with ``value``;
      - ``'feats+'``: append ``value`` to FEATS;
      - ``'+feats'``: prepend ``value`` to FEATS;
      - ``'+feats+'``: ``value`` is a pair, ``value[0] + FEATS + value[1]``.

    Parameters
    ----------
    row : pandas.Series or dict
        One token; 'XPOS', 'DEPREL' and 'FEATS' are read, plus 'UPOS' if present.
    conversions : dict
        Rule table keyed by XPOS.

    Returns
    -------
    pandas.Series
        ``[upos, deprel, feats]``. A row whose XPOS has no rule is returned
        unchanged: its current UPOS (XPOS if there is no UPOS), DEPREL and FEATS.

    Raises
    ------
    ValueError
        If a rule uses an unknown feats mode.
    """
    xpos = row['XPOS']
    # Look the tag up in the table that was passed in, not in a module-level global.
    if xpos not in conversions:
        # Return the existing UPOS rather than raw XPOS, so a UPOS mapping
        # applied to the frame earlier is not undone for tags without a rule.
        return pd.Series([row.get('UPOS', xpos), row['DEPREL'], row['FEATS']])

    rule = conversions[xpos]
    upos = rule['pos']
    deprel = row['DEPREL'] if rule['deprel'] == 'deprel' else rule['deprel']

    feats_rule = rule['feats']
    if feats_rule == 'feats':
        feats = row['FEATS']
    else:
        mode, new = feats_rule['old'], feats_rule['new']
        if mode == '_':
            feats = new
        elif mode == 'feats+':
            feats = row['FEATS'] + new
        elif mode == '+feats':
            feats = new + row['FEATS']
        elif mode == '+feats+':
            feats = new[0] + row['FEATS'] + new[1]
        else:
            # Previously this fell through and failed with an unbound `feats`.
            raise ValueError("unknown feats mode %r in the rule for XPOS %r" % (mode, xpos))
    return pd.Series([upos, deprel, feats])
    

# Per-XPOS rules consumed by pos_convert_entire_line:
#   'pos'    - the new UPOS;
#   'deprel' - the new DEPREL, or the literal 'deprel' to keep the row's own value;
#   'feats'  - the literal 'feats' to keep FEATS, or {'old': mode, 'new': value} where
#              mode '_' replaces FEATS, 'feats+' appends, '+feats' prepends and
#              '+feats+' wraps FEATS between value[0] and value[1].
entire_line_pos_conversion = {
    'AT': {'pos': 'ADP', 'deprel': 'case:acc', 'feats': {'old': '_', 'new': 'Case=Acc'}},
    'BN': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbForm=Part"}},
    'BNT': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '+feats+', 'new': ['Definite=Cons|', '|VerbForm=Part']}},
    'CD': {'pos': 'NUM', 'deprel': 'deprel', 'feats': 'feats'},
    'CDT': {'pos': 'NUM', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': "Definite=Cons|"}},
    'COP': {'pos': 'AUX', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbType=Cop|VerbForm=Part"}},
    'DEF': {'pos': 'DET', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'PronType=Art'}},
    'EX': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'HebExistential=True'}},


}

# Apply the rules row by row and write the three resulting values back to the frame.
seg_spmrl_df[['UPOS', 'DEPREL', 'FEATS']] = seg_spmrl_df.apply(lambda x: pos_convert_entire_line(x, entire_line_pos_conversion), axis=1)