In [None]:
!tar -xvf ./data/hebtb.tar.gz

In [None]:
!pwd

In [45]:
import pandas as pd
import numpy as np
import csv
from tqdm import trange, tqdm

## Files 

In [2]:
filepath_spmrl_dev = './data/spmrl-treebank/dev_hebtb-gold.conll'
filepath_spmrl_train = './data/spmrl-treebank/train_hebtb-gold.conll'
filepath_spmrl_test = './data/spmrl-treebank/test_hebtb-gold.conll'

filepath_ud_dev = './data/ud-treebank/he_htb-ud-dev.conllu'
filepath_ud_train = './data/ud-treebank/he_htb-ud-train.conllu'
filepath_ud_test = './data/ud-treebank/he_htb-ud-test.conllu'


## DF Preparation

In [3]:
def create_df_from_conll_file(filepath):
    treebank = []
    columns = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, names=columns, na_filter=False, quoting=csv.QUOTE_NONE)
    except:
        with open(filepath, 'r') as source:
            for line in source.readlines():
                if len(line.split('\t')) == 10:
                    treebank.append(tuple(line.strip().split('\t')))
                elif len(line.split('\t')) == 1:
                    treebank.append((line.strip(), '', '', '', '', '', '', '', '', ''))
            df = pd.DataFrame(data=treebank, columns=columns)
    return df

In [86]:
ud_dev = create_df_from_conll_file(filepath_ud_dev)
spmrl_dev = create_df_from_conll_file(filepath_spmrl_dev)

In [5]:
ud_train = create_df_from_conll_file(filepath_ud_train)
spmrl_train = create_df_from_conll_file(filepath_spmrl_train)

In [6]:
ud_test = create_df_from_conll_file(filepath_ud_test)
spmrl_test = create_df_from_conll_file(filepath_spmrl_test)

In [87]:
ud_dev['sent_id'] = ''
spmrl_dev['sent_id'] = ''
spmrl_dev['ID'] = spmrl_dev['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)


In [8]:
ud_train['sent_id'] = ''
spmrl_train['sent_id'] = ''
spmrl_train['ID'] = spmrl_train['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)


In [9]:
ud_test['sent_id'] = ''
spmrl_test['sent_id'] = ''
spmrl_test['ID'] = spmrl_test['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)

In [10]:
spmrl_treebank = pd.concat([spmrl_dev, spmrl_train, spmrl_test], ignore_index=True)
# spmrl_treebank['sent_id'] = ''
# spmrl_treebank['ID'] = spmrl_treebank['ID'].apply(lambda x: int(x) if str(x).isdigit() else 0)

In [46]:
def sentence_id(df, tb):
    if tb=='ud':
        sent_id = 0
        for i, row in df.iterrows():
            if '# sent_id' in row['ID']:
                sent_id += 1
            else:
                df.at[i, 'sent_id'] = sent_id
    elif tb=='spmrl':
        sent_id = 1
        for i, row in tqdm(df.iterrows(), total=df.shape[0]):
            if row['ID'] == 0:
                df.at[i, 'sent_id'] = 0 
            try:
                if df.loc[i]['ID'] > df.loc[i-1]['ID']:
                    df.at[i, 'sent_id'] = sent_id
                elif type(df.loc[i-1]['ID']) == str:
                    continue
                else:
                    sent_id += 1
                    df.at[i, 'sent_id'] = sent_id
            except KeyError as e:
                df.at[i, 'sent_id'] = 1
            except TypeError as e:
                print(df.loc[i]['ID'],df.loc[i-1]['ID'], e)
            except ValueError as e:
                print(df.loc[i]['ID'] > df.loc[i-1]['ID'], e)

sentence_id(ud_dev, 'ud')                
sentence_id(spmrl_dev, 'spmrl')


100%|██████████| 11301/11301 [00:03<00:00, 2918.39it/s]


In [None]:
sentence_id(spmrl_treebank, 'spmrl')

In [None]:
spmrl_treebank

In [None]:
sentence_id(spmrl_train, 'spmrl')
sentence_id(ud_train, 'ud')

### Inspection

In [35]:
# gives an example sentence for each POS/relation
sentences = []
for rel in spmrl_treebank['DEPREL'].unique():
        enum = 0
        sentence_no = spmrl_treebank[spmrl_treebank['DEPREL'] == rel]['sent_id'].unique()[enum]
        while sentence_no in sentences:
            enum += 1
            sentence_no = spmrl_treebank[spmrl_treebank['DEPREL'] == rel]['sent_id'].unique()[enum]
    sentence = spmrl_treebank[spmrl_treebank['sent_id'] == sentence_no]
    word = sentence[sentence['DEPREL'] == rel]['FORM'].unique()[0]
    forms = " ".join([row['FORM'] for i, row in sentence.iterrows()])
    sentences.append((sentence_no))
#     print(tag, word, forms)
    print("{:10}{:10}sentence no:{}: {:10}".format(rel, word, sentence_no, forms))
#     print(word)
#     print(sentence_no, ": ", forms)

dicourse  ש         sentence no:2: תופעה זו התבררה אתמול ב וועדת ה עבודה ו ה רווחה של ה כנסת , ש דנה ב נושא העסקת עובדים זרים .


In [30]:
spmrl_treebank[spmrl_treebank['XPOS'] == 'POS']['FEATS'].

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id
31,12,של,של,POS,POS,_,6,posspmod,_,_,2
64,22,של,של,POS,POS,_,21,posspmod,_,_,3
137,27,של,של,POS,POS,_,26,gen,_,_,5
518,17,של,של,POS,POS,_,16,gen,_,_,19
634,2,של,של,POS,POS,_,1,gen,_,_,24
690,26,של,של,POS,POS,_,25,gen,_,_,25
891,5,של,של,POS,POS,_,2,posspmod,_,_,31
1002,8,של,של,POS,POS,_,7,gen,_,_,36
1229,16,של,של,POS,POS,_,14,posspmod,_,_,48
1256,14,של,של,POS,POS,_,11,posspmod,_,_,49


In [29]:
ud_dev[ud_dev['sent_id'] == 56]

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id
2010,# text = אני רוצה שהילד שלי יסתכל בעיניים של ה...,,,,,,,,,,56
2011,1,אני,הוא,PRON,PRON,"Gender=Fem,Masc|Number=Sing|Person=1|PronType=Prs",2,dep,_,_,56
2012,2,רוצה,רצה,VERB,VERB,Gender=Masc|HebBinyan=PAAL|Number=Sing|Person=...,0,root,_,_,56
2013,3-5,שהילד,_,_,_,_,_,_,_,_,56
2014,3,ש,ש,SCONJ,SCONJ,_,8,mark,_,_,56
2015,4,ה,ה,DET,DET,PronType=Art,5,det:def,_,_,56
2016,5,ילד,ילד,NOUN,NOUN,Gender=Masc|Number=Sing,8,nsubj,_,_,56
2017,6-7,שלי,_,_,_,_,_,_,_,_,56
2018,6,של_,של,ADP,ADP,Case=Gen,7,case:gen,_,_,56
2019,7,_אני,הוא,PRON,PRON,"Gender=Fem,Masc|Number=Sing|Person=1|PronType=Prs",5,nmod:poss,_,_,56


In [26]:
spmrl_dev.loc[(spmrl_dev[spmrl_dev['XPOS'] == 'S_PRN'].index-1)]#['DEPREL'].unique()

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id
73,31,ל,ל,IN,IN,_,30,prepmod,_,_,3
236,26,ל,ל,IN,IN,_,25,prepmod,_,_,8
315,11,לפי,לפי,IN,IN,_,13,prepmod,_,_,11
467,6,ל,ל,IN,IN,_,5,prepmod,_,_,17
727,14,אלי,אל,IN,IN,_,16,prepmod,_,_,26
912,26,עלי,על,IN,IN,_,23,conj,_,_,31
915,29,עלי,על,IN,IN,_,28,prepmod,_,_,31
924,7,ל,ל,IN,IN,_,6,prepmod,_,_,32
970,33,מ,מן,IN,IN,_,32,prepmod,_,_,34
1055,9,ב,ב,IN,IN,_,11,prepmod,_,_,38


In [39]:
ud_dev.loc[(ud_dev[ud_dev['FORM'].isin(['_של_', 'את_', 'ל_'])].index+1)]['XPOS'].unique()

array(['PRON'], dtype=object)

In [51]:
ud_dev[(ud_dev['XPOS'] == 'ADP') & ~(ud_dev['FEATS'].str.contains('Case=Gen', na=False))]

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id
6,4,מ,מ,ADP,ADP,_,5,case,_,_,1
9,6,ל,ל,ADP,ADP,_,7,case,_,_,1
16,11,כ,כ,ADP,ADP,_,12,case,_,_,1
33,5,ב,ב,ADP,ADP,_,6,case,_,_,2
51,18,ב,ב,ADP,ADP,_,19,case,_,_,2
78,18,ל,ל,ADP,ADP,_,19,case,_,_,3
80,20,על,על,ADP,ADP,_,21,case,_,_,3
85,24,מ,מ,ADP,ADP,_,25,case,_,_,3
89,28,כדי,כדי,ADP,ADP,_,30,case,_,_,3
93,31,ל_,ל,ADP,ADP,_,32,case,_,_,3


### Segmentation

In [None]:
segmented_spmrl_df = pd.DataFrame(columns=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id'])

In [43]:
pronouns = {
     'suf_gen=F|suf_gen=M|suf_num=P|suf_per=1': '_אנחנו',
     'suf_gen=F|suf_gen=M|suf_num=S|suf_per=1': '_אני',
     'suf_gen=M|suf_num=S|suf_per=1': '_אני', # this is a mistake in sentence 899
     'suf_gen=M|suf_num=S|suf_per=2': '_אתה',
     'suf_gen=F|suf_num=S|suf_per=2': '_את',
     'suf_gen=M|suf_num=P|suf_per=2': '_אתם',
     'suf_gen=F|suf_num=P|suf_per=2': '_אתן',
     'suf_gen=F|suf_num=P|suf_per=3': '_הן',
     'suf_gen=F|suf_num=S|suf_per=3': '_היא',
     'suf_gen=M|suf_num=P|suf_per=3': '_הם',
     'suf_gen=M|suf_num=S|suf_per=3': '_הוא',
    'suf_gen=M|suf_num=S|per=3': '_הוא' # mistake at sentence 2348
}


In [None]:
def re_segment_df(df):
    for i, row in df.iterrows():
        

In [47]:
def segment_df(unsegmented_df):
    output_df = pd.DataFrame(columns=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 
                                      'sent_id'])
    for i, row in tqdm(unsegmented_df.iterrows(), total=unsegmented_df.shape[0]):
        try:
            suffix_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' in x])
            noun_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' not in x])
            clean_suffix_feats = "|".join([x.replace("suf_", "") for x in row['FEATS'].split("|") if 'suf' in x])
            if 'suf_' in row['FEATS'] and row['UPOS'] == 'NN':
        #     if row['XPOS'] == 'NN_S_PP' or row['XPOS'] == 'S_PP':
                output_df = output_df.append({'ID': row['ID'], 'FORM': row['LEMMA'] + '_', 'LEMMA': row['LEMMA'],  
                                              'UPOS': 'NOUN', 'XPOS': 'NOUN','FEATS': 'Definite=Def|' + noun_feats + '|xx_UD=Seg', 
                                              'HEAD': row['HEAD'], 'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 
                                              'MISC': row['MISC'], 'sent_id': row['sent_id']}, ignore_index=True)

                output_df = output_df.append({'ID': 0, 'FORM': '_של_', 'LEMMA': 'של',  'UPOS': 'ADP', 
                                              'XPOS': 'ADP','FEATS': '_'  + '|xx_UD=Seg', 'HEAD': int(row['ID']) + 2, 
                                              'DEPREL': 'case:gen', 'DEPS': row['DEPS'], 'MISC': row['MISC'],
                                              'sent_id': row['sent_id']}, ignore_index=True)

                output_df = output_df.append({'ID': 0, 'FORM': pronouns[suffix_feats], 'LEMMA': 'הוא',  'UPOS': 'PRON', 
                                              'XPOS': 'PRON','FEATS': "Case=Gen|" + clean_suffix_feats + "|PronType=Prs"+'|xx_UD=Seg', 
                                              'HEAD': int(row['ID']) + 2, 'DEPREL': 'nmod:poss', 'DEPS': row['DEPS'], 
                                              'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)

            elif row['XPOS'] == 'DTT' or row['XPOS'] == 'DT':
                if 'suf_' in row['FEATS']:
                    output_df = output_df.append({'ID': row['ID'], 'FORM': row['FORM'], 'LEMMA': row['LEMMA'],  'UPOS': 'NOUN', 
                                                'XPOS': 'NOUN','FEATS': row['FEATS'], 'HEAD': row['HEAD'], 
                                                'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 'MISC': row['MISC'],
                                              'sent_id': row['sent_id']}, ignore_index=True)

                    output_df = output_df.append({'ID': 0, 'FORM': "_" + pronouns[suffix_feats], 'LEMMA': 'הוא',  'UPOS': 'PRON',
                                                  'XPOS': 'PRON','FEATS': "Case=Gen|"+clean_suffix_feats + "|PronType=Prs",
                                                  'HEAD': int(row['ID']) + 1, 'DEPREL': 'nmod:poss', 'DEPS': row['DEPS'], 
                                                  'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)
                else:
                    output_df = output_df.append(row, ignore_index=True)            
            else:
                output_df = output_df.append(row, ignore_index=True)
        except KeyError as e:
            print(row)
    return output_df

In [None]:
def re_segment_df(df, segmentations):
    for i, row in df.iterrows():
        suffix_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' in x])
        noun_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' not in x])
        clean_suffix_feats = "|".join([x.replace("suf_", "") for x in row['FEATS'].split("|") if 'suf' in x])
        if row['XPOS'] == 'NN':
            df.loc[i+0.5] =  ['0', 'שובל', 'שובל', 'NNP', 'NNP', 'gen=M|num=P|per=A', '10', 'obj', '_', '_', 'from file']
    df.index = df.index + 1
    df = df.sort_index()
    return df


#             suffix_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' in x])
#             noun_feats = "|".join([x for x in row['FEATS'].split("|") if 'suf' not in x])
#             clean_suffix_feats = "|".join([x.replace("suf_", "") for x in row['FEATS'].split("|") if 'suf' in x])
#             if 'suf_' in row['FEATS'] and row['UPOS'] == 'NN':
#         #     if row['XPOS'] == 'NN_S_PP' or row['XPOS'] == 'S_PP':
#                 output_df = output_df.append({'ID': row['ID'], 'FORM': row['LEMMA'] + '_', 'LEMMA': row['LEMMA'],  
#                                               'UPOS': 'NOUN', 'XPOS': 'NOUN','FEATS': 'Definite=Def|' + noun_feats + '|xx_UD=Seg', 
#                                               'HEAD': row['HEAD'], 'DEPREL': row['DEPREL'], 'DEPS': row['DEPS'], 
#                                               'MISC': row['MISC'], 'sent_id': row['sent_id']}, ignore_index=True)

#                 output_df = output_df.append({'ID': 0, 'FORM': '_של_', 'LEMMA': 'של',  'UPOS': 'ADP', 
#                                               'XPOS': 'ADP','FEATS': '_'  + '|xx_UD=Seg', 'HEAD': int(row['ID']) + 2, 
#                                               'DEPREL': 'case:gen', 'DEPS': row['DEPS'], 'MISC': row['MISC'],
#                                               'sent_id': row['sent_id']}, ignore_index=True)

#                 output_df = output_df.append({'ID': 0, 'FORM': pronouns[suffix_feats], 'LEMMA': 'הוא',  'UPOS': 'PRON', 
#                                               'XPOS': 'PRON','FEATS': "Case=Gen|" + clean_suffix_feats + "|PronType=Prs"+'|xx_UD=Seg, 
#                                               'HEAD': int(row['ID']) + 2, 'DEPREL': 'nmod:poss', 'DEPS': row['DEPS'], 
#                                               'MISC': row['MISC'],'sent_id': row['sent_id']}, ignore_index=True)


In [None]:
seg_spmrl_train = pd.DataFrame(columns=['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 'sent_id'])
for i, row in spmrl_train.iterrows():
    segement_df(row, seg_spmrl_train)

In [88]:
seg_spmrl_df = segment_df(spmrl_dev)


  0%|          | 0/11301 [00:00<?, ?it/s][A
  0%|          | 40/11301 [00:00<00:28, 391.76it/s][A
  1%|          | 85/11301 [00:00<00:27, 406.56it/s][A
  1%|          | 130/11301 [00:00<00:26, 416.24it/s][A
  2%|▏         | 171/11301 [00:00<00:26, 412.81it/s][A
  2%|▏         | 213/11301 [00:00<00:26, 413.22it/s][A
  2%|▏         | 256/11301 [00:00<00:26, 417.12it/s][A
  3%|▎         | 298/11301 [00:00<00:26, 416.19it/s][A
  3%|▎         | 338/11301 [00:00<00:26, 408.60it/s][A
  3%|▎         | 379/11301 [00:00<00:26, 408.04it/s][A
  4%|▎         | 419/11301 [00:01<00:27, 402.04it/s][A
  4%|▍         | 459/11301 [00:01<00:27, 398.07it/s][A
  4%|▍         | 499/11301 [00:01<00:27, 390.94it/s][A
  5%|▍         | 538/11301 [00:01<00:28, 382.11it/s][A
  5%|▌         | 579/11301 [00:01<00:27, 387.69it/s][A
  5%|▌         | 618/11301 [00:01<00:27, 385.63it/s][A
  6%|▌         | 657/11301 [00:01<00:28, 378.26it/s][A
  6%|▌         | 695/11301 [00:01<00:28, 371.40it/s][A
  6%

KeyboardInterrupt: 

In [None]:
seg_spmrl_treebank = segment_df(spmrl_treebank)

## Conversion Inspection

In [None]:
seg_spmrl_df[seg_spmrl_df['FEATS'].str.contains('suf', na=False)]['XPOS'].unique()

In [49]:
seg_spmrl_df

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id
0,1,עשרות,עשר,CDT,CDT,gen=F|num=P,2,num,_,_,1
1,2,אנשים,איש,NN,NN,gen=M|num=P,3,subj,_,_,1
2,3,מגיעים,הגיע,BN,BN,gen=M|num=P|per=A,0,ROOT,_,_,1
3,4,מ,מ,PREPOSITION,PREPOSITION,_,3,prepmod,_,_,1
4,5,תאילנד,תאילנד,NNP,NNP,_,4,pobj,_,_,1
5,6,ל,ל,PREPOSITION,PREPOSITION,_,3,prepmod,_,_,1
6,7,ישראל,ישראל,NNP,NNP,_,6,pobj,_,_,1
7,8,כש,כש,TEMP,TEMP,_,3,comp,_,_,1
8,9,הם,הוא,PRP,PRP,gen=M|num=P|per=3,10,subj,_,_,1
9,10,נרשמים,נרשם,BN,BN,gen=M|num=P|per=A,14,conj,_,_,1


In [89]:
spmrl_dev[spmrl_dev['sent_id'] == 50]#['FEATS'].unique()

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC,sent_id



 22%|██▏       | 2536/11301 [00:20<00:30, 286.95it/s][A

In [None]:
spmrl_train[spmrl_train['sent_id'] == 3029]#['XPOS'].unique()

In [None]:
seg_spmrl_df[(seg_spmrl_df['DEPREL'] == 'posspmod') & (seg_spmrl_df['XPOS'] != 'POS')]#['DEPREL'].unique()

In [None]:
spmrl_train[spmrl_train['FORM'].str.contains('לזרז')]

In [None]:
seg_spmrl_df[seg_spmrl_df['FEATS'].str.contains('_\|') ]

## Conversion

In [84]:
def change_according_to_next_row(row):
    feats = row['FEATS']
    upos = row['XPOS']
    form = row['FORM']
    try:
        next_row = row.name + 1
        if seg_spmrl_df.at[next_row, 'XPOS'] == 'S_PRN':
            feats = 'Case=Gen'
            form = form + '_'
            upos = 'ADP'
        else:
            return pd.Series([form,upos,feats])
    except:
        return pd.Series([form,upos,feats])
    return pd.Series([form,upos,feats])
            
seg_spmrl_df.loc[:, ['FORM', 'UPOS', 'FEATS']] = seg_spmrl_df.apply(lambda x: change_according_to_next_row(x), axis=1)

In [73]:
# gender
def simple_features_conversion(column, conversions):
    for old,new in conversions.items():
        column = column.replace(old,new)

    return column
    
basic_features = {'suf_': '', 'gen=F|gen=M': 'Gender=Fem,Masc', 'gen=F': 'Gender=Fem', 'gen=M':'Gender=Masc',
               'num=S':'Number=Sing',  'num=P': 'Number=Plur',
                'per=A': 'Person=1,2,3', 'per=': 'Person=', 
                'tense=BEINONI': 'VerbForm=Part', 'tense=TOINFINITIVE': 'VerbForm=Inf', 'tense=IMPERATIVE': 'Mood=Imp',
                'tense=PAST': 'Tense=Past', 'tense=FUTURE': 'Tense=Fut'
               }

seg_spmrl_df.loc[:, 'FEATS']  = seg_spmrl_df['FEATS'].apply(lambda x: simple_features_conversion(x, basic_features))

In [74]:
def pos_conversion(column, conversions):
    if column in conversions:
        return conversions[column]
    else:
        return column

basic_pos = {
         'IN': 'ADP', 'NNP': 'PROPN', 'JJ':'ADJ', 'NN': 'NOUN', 'VB': 'VERB', 'RB': 'ADV', 'NCD': 'NUM','NEG': 'ADV',
        'PREPOSITION': 'ADP', 'REL': 'SCONJ', 'COM': 'SCONJ', 'CONJ': 'CCONJ','POS': 'ADP', 'PRP': 'PRON',
        'yyCLN': 'PUNCT', 'yyCM': 'PUNCT', 'yyDASH': 'PUNCT', 'yyDOT': 'PUNCT', 'yyELPS': 'PUNCT', 'yyEXCL': 'PUNCT',
        'yyLRB': 'PUNCT', 'yyQM': 'PUNCT', 'yyQUOT': 'PUNCT', 'yyRRB': 'PUNCT', 'yySCLN': 'PUNCT', 'ZVL': 'X'
}
seg_spmrl_df.loc[:, 'UPOS']  = seg_spmrl_df['UPOS'].apply(lambda x: pos_conversion(x, basic_pos))    

In [75]:
def pos_convert_entire_line(row, conversions):
    xpos = row['XPOS']
    form = row['FORM']
#     if xpos in conversions:
    if xpos in conversions:
        if 'concat' in xpos:
            if xpos['concat'] == 'before':
                form = '_' + form
            elif xpos['concat'] == 'after':
                form += '_'
            else:
                form = '_' + form + '_'
        upos = conversions[xpos]['pos']
        if conversions[xpos]['deprel'] == 'deprel':
            deprel = row['DEPREL']
        else:
            deprel = conversions[xpos]['deprel']
        if conversions[xpos]['feats'] == 'feats':
            feats = row['FEATS']
        elif conversions[xpos]['feats']['old'] == '_':
            feats = conversions[xpos]['feats']['new']
        elif conversions[xpos]['feats']['old'] == 'feats+':
            if len(row['FEATS']) > 2:
                feats = row['FEATS'] + conversions[xpos]['feats']['new']
            else:
                feats = conversions[xpos]['feats']['new'][1:]
        elif conversions[xpos]['feats']['old'] == '+feats':
            feats = conversions[xpos]['feats']['new'] + row['FEATS']
        elif conversions[xpos]['feats']['old'] == '+feats+':
            feats = conversions[xpos]['feats']['new'][0] + row['FEATS'] + conversions[xpos]['feats']['new'][1]
        return pd.Series([upos, deprel, feats, form])
    else:
        return pd.Series([row['FORM'], row['UPOS'], row['DEPREL'], row['FEATS']])
    

entire_line_pos_conversion = {
    'AT': {'pos': 'ADP', 'deprel': 'case:acc', 'feats': {'old': '_', 'new': 'Case=Acc'}},
    'BN': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbForm=Part"}},
    'BNT': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '+feats+', 'new': ['Definite=Cons|', '|VerbForm=Part']}},
    'CD': {'pos': 'NUM', 'deprel': 'deprel', 'feats': 'feats'},
    'CDT': {'pos': 'NUM', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': "Definite=Cons|"}},
    'NNT': {'pos': 'NOUN', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': "Definite=Cons|"}},
    'COP': {'pos': 'AUX', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': "|VerbType=Cop|VerbForm=Part"}},
    'DEF': {'pos': 'DET', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'PronType=Art'}, 'concat': 'after'},
    'EX': {'pos': 'VERB', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'HebExistential=True'}},
    'P': {'pos': 'ADV', 'deprel': 'compound:affix', 'feats': {'old': '_', 'new': 'Prefix=True'}},
    'DUMMY_AT': {'pos': 'ADP', 'deprel': 'case:acc', 'feats': {'old': '_', 'new': 'Case=Acc'}},
    'JJT': {'pos': 'ADJ', 'deprel': 'deprel', 'feats': {'old': '+feats', 'new': 'Definite=Cons|'}},
    'MD': {'pos': 'AUX', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': '|VerbType=Mod'}},
    'QW': {'pos': 'ADV', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'PronType=Int'}},
    'TEMP': {'pos': 'SCONJ', 'deprel': 'mark', 'feats': {'old': '_', 'new': 'Case=Tem'}},
    'DTT': {'pos': 'DET', 'deprel': 'deprel', 'feats': {'old': '_', 'new': 'Definite=Cons'}},
    'S_ANP': {'pos': 'PRON', 'deprel': 'deprel', 'feats': {'old': '+feats+', 'new': ['Case=Acc|', '|PronType=Prs']}},
    'S_PRP': {'pos': 'PRON', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': '|PronType=Prs|Reflex=Yes'}},
    'S_PRN': {'pos': 'PRON', 'deprel': 'deprel', 'feats': {'old': 'feats+', 'new': '|PronType=Prs'}}
}

seg_spmrl_df[['FORM', 'UPOS', 'DEPREL', 'FEATS']] = seg_spmrl_df.apply(lambda x: pos_convert_entire_line(x, entire_line_pos_conversion), axis=1)

missing:
ADVERB - lexical?
CC - lexical decision?
DT/DTT - Already handled in segmentation
NN_S_PP -  Already handled in segmentation
S_PP - Already handled in segmentation
S_PRP - Already handled in segmentation
S_PRN - Already handled in segmentation
DEf - typo in spmrl tb
INTJ - doesn't change

NEG
S_ANP - Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs


POS - interaction DEPREL/POS
PRP - interaction DEPREL/POS

==========================================
changed:
AT, BN, BNT, CD, CDT, NNT, COP, DEF, EX, P, DUMMY_AT, JJT. MD. QW. TEMP
'IN', 'NNP', 'JJ', 'NN', 'VB'
'PREPOSITION', 'REL', 'COM', 'CONJ','POS', 'PRP'
'yyCLN', 'yyCM': 'yyDASH' 'yyDOT' 'yyELPS', 'yyEXCL', 'yyLRB', 'yyQM' 'ZVL'

===========================================

'CDT', 'NN', 'BN', 'PREPOSITION', 'NNP', 'TEMP', 'PRP', 'yyCM',
       'CC', 'RB', 'JJ', 'yyDOT', 'VB', 'NNT', 'DEF', 'CONJ', 'POS',
       'REL', 'yyLRB', 'yyRRB', 'yyQUOT', 'AT', 'NN_S_PP', 'CD', 'IN',
       'QW', 'S_PRN', 'BNT', 'P', 'yyDASH', 'MD', 'DTT', 'COP', 'JJT',
       'yyCLN', 'yySCLN', 'yyQM', 'yyEXCL', 'EX', 'yyELPS', 'DUMMY_AT',
       'ADVERB', '', 'INTJ', 'ZVL', 'S_PRP', 'NEG', 'NCD', 'DEf', 'S_ANP',
       'S_PP',

missing in conversion of form:
1. when the article is silent (e.g. l+h+memshala), the h needs to be prefixed by _ (though I'm not sure why not _h_ and also why the l isn't suffixed).
2. spaceAfter needs to be added. Record logic
