In [17]:
import pandas as pd

# Keep a finite cap on rendered rows. With the cap disabled (None), the
# bare-frame preview cells further down would render every row of the raw
# tables on a fresh "Restart & Run All". 100 rows is still enough to show the
# head(50) preview in full.
pd.set_option('display.max_rows', 100)

## Shevchenko db part

In [2]:
# Load the raw Shevchenko morpheme table: a tab-separated, UTF-16 encoded text file.
shev_words = pd.read_csv(
    '../data/raw/shev4_words.txt',
    sep='\t',
    encoding='utf-16',
)

In [3]:
# Preview the raw Shevchenko table (columns: morfemid, wrdFK, mname, mseg).
shev_words

Unnamed: 0,morfemid,wrdFK,mname,mseg
0,1,20,R,аби
1,2,21,R,аби
2,3,21,X,-то
3,4,23,R,або
4,5,24,R,або
...,...,...,...,...
16253,15967,6731,F,ти
16254,15968,6732,R,шинок
16255,15969,6733,R,шиноч
16256,15970,6733,S,ок


In [4]:
# Collapse the per-morpheme rows into one row per word (wrdFK), collecting the
# morpheme labels (mname) and segments (mseg) into lists.
# Rows are ordered by morfemid within each word first, exactly as the KNU
# pipeline does, so the morpheme order no longer depends on the row order of
# the input file.
# NOTE(review): assumes morfemid reflects morpheme position inside a word for
# this table as it does for the KNU one — confirm against the source database.
grouped_shev_words = (
    shev_words
    .sort_values(by=['wrdFK', 'morfemid'])
    .groupby('wrdFK')
    .agg({'mname': list, 'mseg': list})
    .reset_index()
)

In [5]:
# Build the slash-delimited segmentation string for each word from its list of segments.
grouped_shev_words['merged_morphemes'] = grouped_shev_words['mseg'].map('/'.join)

In [6]:
# Show the first five grouped words together with their merged segmentation.
grouped_shev_words.head(5)

Unnamed: 0,wrdFK,mname,mseg,merged_morphemes
0,20,[R],[аби],аби
1,21,"[R, X]","[аби, -то]",аби/-то
2,23,[R],[або],або
3,24,"[R, R, F]","[або, щ, о]",або/щ/о
4,26,[R],[авраам],авраам


In [7]:
# Number of distinct words (rows) in the grouped Shevchenko table.
grouped_shev_words.shape[0]

5328

In [8]:
# Optional CSV export of the grouped Shevchenko table; currently disabled.
# grouped_shev_words.to_csv('../data/processed/shev4_words.csv', encoding='utf-8', index=False)

## Full KNU morpheme database

In [9]:
# Load the raw KNU morpheme table: a tab-separated, UTF-8 encoded text file.
knu_morphemes_words = pd.read_csv(
    '../data/raw/morfems-knu.txt',
    sep='\t',
    encoding='utf-8',
)

In [10]:
# Preview the raw KNU table (columns: morfemid, wrdFK, mname, mseg).
knu_morphemes_words

Unnamed: 0,morfemid,wrdFK,mname,mseg
0,706262,2,F,ти
1,706261,2,R,бу
2,706265,4,F,ти
3,706263,4,P,в
4,706264,4,R,ви
...,...,...,...,...
781162,1534158,221586,F,0
781163,1534154,221586,P,до
781164,1534155,221586,R,цiль
781165,1534156,221586,S,н


In [11]:
# One row per word (wrdFK): order the morpheme rows by word and then by
# morfemid (both ascending), then gather each word's labels (mname) and
# segments (mseg) into lists.
grouped_knu_morphemes_words = (
    knu_morphemes_words
    .sort_values(['wrdFK', 'morfemid'])
    .groupby('wrdFK')
    .agg({'mname': list, 'mseg': list})
    .reset_index()
)

In [12]:
# Show the first five grouped KNU words.
grouped_knu_morphemes_words.head(5)

Unnamed: 0,wrdFK,mname,mseg
0,2,"[R, F]","[бу, ти]"
1,4,"[P, R, F]","[в, ви, ти]"
2,5,"[R, F]","[вівц, я]"
3,6,[R],[він]
4,7,[R],[вісь]


In [13]:
# Build the slash-delimited segmentation string for each word from its list of segments.
grouped_knu_morphemes_words['merged_morphemes'] = grouped_knu_morphemes_words['mseg'].map('/'.join)

In [20]:
# Inspect a larger sample (first 50 words) of the merged KNU segmentations.
grouped_knu_morphemes_words.head(n=50)

Unnamed: 0,wrdFK,mname,mseg,merged_morphemes
0,2,"[R, F]","[бу, ти]",бу/ти
1,4,"[P, R, F]","[в, ви, ти]",в/ви/ти
2,5,"[R, F]","[вівц, я]",вівц/я
3,6,[R],[він],він
4,7,[R],[вісь],вісь
5,8,"[P, R, F]","[в, ли, ти]",в/ли/ти
6,9,"[P, R, F, X]","[в, ли, ти, ся]",в/ли/ти/ся
7,10,"[R, F]","[вон, а]",вон/а
8,11,"[R, F]","[вон, и]",вон/и
9,12,"[R, F]","[вон, о]",вон/о


In [21]:
# Number of distinct words (rows) in the grouped KNU table.
grouped_knu_morphemes_words.shape[0]

199890

In [46]:
# Optional CSV export of the grouped KNU table; currently disabled.
# NOTE(review): this targets ../files/ while the notebook's other outputs use
# ../data/processed/ — confirm the intended path before re-enabling.
# grouped_knu_morphemes_words.to_csv('../files/knu_morphemes_words.csv', encoding='utf-8', index=False)

## Merge together

In [22]:
# Stack the Shevchenko words on top of the KNU words (row-wise).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both inputs contribute their own 0-based labels, so the result would carry
# duplicate index values and any later label-based lookup would be ambiguous.
# Row order and column contents are unchanged.
morphemes_words = pd.concat(
    [grouped_shev_words, grouped_knu_morphemes_words],
    axis=0,
    ignore_index=True,
)

In [23]:
# Show the first five rows of the combined table.
morphemes_words.head(5)

Unnamed: 0,wrdFK,mname,mseg,merged_morphemes
0,20,[R],[аби],аби
1,21,"[R, X]","[аби, -то]",аби/-то
2,23,[R],[або],або
3,24,"[R, R, F]","[або, щ, о]",або/щ/о
4,26,[R],[авраам],авраам


In [24]:
# Total number of rows after combining both tables.
morphemes_words.shape[0]

205218

In [25]:
# Pull the merged segmentation strings out of the frame as a plain Python list.
morphemes_list = list(morphemes_words['merged_morphemes'])

In [26]:
# Peek at the first ten segmentation strings.
morphemes_list[0:10]

['аби',
 'аби/-то',
 'або',
 'або/щ/о',
 'авраам',
 'агар/ян/ськ/ий',
 'адам',
 'а/кафіст',
 'алілуjа',
 'алмаз']

In [27]:
# Drop entries that consist solely of digit characters; all other entries are
# kept in their original order.
morphemes_list = list(filter(lambda entry: not entry.isdigit(), morphemes_list))

In [30]:
# Write the segmentation strings to a UTF-8 text file, one entry per line.
with open("../data/processed/merged_morfems_knu.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(f"{entry}\n" for entry in morphemes_list)