In [1]:
import re
import pandas as pd
import numpy as np

# Silence warnings
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

In [2]:
def read_txt(name):
    
    f = open("txt/" + name + ".txt", "r")
    text = f.read()
    f.close()
    
    return text

In [3]:
Homer_raw = read_txt('odyssey_Homer') + read_txt('iliad_Homer')
Others_raw = read_txt('aristides') + read_txt('hesiode') \
    + read_txt('lycurgus') + read_txt('aristotle') \
    + read_txt('antiphon') + read_txt('thucydides')

In [4]:
df_betunicode = pd.read_csv('lem/betunicode_gr.csv', delimiter = ';')
df_betunicode.head()

Unnamed: 0,match,code,transcript,indice,comment
0,*)\|a,U+1F8A,ᾊ,225190138,GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARI...
1,*(\|a,U+1F8B,ᾋ,225190139,GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARI...
2,*)/|a,U+1F8C,ᾌ,225190140,GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA...
3,*(/|a,U+1F8D,ᾍ,225190141,GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA...
4,*)=|a,U+1F8E,ᾎ,225190142,GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERI...


In [28]:
stopwords = ['a)/llos', 'a)/n', 'a)/ra', 'a)ll', 'a)lla/', 'a)po/', 'au)to/s', 
             "d᾽", 'dai/', 'dai/s', 'de/', 'dh/', 'dia/', 'e(autou=', 'e)/ti', 
             'e)a/n', 'e)gw/', 'e)k', 'e)mo/s', 'e)n', 'e)pi/', 'ei)', 'ei)/mi', 
             'ei)mi/', 'ei)s', 'ga/r', 'ga^', 'ge', 'h(', 'h)/', 'kai/', 'kata/', 
             'me/n', 'meta/', 'mh/', 'o(', 'o(/de', 'o(/s', 'o(/stis', 'o(/ti', 
             'oi(', 'ou(/tws', 'ou(=tos', 'ou)', 'ou)/te', 'ou)=n', 'ou)de/', 
             'ou)dei/s', 'ou)k', 'para/', 'peri/', 'pro/s', 'so/s', 'su/', 'su/n',
             'ta/', 'te', 'th/n', 'th=s', 'th=|', 'ti', 'ti/', 'ti/s', 'tis',
             'to/', 'to/n', 'toi/', 'toiou=tos', 'tou/s', 'tou=', 'tw=n', 'tw=|', 
             'u(mo/s', 'u(pe/r', 'u(po/', 'w(/ste', 'w(s', 'w)=']

In [29]:
stopwords_gr = []

for stopword in stopwords:
    
    new_word = stopword
    
    for ind, row in df_betunicode.iterrows():
        new_word = new_word.replace(row['match'], row['transcript'])
            
    stopwords_gr.append(new_word)
    

In [30]:
stopwords_gr[:5]

['ἄλλοσ', 'ἄν', 'ἄρα', 'ἀλλ', 'ἀλλά']

In [31]:
df_Homer_raw = pd.read_csv('metrics/df_Homer_raw.csv').drop(columns = 'Unnamed: 0')
df_Others_raw = pd.read_csv('metrics/df_Others_raw.csv').drop(columns = 'Unnamed: 0')

In [32]:
df_Homer_raw.head()

Unnamed: 0,word,count,length,weighted_MSE,weighted_MAE,std
0,,41741,0,1742.311081,41.741,0.0
1,καὶ,4803,3,0.038809,0.197,1.890555
2,δ,3672,1,13.483584,3.672,0.0
3,δὲ,3188,2,1.411344,1.188,1.632696
4,δ᾽,2823,2,10.093329,3.177,2.369936


In [33]:
list_H, list_O = [], []

for stopword in stopwords_gr:
    
    n = df_Homer_raw[df_Homer_raw['word'] == stopword].index.values.tolist()
    m = df_Others_raw[df_Others_raw['word'] == stopword].index.values.tolist()
    
    if len(n) > 0:
        list_H.append(n[0])
    
    if len(m) > 0:
        list_O.append(m[0])
        
df_Homer = df_Homer_raw.iloc[list_H].drop(columns = ['length', 'weighted_MAE'])
df_Others = df_Others_raw.iloc[list_O].drop(columns = ['length', 'weighted_MAE'])

In [34]:
df_merge = pd.merge(df_Homer, df_Others, on = 'word', how = 'outer')
ratio = 0.525

In [35]:
df_merge['count_ratio'] = df_merge['count_x'] / (ratio * df_merge['count_y'])

In [37]:
df_merge.sort_values('count_ratio', ascending = False)

Unnamed: 0,word,count_x,weighted_MSE_x,std_x,count_y,weighted_MSE_y,std_y,count_ratio
13,ὅδε,23.0,0.000529,0.0,7.0,1.3e-05,0.348048,6.258503
1,ἄρα,575.0,0.330625,0.4,186.0,0.818329,0.641527,5.888377
11,ἤ,100.0,0.01,0.0,34.0,0.000304,0.0,5.602241
3,δ᾽,2823.0,10.093329,2.369936,3329.0,0.500103,0.62695,1.615243
9,γε,628.0,0.138384,0.79308,927.0,0.225991,0.641784,1.290389
21,τε,2649.0,5.527201,1.644761,4628.0,0.392711,2.870864,1.090258
23,τι,408.0,0.166464,0.769415,1043.0,0.286088,0.394057,0.745103
4,ἔτι,220.0,0.0484,0.0,618.0,0.10044,0.303389,0.678071
16,οἱ,1129.0,8.242641,1.476524,3189.0,2.674483,0.739298,0.674342
6,ἐν,1408.0,0.350464,0.996248,3996.0,0.002424,1.135677,0.671147


In [19]:
eng = read_txt('Iliad_RL')