In [78]:
%%capture
# Environment setup cell; %%capture hides everything the installs print.
# NOTE(review): pandas_read_xml is not imported in the visible cells (they call
# pandas' own pd.read_xml) - confirm this install is still required.
%pip install pandas_read_xml
%pip install --upgrade pip
# NOTE(review): this installs a PyPI package literally named "upgrade-pip";
# it does not upgrade pip itself (the line above already does) - confirm intent.
!pip3 install upgrade-pip

In [79]:
import logging
import os
import re
import sys
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
#print(sys.version)

In [80]:
def get_stats(df,column_index):
    """Return eleven descriptive statistics for one positional column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column_index : int
        Position of the column to summarise (used with ``iloc``).

    Returns
    -------
    list
        ``[mean, median, 0.25 quantile, 0.75 quantile, min, max, max - min,
        mean absolute deviation from the mean, mean absolute deviation from
        the median, std, std / |mean|]``, each rounded to three decimals.

    Notes
    -----
    A ``ValueError`` raised during the computation is logged and the process
    is then terminated with ``sys.exit(1)``; other exception types propagate.
    """
    try:
        column = df.iloc[:, column_index]

        # Same evaluation order as the result list.
        average = column.mean()
        middle = column.median()
        lower_quartile = column.quantile(0.25)
        upper_quartile = column.quantile(0.75)
        smallest = column.min()
        largest = column.max()

        value_range = largest - smallest
        mad_around_mean = (column - average).abs().mean()
        mad_around_median = (column - middle).abs().mean()
        deviation = column.std()
        variation = deviation / abs(average)

        raw_values = (
            average,
            middle,
            lower_quartile,
            upper_quartile,
            smallest,
            largest,
            value_range,
            mad_around_mean,
            mad_around_median,
            deviation,
            variation,
        )

        return [round(value, 3) for value in raw_values]

    except ValueError as err:
        logging.error(str(err))
        sys.exit(1)

In [81]:
def count_item(df,column_index,my_list):
    """Count how many rows of a positional column equal each given value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column_index : int
        Position of the column to inspect (used with ``iloc``).
    my_list : iterable
        Values to look for.

    Returns
    -------
    list
        One count per entry of ``my_list``, in the same order.
    """
    return [(df.iloc[:, column_index] == wanted).sum() for wanted in my_list]

In [82]:
def get_rates_simple(my_list,divisor):
    """Divide every entry of ``my_list`` by ``divisor``, rounded to 3 decimals.

    Returns a new list with one rate per input entry, in the same order.
    """
    return [round(count / divisor, 3) for count in my_list]

In [83]:
def get_rates_complex(my_list,my_dict,divisor):
    """Compute one rate per key group.

    Parameters
    ----------
    my_list : iterable of iterables
        Each inner iterable is a group of keys of ``my_dict``.
    my_dict : mapping
        Maps each key to a numeric value.
    divisor : number
        Denominator applied to every group total.

    Returns
    -------
    list
        For each group, the sum of its values in ``my_dict`` divided by
        ``divisor`` and rounded to three decimals.
    """
    group_totals = (sum(my_dict[key] for key in group) for group in my_list)
    return [round(total / divisor, 3) for total in group_totals]

In [84]:
def get_rates_char(df,column_index,char_list,divisor):
    """Return, per character, its total occurrence count divided by ``divisor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column_index : int
        Position of a string column (used with ``iloc``).
    char_list : iterable of str
        Characters (or literal substrings) to count.
    divisor : number
        Denominator applied to every total.

    Returns
    -------
    list
        One rate per entry of ``char_list``, rounded to three decimals.
    """
    # Series.str.count interprets its argument as a regular expression, so
    # each entry is escaped to make '.', '(', '?' etc. count literally.
    return [
        round(df.iloc[:, column_index].str.count(re.escape(char)).sum() / divisor, 3)
        for char in char_list
    ]

In [85]:
def get_author(poem_path):
    """Read the author name from the TEI header of a poem file.

    The ``persName`` element under ``titleStmt/author`` is parsed with
    ``pd.read_xml``; the first two columns of its first row are joined with a
    single space.

    Returns
    -------
    dict
        ``{'author': <joined name>}``, or ``{'author': nan}`` when anything
        goes wrong while reading or joining.
    """
    tei_namespace = {"ns": "http://www.tei-c.org/ns/1.0"}

    try:
        name_parts = pd.read_xml(
            poem_path,
            xpath="//ns:titleStmt/ns:author/ns:persName",
            namespaces=tei_namespace,
        )
        full_name = name_parts.iloc[0, 0] + ' ' + name_parts.iloc[0, 1]
    except Exception:
        # Best effort: a missing or malformed author yields NaN.
        return {'author': np.nan}

    return {'author': full_name}

In [86]:
def get_features_div(poem_path):
    """Extract title and size features from the first ``//div`` row of a poem.

    Values are read by position from the frame produced by
    ``pd.read_xml(poem_path, xpath="//div")``: column 9 is used as the title,
    columns 1-4 as ``nStanza``, ``nLine``, ``nWord`` and ``nSyll``, and
    column 5 is divided by ``nSyll`` to give ``rate_shortS``.
    # NOTE(review): the meaning of each position depends on the XML schema -
    # confirm against the corpus.

    Returns
    -------
    dict
        Keys ``title``, ``nWord_title``, ``nStanza``, ``nLine``, ``nWord``,
        ``nSyll``, ``rate_shortS``. Every value is NaN when reading or
        computing fails.
    """
    div_keylist1=['title','nWord_title']
    div_keylist2=['nStanza','nLine','nWord','nSyll']
    div_keylist3=['rate_shortS']

    special_characters=['!',':',';','"','-','.','?',',','(',')','[',']','»','«','`']

    try:
        df_div=pd.read_xml(poem_path,xpath="//div")

        title=df_div.iloc[0,9]

        # str.replace with "[...]" only matches that literal text, so nothing
        # was ever removed; translate deletes each listed character, so that
        # stand-alone punctuation (e.g. " - ") is not counted as a word.
        strip_table=str.maketrans('','',''.join(special_characters))

        div_dict={
            div_keylist1[0]:title,
            div_keylist1[1]:len(title.translate(strip_table).split()),
        }

        div_dict.update(zip(div_keylist2,[df_div.iloc[0,i] for i in range(1,5)]))
        div_dict[div_keylist3[0]]=round(df_div.iloc[0, 5]/div_dict['nSyll'],3)

        return div_dict

    except Exception:
        # Best effort: an unreadable or unexpected file yields NaN features.
        return dict.fromkeys(div_keylist1+div_keylist2+div_keylist3,np.nan)

In [87]:
def get_features_lg(poem_path):
    """Summarise the ``//lg`` rows of a poem file.

    For columns 1-5 of the frame returned by ``pd.read_xml(..., xpath="//lg")``
    the eleven statistics of ``get_stats`` are stored under
    ``<stat>_<measure>_lg`` keys (measures ``nLine``, ``nWord``, ``nSyll``,
    ``nShortS``, ``nLongS``, in that column order). Column 6 is compared
    against six rhyme-scheme strings and the match counts are stored under
    ``rhyme_<scheme>`` keys.

    Returns
    -------
    dict
        The 61 features described above; every value is NaN when reading or
        computing fails.
    """
    stat_names = ('mean', 'med', 'q1', 'q3', 'min', 'max', 'range',
                  'MADmean', 'MADmed', 'dev', 'v')
    measures = ('nLine', 'nWord', 'nSyll', 'nShortS', 'nLongS')
    rhyme_schemes = ['abcb', 'aba', 'abcdb', 'abca', 'abb', 'aaaa']

    # One key group per measured column, e.g. 'mean_nLine_lg' ... 'v_nLine_lg'.
    stat_keys = [[f'{stat}_{measure}_lg' for stat in stat_names] for measure in measures]
    rhyme_keys = [f'rhyme_{scheme}' for scheme in rhyme_schemes]

    try:
        stanzas = pd.read_xml(poem_path, xpath="//lg")

        features = {}
        for position, keys in enumerate(stat_keys, start=1):
            features.update(zip(keys, get_stats(stanzas, position)))

        features.update(zip(rhyme_keys, count_item(stanzas, 6, rhyme_schemes)))

    except Exception:
        # Best effort: any failure yields the full key set filled with NaN.
        every_key = [key for keys in stat_keys for key in keys] + rhyme_keys
        return dict.fromkeys(every_key, np.nan)

    return features

In [88]:
def get_features_l(poem_path):
    """Summarise the ``//l`` rows of a poem file.

    For columns 1-4 of the frame returned by ``pd.read_xml(..., xpath="//l")``
    the eleven statistics of ``get_stats`` are stored under
    ``<stat>_<measure>_l`` keys (measures ``nWord``, ``nSyll``, ``nShortS``,
    ``nLongS``, in that column order).

    Returns
    -------
    dict
        The 44 features described above; every value is NaN when reading or
        computing fails.
    """
    stat_names = ('mean', 'med', 'q1', 'q3', 'min', 'max', 'range',
                  'MADmean', 'MADmed', 'dev', 'v')
    measures = ('nWord', 'nSyll', 'nShortS', 'nLongS')

    # One key group per measured column, e.g. 'mean_nWord_l' ... 'v_nWord_l'.
    stat_keys = [[f'{stat}_{measure}_l' for stat in stat_names] for measure in measures]

    try:
        lines = pd.read_xml(poem_path, xpath="//l")

        features = {}
        for position, keys in enumerate(stat_keys, start=1):
            features.update(zip(keys, get_stats(lines, position)))

    except Exception:
        # Best effort: any failure yields the full key set filled with NaN.
        every_key = [key for keys in stat_keys for key in keys]
        return dict.fromkeys(every_key, np.nan)

    return features

In [89]:
def get_features_w(poem_path):
    """Compute word-level features from the ``//w`` rows of a poem file.

    Steps, all positional on the frames returned by ``pd.read_xml``:

    * the word count is taken from row 0, column 3 of ``//div``;
    * column 2 of ``//w`` is counted against the tags in ``w_keylist1``;
    * selected tag groups are summed and divided by the word count;
    * column 5 of ``//w`` is counted against ``'low'``, ``'high'``,
      ``'mixed'`` and divided by the word count;
    * the string length of column 6 of ``//w`` is summarised with
      ``get_stats`` and summed;
    * the characters ``c, f, F, b, B`` are counted in column 6 and divided by
      that length sum.
    # NOTE(review): what each column position holds depends on the XML
    # schema - confirm against the corpus.

    Returns
    -------
    dict
        Keys from ``w_keylist1`` .. ``w_keylist6``; every value is NaN when
        reading or computing fails.
    """
    w_keylist1=['PROPN','NOUN','ADJ','NUM','PRON','VERB','ADV','CONJ','SCONJ','DET','ADP','INTJ','PART']
    w_keylist2=['rate_PROPN_NOUN','rate_ADJ','rate_NOUNs','rate_VERB','rate_ADV']
    w_keylist3=['rate_phontypeLow','rate_phontypeHigh','rate_phontypeMixed']
    w_keylist4=['mean_lenWord','med_lenWord','q1_lenWord','q3_lenWord','min_lenWord','max_lenWord','range_lenWord','MADmean_lenWord','MADmed_lenWord','dev_lenWord','v_lenWord']
    w_keylist5=['sum_lenWord']
    w_keylist6=['rate_consonant','rate_fFV','rate_FFV','rate_bFV','rate_BFV']

    try:

        w_dict={}

        df_div=pd.read_xml(poem_path,xpath="//div")
        nWord=df_div.iloc[0,3]

        df_w=pd.read_xml(poem_path,xpath="//w")

        nouns_countlist=count_item(df_w,2,w_keylist1)
        w_dict.update(dict(zip(w_keylist1,nouns_countlist)))

        nounslist=[['PROPN','NOUN'],['ADJ'],['PROPN','NOUN','ADJ','NUM','PRON'],['VERB'],['ADV']]
        w_dict.update(dict(zip(w_keylist2,get_rates_complex(nounslist,w_dict,nWord))))

        vowellist=['low','high','mixed']
        vowel_countlist=count_item(df_w,5,vowellist)
        w_dict.update(dict(zip(w_keylist3,get_rates_simple(vowel_countlist,nWord))))

        # Summarise the lengths from their own one-column frame instead of
        # appending a column and reading it back at hard-coded position 8,
        # which is only right when //w yields exactly eight columns.
        word_lengths=df_w.iloc[:, 6].str.len()
        w_dict.update(dict(zip(w_keylist4,get_stats(word_lengths.to_frame('lenWord'),0))))

        w_dict[w_keylist5[0]]=word_lengths.sum()
        charlist=['c','f','F','b','B']
        w_dict.update(dict(zip(w_keylist6,get_rates_char(df_w,6,charlist,w_dict['sum_lenWord']))))

        return w_dict

    except Exception:
        # Best effort: any failure yields the full key set filled with NaN.
        return dict.fromkeys(w_keylist1+w_keylist2+w_keylist3+w_keylist4+w_keylist5+w_keylist6,np.nan)

In [90]:
def get_features_pc(poem_path):
    """Count the ``//pc`` rows whose second column equals ``'PUNCT'``.

    Returns
    -------
    dict
        ``{'PUNCT': <count>}``, or ``{'PUNCT': nan}`` when reading or
        counting fails.
    """
    try:
        tags = pd.read_xml(poem_path, xpath="//pc").iloc[:, 1]
        punct_total = tags.eq('PUNCT').sum()
    except Exception:
        # Best effort: any failure yields NaN for this feature.
        return {'PUNCT': np.nan}

    return {'PUNCT': punct_total}

In [91]:
def get_features_rhymePair(poem_path):
    """Count the non-null entries in the third column of the ``//rhymePair`` rows.

    Returns
    -------
    dict
        ``{'nRhymepair': <count>}``, or ``{'nRhymepair': nan}`` when reading
        or counting fails.
    """
    try:
        pairs = pd.read_xml(poem_path, xpath="//rhymePair")
        pair_total = pairs.iloc[:, 2].count()
    except Exception:
        # Best effort: any failure yields NaN for this feature.
        return {'nRhymepair': np.nan}

    return {'nRhymepair': pair_total}

In [92]:
def get_features_alliteration(poem_path):
    """Compute alliteration features from the ``//alliteration`` rows of a poem.

    Working on column 1 of the frame returned by ``pd.read_xml``:

    * ``nAll`` is its number of non-null entries;
    * ``rate_clearAll`` is the share (relative to ``nAll``) of rows whose
      count of the character ``'a'`` equals the entry's length;
    * the eleven ``*_lenAll`` values are the ``get_stats`` summary of the
      entry lengths.
    # NOTE(review): column 1 presumably holds a per-alliteration pattern
    # string - confirm against the XML schema.

    Returns
    -------
    dict
        Keys ``nAll``, ``rate_clearAll`` and the eleven ``*_lenAll``
        statistics; every value is NaN when reading or computing fails.
    """
    alliteration_keylist1=['nAll','rate_clearAll']
    alliteration_keylist2=['mean_lenAll','med_lenAll','q1_lenAll','q3_lenAll','min_lenAll','max_lenAll','range_lenAll','MADmean_lenAll','MADmed_lenAll','dev_lenAll','v_lenAll']

    try:

        alliteration_dict={}

        df_alliteration=pd.read_xml(poem_path,xpath="//alliteration")
        patterns=df_alliteration.iloc[:,1]

        alliteration_dict[alliteration_keylist1[0]]=patterns.count()

        # Keep the lengths in their own series instead of appending a column
        # and reading it back at hard-coded position 6, which is only right
        # when //alliteration yields exactly six columns.
        pattern_lengths=patterns.apply(len)
        alliteration_dict[alliteration_keylist1[1]]=round((patterns.str.count('a')==pattern_lengths).sum()/alliteration_dict['nAll'],3)

        alliteration_dict.update(dict(zip(alliteration_keylist2,get_stats(pattern_lengths.to_frame('lenAll'),0))))

        return alliteration_dict

    except Exception:
        # Best effort: any failure yields the full key set filled with NaN.
        return dict.fromkeys(alliteration_keylist1+alliteration_keylist2,np.nan)

In [93]:
# Collect the features of every .xml poem below `poems_path` into `df_main`
# (one row per file); problems are written to infoEvent.log.
logging.basicConfig(filename='infoEvent.log',level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s',encoding='utf-8')

# Raw string: "\I", "\T", "\S", "\B", "\V" are invalid escape sequences in a
# normal literal (warning today, an error in future Python versions). The
# resulting path value is unchanged.
poems_path=r"D:\Iskolai\Tananyagok\Szakdolgozat\Bsc\Versek"

# Feature extractors applied to each file; each returns a dict of features.
functionlist=[get_author,get_features_div,get_features_lg,get_features_l,get_features_w,get_features_pc,get_features_rhymePair,get_features_alliteration]

# One feature dict per processed file; the frame is built once at the end
# instead of being grown cell by cell with .loc inside the loop.
records=[]

try:

    if os.path.exists(poems_path) and os.path.isdir(poems_path):

        for rootfolder, folders, files in os.walk(poems_path):
            for file in files:
                if file.endswith('.xml'):  # Only files with the .xml extension are considered
                    complete_path = os.path.join(rootfolder,file)

                    main_dict={}

                    for function in functionlist:
                        main_dict.update(function(complete_path))

                    records.append(main_dict)

                else:
                    # Non-XML files are skipped and only recorded in the log.
                    logging.info(f'Hiba a(z) {file} fájl olvasásakor.')

    else:
        raise FileNotFoundError(f'Az elérési útvonal {poems_path} nem található vagy nem egy mappa.')

except FileNotFoundError as e:
    logging.error(str(e))

except Exception as e:
    logging.error(f'Hiba történt: {str(e)}')

# Built after the try block so rows gathered before an error are kept.
# dtype=object stores the values exactly as the extractors returned them
# (the previous cell-by-cell fill started from an object-typed empty frame).
# NOTE(review): cast to numeric dtypes downstream if the analysis needs them.
df_main=pd.DataFrame(records,dtype=object)
column_names=list(df_main.columns)

In [107]:
# Display the assembled feature table (one row per processed XML file).
df_main
# Alternative view kept for reference: first rows, columns 'rate_fFV' .. 'v_lenAll'.
# df_main.loc[0:20,'rate_fFV':'v_lenAll']

Unnamed: 0,author,title,nWord_title,nStanza,nLine,nWord,nSyll,rate_shortS,mean_nLine_lg,med_nLine_lg,...,med_lenAll,q1_lenAll,q3_lenAll,min_lenAll,max_lenAll,range_lenAll,MADmean_lenAll,MADmed_lenAll,dev_lenAll,v_lenAll
0,Ady Endre,GÓG ÉS MAGÓG FIA VAGYOK ÉN...,6,4,16,82,157,0.503,4.0,4.0,...,3.0,2.0,3.0,2,3,1,0.48,0.4,0.548,0.211
1,Ady Endre,A MI GYERMEKÜNK,3,5,25,108,210,0.448,5.0,5.0,...,2.5,2.0,3.0,2,4,2,0.667,0.667,0.816,0.306
2,Ady Endre,A VÁR FEHÉR ASSZONYA,4,4,16,77,136,0.529,4.0,4.0,...,3.5,3.25,3.75,3,4,1,0.5,0.5,0.707,0.202
3,Ady Endre,MERT ENGEM SZERETSZ,3,1,15,43,75,0.453,15.0,15.0,...,3.0,3.0,3.0,3,3,0,0.0,0.0,0.0,0.0
4,Ady Endre,A KÖNNYEK ASSZONYA [1],4,3,24,97,180,0.45,8.0,8.0,...,2.0,2.0,2.0,2,4,2,0.446,0.273,0.647,0.285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,Ady Endre,UJJAK A SZAJNÁBAN,3,3,15,62,132,0.455,5.0,5.0,...,2.5,2.25,2.75,2,3,1,0.5,0.5,0.707,0.283
1112,Ady Endre,"ÁLDOTT, FALUSI KÖD",3,5,20,92,180,0.433,4.0,4.0,...,3.0,3.0,3.0,2,6,4,0.776,0.571,1.254,0.382
1113,Ady Endre,AZ ISTEN-KERESŐ LÁRMA,3,2,12,44,86,0.477,6.0,6.0,...,,,,,,,,,,
1114,Ady Endre,KI LÁTOTT ENGEM?,3,11,22,118,242,0.471,2.0,2.0,...,3.0,2.0,3.0,2,3,1,0.494,0.444,0.527,0.206
