### A projektben használt szövegkorpusz

Források
- https://verskorpusz.elte-dh.hu/
- https://github.com/ELTE-DH/poetry-corpus

Választott szerzők
- Ady Endre
- Babits Mihály
- Karinthy Frigyes
- Kosztolányi Dezső
- Tóth Árpád

### 1. Modul
Adatbázis előállítása.

In [1]:
%%capture
%pip install pandas_read_xml
%pip install --upgrade pip
!pip3 install upgrade-pip
%pip install pandas
%pip install numpy
%pip install lxml

In [2]:
import logging
import os
import re
import sys
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Report the interpreter and core library versions used for this run.
print(
    f'Python: {sys.version}',
    f'pandas: {pd.__version__}',
    f'numpy: {np.__version__}',
    sep = '\n',
)

Python: 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)]
pandas: 2.2.1
numpy: 1.26.4


In [4]:
class FolderPathError(Exception):
    """Error type raised by the main program when the corpus folder check fails.

    The text given at construction is stored on ``message`` and is also what
    ``str(error)`` returns.
    """

    def __init__(self, message = "Egyedi hiba történt"):
        super().__init__(message)
        self.message = message

In [5]:
def get_correct_numerical_data(df, column_indices):
    """Blank out every value that is not a strictly positive number.

    Some of the parsed fields can only ever be positive numbers, so anything
    else (text, booleans, zero, negatives, NaN) is replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column_indices : iterable of int
        Positional indices of the columns to check.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for column_index in column_indices:
        column = df.iloc[:, column_index]

        # True where the cell holds a real, strictly positive number.
        # bool is excluded explicitly because it is a subclass of int.
        valid = np.fromiter(
            (
                not isinstance(value, (bool, np.bool_))
                and isinstance(value, (int, float, np.integer, np.floating))
                and value > 0
                for value in column
            ),
            dtype = bool,
            count = len(column),
        )

        if valid.all():
            continue  # nothing to blank, keep the column (and its dtype) untouched

        # Integer and boolean numpy columns cannot store NaN. Upcast them explicitly
        # instead of relying on per-cell assignment, which warns about incompatible
        # dtypes on recent pandas versions.
        if isinstance(column.dtype, np.dtype) and column.dtype.kind in 'iu':
            cleaned = column.astype('float64')
        elif isinstance(column.dtype, np.dtype) and column.dtype.kind == 'b':
            cleaned = column.astype(object)
        else:
            cleaned = column.copy()

        cleaned.iloc[np.flatnonzero(~valid)] = np.nan

        # Replace the whole column by position, so duplicate labels are not an issue.
        df.isetitem(column_index, cleaned)

    return df

In [6]:
def get_stats(df, column_index):
    """Extended descriptive statistics of one column, selected by position.

    Returns a list of eleven values, each rounded to three decimals, in this
    order: mean, median, first quartile, third quartile, minimum, maximum,
    range (max - min), mean absolute deviation from the mean, median absolute
    deviation from the median, standard deviation, and relative standard
    deviation (standard deviation divided by the absolute mean).
    """
    values = df.iloc[:, column_index]

    mean = values.mean()
    median = values.median()
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    smallest = values.min()
    largest = values.max()
    spread = values.std()

    raw_stats = (
        mean,
        median,
        lower_quartile,
        upper_quartile,
        smallest,
        largest,
        largest - smallest,                  # range
        (values - mean).abs().mean(),        # mean absolute deviation around the mean
        (values - median).abs().median(),    # median absolute deviation around the median
        spread,                              # standard deviation
        spread / abs(mean),                  # relative standard deviation
    )

    return [round(stat, 3) for stat in raw_stats]

In [7]:
def count_item(df, column_index, my_list):
    """Count, for each entry of ``my_list``, how many cells of the column equal it.

    The column is selected by position. The counts come back as a list in the
    same order as ``my_list``.
    """
    column = df.iloc[:, column_index]
    return [(column == item).sum() for item in my_list]

In [8]:
def get_rates_simple(my_list, divisor):
    """Divide every element of ``my_list`` by ``divisor``, rounded to three decimals."""
    return [round(item / divisor, 3) for item in my_list]

In [9]:
def get_rates_complex(my_list, my_dict, divisor):
    """Rate of grouped dictionary values relative to ``divisor``.

    ``my_list`` is a list of lists of keys. For each inner list the matching
    ``my_dict`` values are summed, divided by ``divisor`` and rounded to three
    decimals. One rate is returned per inner list, in order.
    """
    return [
        round(sum(my_dict[key] for key in group) / divisor, 3)
        for group in my_list
    ]

In [10]:
def get_rates_char(df, column_index, char_list, divisor):
    """Relative frequency of each character within a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column_index : int
        Position of the text column.
    char_list : list of str
        Characters (or literal substrings) to count.
    divisor : number
        Value each total count is divided by.

    Returns
    -------
    list
        One rate per entry of ``char_list``, rounded to three decimals.
    """
    column = df.iloc[:, column_index]

    # Series.str.count interprets its argument as a regular expression. Escape each
    # entry so that characters such as '.' or '(' are counted literally.
    return [
        round(column.str.count(re.escape(char)).sum() / divisor, 3)
        for char in char_list
    ]

A vers szerzője

In [11]:
def get_author(poem_path):
    """Read the author's name from the TEI header of a poem file.

    The first two columns of the first ``persName`` row found under
    ``titleStmt/author`` are joined with a single space. On any failure the
    problem is logged and printed, and the author is reported as NaN.

    Returns a dict with the single key ``'author'``.
    """
    tei_namespaces = {"ns":"http://www.tei-c.org/ns/1.0"}

    try:
        persname_frame = pd.read_xml(poem_path,xpath = "//ns:titleStmt/ns:author/ns:persName", namespaces = tei_namespaces)
        full_name = persname_frame.iloc[0, 0] + ' ' + persname_frame.iloc[0, 1]
    except Exception as e:
        logging.warning(f'{poem_path}: a szerző neve nem kiolvasható: {str(e)}')
        logging.info(f'{poem_path}: ez a vers nem kerül az adatbázisba')
        print(str(e))
        return {'author': np.nan}

    return {'author': full_name}

A vers alapadatai

In [12]:
def get_features_div(poem_path):
    """Extract the poem-level base data from the first ``//div`` row of a poem file.

    Produces, in this key order: ``title``, ``nWord_title``, ``nStanza``,
    ``nLine``, ``nWord``, ``nSyll`` and ``rate_shortS``. The four counts come
    from column positions 1-4 and are kept only when they are positive
    integers. ``rate_shortS`` is column position 5 divided by ``nSyll``. The
    title is read from column position 9.

    If anything fails, a warning is logged, the error is printed, and every
    key is set to NaN.
    """
    title_keys = ['title', 'nWord_title']
    count_keys = ['nStanza', 'nLine', 'nWord', 'nSyll']
    ratio_keys = ['rate_shortS']

    try:
        div_frame = get_correct_numerical_data(pd.read_xml(poem_path, xpath = "//div"), [1, 2, 3, 4])

        # Punctuation removed from the title before its words are counted.
        strip_table = str.maketrans('', '', '!:;"-.?,()[]»«`')

        raw_title = div_frame.iloc[0, 9]
        if isinstance(raw_title, (str, int, np.int64)):
            title = str(raw_title)
            title_word_count = len(title.translate(strip_table).split())
        else:
            title = np.nan
            title_word_count = np.nan

        div_dict = {title_keys[0]: title, title_keys[1]: title_word_count}

        # Only the first row is used; a count is accepted when it is a positive integer.
        for key, position in zip(count_keys, range(1, 5)):
            cell = div_frame.iloc[0, position]
            div_dict[key] = cell if isinstance(cell, (int, np.int64)) and cell > 0 else np.nan

        div_dict[ratio_keys[0]] = round(div_frame.iloc[0, 5] / div_dict['nSyll'], 3)

        return div_dict

    except Exception as e:
        logging.warning(f'{poem_path}: A vers alapadatait nem lehet kiolvasni.')
        print(str(e))

        return dict.fromkeys(title_keys + count_keys + ratio_keys, np.nan)

A versszakok jellemzői

In [13]:
def get_features_lg(poem_path):
    """Extract stanza-level features from the ``//lg`` rows of a poem file.

    For column positions 1-5 the eleven ``get_stats`` values are stored under
    ``<stat>_<measure>_lg`` keys, with the measures ``nLine``, ``nWord``,
    ``nSyll``, ``nShortS`` and ``nLongS`` in that order. Column position 6 is
    then counted against six fixed rhyme-scheme strings (``rhyme_<scheme>``).

    If anything fails, an info record is logged, the error is printed, and
    every key is set to NaN.
    """
    stat_prefixes = ('mean', 'med', 'q1', 'q3', 'min', 'max', 'range', 'MADmean', 'MADmed', 'std', 'rstd')
    measures = ('nLine', 'nWord', 'nSyll', 'nShortS', 'nLongS')
    rhyme_schemes = ['abcb', 'aba', 'abcdb', 'abca', 'abb', 'aaaa']

    # One key list per measured column, in the order the columns are processed.
    stat_keys = [[f'{prefix}_{measure}_lg' for prefix in stat_prefixes] for measure in measures]
    rhyme_keys = [f'rhyme_{scheme}' for scheme in rhyme_schemes]

    try:
        stanza_frame = get_correct_numerical_data(pd.read_xml(poem_path, xpath = "//lg"), [1, 2, 3, 4, 5])

        lg_dict = {}

        for position, keys in enumerate(stat_keys, start = 1):
            lg_dict.update(zip(keys, get_stats(stanza_frame, position)))

        lg_dict.update(zip(rhyme_keys, count_item(stanza_frame, 6, rhyme_schemes)))

        return lg_dict

    except Exception as e:
        all_keys = [key for keys in stat_keys for key in keys] + rhyme_keys
        logging.info(f'{poem_path}: A versszakok jellemzőit nem lehet kiolvasni.')
        print(str(e))

        return dict.fromkeys(all_keys, np.nan)

A verssorok jellemzői

In [14]:
def get_features_l(poem_path):
    """Extract line-level features from the ``//l`` rows of a poem file.

    For column positions 1-4 the eleven ``get_stats`` values are stored under
    ``<stat>_<measure>_l`` keys, with the measures ``nWord``, ``nSyll``,
    ``nShortS`` and ``nLongS`` in that order.

    If anything fails, an info record is logged, the error is printed, and
    every key is set to NaN.
    """
    l_keylist1 = ['mean_nWord_l', 'med_nWord_l', 'q1_nWord_l', 'q3_nWord_l', 'min_nWord_l', 'max_nWord_l', 'range_nWord_l', 'MADmean_nWord_l', 'MADmed_nWord_l', 'std_nWord_l', 'rstd_nWord_l']
    l_keylist2 = ['mean_nSyll_l', 'med_nSyll_l', 'q1_nSyll_l', 'q3_nSyll_l', 'min_nSyll_l', 'max_nSyll_l', 'range_nSyll_l', 'MADmean_nSyll_l', 'MADmed_nSyll_l', 'std_nSyll_l', 'rstd_nSyll_l']
    l_keylist3 = ['mean_nShortS_l', 'med_nShortS_l', 'q1_nShortS_l', 'q3_nShortS_l', 'min_nShortS_l', 'max_nShortS_l', 'range_nShortS_l', 'MADmean_nShortS_l', 'MADmed_nShortS_l', 'std_nShortS_l', 'rstd_nShortS_l']
    l_keylist4 = ['mean_nLongS_l', 'med_nLongS_l', 'q1_nLongS_l', 'q3_nLongS_l', 'min_nLongS_l', 'max_nLongS_l', 'range_nLongS_l', 'MADmean_nLongS_l', 'MADmed_nLongS_l', 'std_nLongS_l', 'rstd_nLongS_l']

    keylists = [l_keylist1, l_keylist2, l_keylist3, l_keylist4]

    try:

        l_column_indices_list = [1, 2, 3, 4]
        df_l = get_correct_numerical_data(pd.read_xml(poem_path, xpath = "//l"), l_column_indices_list)

        l_dict = {}

        # Column position i holds the measure described by keylists[i - 1].
        for column_index, keys in enumerate(keylists, start = 1):
            l_dict.update(zip(keys, get_stats(df_l, column_index)))

        return l_dict

    except Exception as e:

        l_dict = dict.fromkeys(l_keylist1 + l_keylist2 + l_keylist3 + l_keylist4, np.nan)
        # The message used to mention stanza-level features (copied from the stanza
        # extractor). This function reads the lines, so the log now says so.
        logging.info(f'{poem_path}: A verssorok jellemzőit nem lehet kiolvasni.')
        print(str(e))

        return l_dict

A szavak jellemzői

In [15]:
def get_features_w(poem_path):
    """Extract word-level features from the ``//w`` rows of a poem file.

    Produces, in this key order:

    * counts of thirteen tag values found in column position 2
      (``PROPN`` ... ``PART``),
    * five rates of grouped tag counts relative to the poem's word count,
    * three rates of the values ``low`` / ``high`` / ``mixed`` in column
      position 5, relative to the word count,
    * the eleven ``get_stats`` values of the string length of column position 6,
    * ``sum_lenWord``, the total of those lengths,
    * five rates of the characters ``c``, ``f``, ``F``, ``b``, ``B`` in column
      position 6, relative to ``sum_lenWord``.

    The word count is taken from column position 3 of the first ``//div`` row.

    If anything fails, an info record is logged, the error is printed, and
    every key is set to NaN.
    """
    w_keylist1 = ['PROPN', 'NOUN', 'ADJ', 'NUM', 'PRON', 'VERB', 'ADV', 'CONJ', 'SCONJ', 'DET', 'ADP', 'INTJ', 'PART']
    w_keylist2 = ['rate_PROPN_NOUN', 'rate_ADJ', 'rate_NOUNs', 'rate_VERB', 'rate_ADV']
    w_keylist3 = ['rate_phontypeLow', 'rate_phontypeHigh', 'rate_phontypeMixed']
    w_keylist4 = ['mean_lenWord', 'med_lenWord', 'q1_lenWord', 'q3_lenWord', 'min_lenWord', 'max_lenWord', 'range_lenWord', 'MADmean_lenWord', 'MADmed_lenWord', 'std_lenWord', 'rstd_lenWord']
    w_keylist5 = ['sum_lenWord']
    w_keylist6 = ['rate_consonant', 'rate_fFV', 'rate_FFV', 'rate_bFV', 'rate_BFV']

    try:

        div_column_indices_list = [3]
        df_div = get_correct_numerical_data(pd.read_xml(poem_path, xpath = "//div"), div_column_indices_list)
        nWord = df_div.iloc[0, 3]

        df_w = pd.read_xml(poem_path, xpath = "//w")

        w_dict = {}

        # Column position 2 presumably holds the part-of-speech tag - TODO confirm against the corpus schema.
        tag_countlist = count_item(df_w, 2, w_keylist1)
        w_dict.update(zip(w_keylist1, tag_countlist))

        # Each inner list is one group of tags whose counts are summed before dividing.
        nounslist = [['PROPN', 'NOUN'], ['ADJ'], ['PROPN', 'NOUN', 'ADJ', 'NUM', 'PRON'], ['VERB'], ['ADV']]
        w_dict.update(zip(w_keylist2, get_rates_complex(nounslist, w_dict, nWord)))

        vowellist = ['low', 'high', 'mixed']
        vowel_countlist = count_item(df_w, 5, vowellist)
        w_dict.update(zip(w_keylist3, get_rates_simple(vowel_countlist, nWord)))

        # The lengths are kept as their own Series and handed to get_stats as a
        # one-column frame. The previous code appended them to df_w and read them back
        # by the hard-coded position 8, which is only correct when //w yields exactly
        # eight columns.
        word_lengths = df_w.iloc[:, 6].str.len()
        w_dict.update(zip(w_keylist4, get_stats(word_lengths.to_frame('lenWord'), 0)))

        w_dict[w_keylist5[0]] = word_lengths.sum()

        charlist = ['c', 'f', 'F', 'b', 'B']
        w_dict.update(zip(w_keylist6, get_rates_char(df_w, 6, charlist, w_dict['sum_lenWord'])))

        return w_dict

    except Exception as e:

        w_dict = dict.fromkeys(w_keylist1 + w_keylist2 + w_keylist3 + w_keylist4 + w_keylist5 + w_keylist6, np.nan)
        logging.info(f'{poem_path}: A szavak jellemzőit nem lehet kiolvasni.')
        print(str(e))

        return w_dict

Központozás a versben

In [16]:
def get_features_pc(poem_path):
    """Count the ``//pc`` rows of a poem file whose second column equals ``'PUNCT'``.

    Returns a dict with the single key ``'PUNCT'``. If the rows cannot be read,
    an info record is logged, the error is printed, and the value is NaN.
    """
    try:
        pc_frame = pd.read_xml(poem_path,xpath="//pc")
        punct_total = (pc_frame.iloc[:, 1] == 'PUNCT').sum()
    except Exception as e:
        logging.info(f'{poem_path}: Az írásjelekről nincs adat vagy a szerző nem használt központozást.')
        print(str(e))
        return {'PUNCT': np.nan}

    return {'PUNCT': punct_total}

Rímpárok a versben

In [17]:
def get_features_rhymePair(poem_path, root):
    """Count the rhyme pairs recorded in a poem file.

    The result is the number of non-missing values in the third column of the
    ``//rhymePair`` rows, returned under the key ``'nRhymepair'``.

    When reading raises ``ValueError``, ``root`` (the parsed document root) is
    searched for a ``rhymePairs`` element: if one exists the count is 0,
    otherwise the gap is logged and the count is NaN. Any other failure is
    logged and also gives NaN.
    """
    try:
        rhyme_frame = pd.read_xml(poem_path, xpath="//rhymePair")
        pair_total = rhyme_frame.iloc[:, 2].count()

    except ValueError as e:
        # A container element without any pair means "analysed, none found".
        if root.find(".//rhymePairs") is not None:
            return {'nRhymepair': 0}

        logging.info(f'{poem_path}: nem tartalmaz információt a rímpárokról.')
        print(str(e))
        return {'nRhymepair': np.nan}

    except Exception as e:
        logging.info(f'{poem_path} A rímpárokról jellemzőjét nem lehet kiolvasni.')
        print(str(e))
        return {'nRhymepair': np.nan}

    return {'nRhymepair': pair_total}

Alliterációk a versben

In [18]:
def get_features_alliteration(poem_path, root):
    """Extract alliteration features from the ``//alliteration`` rows of a poem file.

    Produces, in this key order:

    * ``nAll`` - number of non-missing values in column position 1,
    * ``rate_clearAll`` - share of rows whose column-1 string consists only of
      the character ``'a'`` (its ``'a'`` count equals its length),
    * the eleven ``get_stats`` values of the column-1 string lengths
      (``*_lenAll`` keys).

    When reading raises ``ValueError``, ``root`` (the parsed document root) is
    searched for an ``alliterations`` element: if one exists ``nAll`` is 0 and
    the rest stays NaN, otherwise the gap is logged and everything is NaN. Any
    other failure is logged and also gives all-NaN values.
    """
    alliteration_keylist1 = ['nAll', 'rate_clearAll']
    alliteration_keylist2 = ['mean_lenAll', 'med_lenAll', 'q1_lenAll', 'q3_lenAll', 'min_lenAll', 'max_lenAll', 'range_lenAll', 'MADmean_lenAll', 'MADmed_lenAll', 'std_lenAll', 'rstd_lenAll']

    try:

        df_alliteration = pd.read_xml(poem_path, xpath = "//alliteration")

        # NOTE(review): column position 1 looks like a per-word pattern string - confirm against the corpus schema.
        pattern_column = df_alliteration.iloc[:, 1]

        alliteration_dict = {}

        alliteration_dict[alliteration_keylist1[0]] = pattern_column.count()

        # The lengths are kept as their own Series. The previous code appended them to
        # the frame and read them back by the hard-coded position 6, which is only
        # correct when //alliteration yields exactly six columns.
        pattern_lengths = pattern_column.apply(len)
        alliteration_dict[alliteration_keylist1[1]] = round((pattern_column.str.count('a') == pattern_lengths).sum() / alliteration_dict['nAll'], 3)

        alliteration_dict.update(zip(alliteration_keylist2, get_stats(pattern_lengths.to_frame('lenAll'), 0)))

        return alliteration_dict

    except ValueError as e:

        alliteration_dict = dict.fromkeys(alliteration_keylist1 + alliteration_keylist2, np.nan)

        # A container element without any entry means "analysed, none found".
        if root.find(".//alliterations") is not None:
            alliteration_dict['nAll'] = 0
        else:
            logging.info(f'{poem_path}: Nem tartalmaz információt az alliterációkról.')
            print(str(e))

        return alliteration_dict

    except Exception as e:

        alliteration_dict = dict.fromkeys(alliteration_keylist1 + alliteration_keylist2, np.nan)
        logging.info(f'{poem_path}: Az alliterációk jellemzőit nem lehet kiolvasni.')
        print(str(e))

        return alliteration_dict

Főprogram

In [19]:
# Main program: walk the corpus folder, extract the features of every XML poem and
# collect them into df_main (one row per poem with a readable author).
logging.basicConfig(filename = 'infoEvent.log', level = logging.INFO, format = '%(asctime)s - %(levelname)s - %(message)s', encoding = 'utf-8')

# The corpus is looked up in a "Versek" folder beside the current working directory.
current_dir = Path.cwd()

parent_dir = current_dir.parent

poems_path = parent_dir / "Versek"

df_main = pd.DataFrame()

# Extractors that only need the file path, and extractors that also need the parsed root.
functionlist1 = [get_author, get_features_div, get_features_lg, get_features_l, get_features_w, get_features_pc]
functionlist2 = [get_features_rhymePair, get_features_alliteration]
label_list = ['class_label', 'nan_count_label']

records = []          # one feature dict per accepted poem
column_names = None   # fixed by the first processed file: feature keys followed by the labels

try:

    if not poems_path.is_dir():
        raise FolderPathError(f'{poems_path}: Az elérési útvonal nem található vagy nem egy mappa.')

    for rootfolder, folders, files in os.walk(poems_path):
        for file in files:

            if not file.endswith('.xml'):  # only files with the XML extension are processed
                logging.info(f'{file}: A fájl nem XML kiterjesztésű.')
                continue

            complete_path = os.path.join(rootfolder, file)

            # A single unreadable file must not abort the whole run: log it and move on.
            try:
                root = ET.parse(complete_path).getroot()

                main_dict = {}

                for function in functionlist1:
                    main_dict.update(function(complete_path))

                for function in functionlist2:
                    main_dict.update(function(complete_path, root))

            except Exception as e:
                logging.error(f'{file}: Hiba történt. {e}')
                print(str(e))
                continue

            if column_names is None:
                column_names = list(main_dict.keys()) + label_list

            if pd.isna(main_dict['author']):
                logging.info(f'{file}: A fájlban hiányzik a szerző neve.')
                continue

            # Number of features that could not be extracted for this poem.
            nan_count = sum(1 for value in main_dict.values() if pd.isna(value))

            # Binary target: 1 for Ady Endre, 0 for every other author.
            main_dict['class_label'] = 1 if main_dict['author'] == 'Ady Endre' else 0
            main_dict['nan_count_label'] = nan_count

            records.append(main_dict)

except FolderPathError as e:

    logging.error(str(e))
    print(str(e))

except Exception as e:

    # The old handler called file.close() on a file *name* (a str), which raised a
    # second error and hid the first one. Nothing needs closing here.
    logging.error(f'Hiba történt. {e}')
    print(str(e))

# Build the table once from the collected records instead of growing it cell by cell.
# dtype = object stores each value as extracted, so integer counts are not turned
# into floats by NaNs elsewhere in the same column.
if column_names is not None:
    df_main = pd.DataFrame(records, columns = column_names, dtype = object)

xpath does not return any nodes or attributes. Be sure to specify in `xpath` the parent nodes of children and attributes to parse. If document uses namespaces denoted with xmlns, be sure to define namespaces and use them in xpath.
xpath does not return any nodes or attributes. Be sure to specify in `xpath` the parent nodes of children and attributes to parse. If document uses namespaces denoted with xmlns, be sure to define namespaces and use them in xpath.
xpath does not return any nodes or attributes. Be sure to specify in `xpath` the parent nodes of children and attributes to parse. If document uses namespaces denoted with xmlns, be sure to define namespaces and use them in xpath.


In [20]:
# Save the assembled feature table as CSV in the working directory (row index not written).
df_main.to_csv('poems_data.csv', index = False)
# Bare expression: renders the table as the cell's output.
df_main
# Optional spot check of a slice of rows, left disabled:
# df_main.loc[1113:1117, 'nWord_title':]


Unnamed: 0,author,title,nWord_title,nStanza,nLine,nWord,nSyll,rate_shortS,mean_nLine_lg,med_nLine_lg,...,q3_lenAll,min_lenAll,max_lenAll,range_lenAll,MADmean_lenAll,MADmed_lenAll,std_lenAll,rstd_lenAll,class_label,nan_count_label
0,Ady Endre,GÓG ÉS MAGÓG FIA VAGYOK ÉN...,6,4,16,82,157,0.503,4.0,4.0,...,3.0,2,3,1,0.48,0.0,0.548,0.211,1,0
1,Ady Endre,A MI GYERMEKÜNK,3,5,25,108,210,0.448,5.0,5.0,...,3.0,2,4,2,0.667,0.5,0.816,0.306,1,0
2,Ady Endre,A VÁR FEHÉR ASSZONYA,4,4,16,77,136,0.529,4.0,4.0,...,3.75,3,4,1,0.5,0.5,0.707,0.202,1,0
3,Ady Endre,MERT ENGEM SZERETSZ,3,1,15,43,75,0.453,15.0,15.0,...,3.0,3,3,0,0.0,0.0,0.0,0.0,1,10
4,Ady Endre,A KÖNNYEK ASSZONYA [1],4,3,24,97,180,0.45,8.0,8.0,...,2.0,2,4,2,0.446,0.0,0.647,0.285,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2793,Tóth Árpád,FEBRUÁRIUS,1,3,12,70,144,0.382,4.0,4.0,...,3.0,2,3,1,0.444,0.0,0.577,0.217,0,0
2794,Tóth Árpád,MÁRCIUS [2],2,3,12,76,144,0.389,4.0,4.0,...,2.25,2,3,1,0.375,0.0,0.5,0.222,0,0
2795,Tóth Árpád,ÁPRILIS [2],2,3,12,68,144,0.465,4.0,4.0,...,3.0,2,4,2,0.562,0.5,0.707,0.257,0,0
2796,Tóth Árpád,MÁJUS,1,3,12,66,144,0.438,4.0,4.0,...,2.75,2,3,1,0.444,0.0,0.516,0.221,0,0
