In [1]:
import bs4 as bs
import pandas as pd
import matplotlib as plt
from matplotlib import pyplot as pplt
import os
from pptx import Presentation
import pprint as p

#Die Buch-Kürzel des alten Testaments
AT = ["Gn", "Ex", "Lv", "Nm", "Dt", "Ios", "Idc", "Rt", "1Sm", "2Sm", "3Rg", "4Rg", "1Par", "2Par", "Esr", "Neh", "Tb",
      "Idt", "Est", "1Mcc"
    , "2Mcc", "3Mcc", "4Mcc", "Iob", "Ps", "Prv", "Ecl", "Ct", "Sap", "Sir", "Is", "Ier", "Lam", "Bar", "Ez", "Dn",
      "Os", "Ioel", "Am", "Abd"
    , "Ion", "Mi", "Na", "Hab", "So", "Agg", "Za", "Mal"]

In [2]:
def percent_calc(num1, num2):
    '''returns percentage value'''
    return (num1/num2)*100
    
 
    
def analyse(filename):
    '''Opens a file specified by the filename, including the directory and creates dataframe for the respective file. Subsequently, 
    this function analyses the chapter with regard to our criteria: 1) Length of the text 2) direct bible quote 3) percentage of text that is a quote'''
    #Öffnen der Datei "filename"
    file_title = filename.split(".")[0]
    with open(filename, 'r') as f:
        file = f.read()   
    soup = bs.BeautifulSoup(file, 'lxml')
    
    #Wählen aller ref-Elemente, die das Attribut cRef haben
    Refs = soup.select('ref[cRef]')
    # Konvertierung der Daten in data. Dies ist die Grundlage für die Auswertungen
    data = []
    for ref in Refs:
        if ref.get('cref').split('_')[0] in AT:
            if ref.parent.find_previous_sibling() == ref.parent.find_previous_sibling('q'):
                data.append((ref.get('cref') , 'at', ref.getText() , ref.parent.find_previous_sibling('q').get_text(' ',strip=True)))
            else:
                data.append((ref.get('cref'), 'at', ref.getText(), 'not a direct quote'))
        else:
            if ref.parent.find_previous_sibling() == ref.parent.find_previous_sibling('q'):
                data.append((ref.get('cref') , 'nt', ref.getText() , ref.parent.find_previous_sibling('q').get_text(' ',strip=True)))
            else:
                data.append((ref.get('cref'), 'nt', ref.getText(), 'not a direct quote'))
    # der eigentliche Dataframe wird gesetzt
    df_data = pd.DataFrame(data)
    # der Header wird gesetzt.
    df_data.columns = ['kuerzel', 'atnt', 'stelle', 'zitat']



    # Hier wird eine Liste der wörtlichen Zitate erstellt
    qs = []
    for ref in Refs:
        if ref.parent.find_previous_sibling() == ref.parent.find_previous_sibling("q"):
            qs.append(ref.parent.find_previous_sibling("q").get_text(" ", strip=True))



    # Hier eine Liste der zitierten Stelle 
    stellen = []
    for i in range (0, len(df_data)):
        if df_data['zitat'][i] != 'not a direct quote':
            stellen.append(df_data['stelle'][i])
    #print(stellen)
    
    # Zählen der Gesamtlänge des zitierten Textes; ohne Leerzeilen und Zeilenumbrüche 
    quote_len = 0
    for i in range(len(qs)):
        quote_len +=len(qs[i].replace(" ","").replace("\n", ""))

    #ges = bs.BeautifulSoup(file, 'lxml')
    ges_soup = soup.select('div')
    print(ges_soup)
    notes = soup.body.select("note")
    supplied = soup.body.select("supplied")
    
    
    # Zählen der Gesamtlänge des Textes; ohne Leerzeilen und Zeilenumbrüche
    total_len=0
    for i in range(len(ges_soup)):
        total_len += len(ges_soup[i].getText().replace(" ", "").replace("\n", ""))
     
   # Berechnung der Länge des Textes aller note-Elemente 
    no_len = 0
    for i in range(len(notes)):
        #print(len(notes[i].getText()))
        no_len += len(notes[i].getText().replace(" ","").replace("\n", ""))
    #print(no_len)
    supplied_len = 0
    for i in range(len(supplied)):
        supplied_len += len(supplied[i].getText().replace(" ", "").replace("\n", ""))
    total = total_len-no_len-supplied_len

    print(f"---{filename}---")
    print(f"Länge inkl. <note>: {total_len}")
    print(f"supplied: {supplied_len}")
    print(f"Länge d. <note>-Inhalte: {no_len}")
    print("Gesamtlänge:" +str(total))
    print(f"Anzahl an erfassten Bibelstellen: {len(Refs)}")
    print(f"wörtliche Zitate: {len(stellen)}")
    #p.pprint(f"Liste d. wörtlichen Zitate:{stellen}")
    print("Zitatlänge :"+ str(quote_len))    
    print("Prozentualer Anteil: " +(str(percent_calc(quote_len, total))))
    lines = [f"Anzahl an erfassten Bibelstellen: {len(Refs)}", f"wörtliche Zitate: {len(stellen)}", f"Liste d. wörtl.: {stellen}",f"Länge inkl. note: {total_len}" ,f"Gesamtlänge: {str(total_len-no_len-supplied_len)}", f"Länge d. erf. <note>-Inhalte: {no_len}", f"Zitatlänge :{str(quote_len)}",f"Prozentualer Anteil: {(str(percent_calc(quote_len, total_len)))}"]
    # Speichern der obigen Ausgabe harter Fakten in einer Text-Datei mit dem Titel der ausgewählten Datei. 
    with open(f'Visualisierungen/{file_title}', 'w') as f2:
        for line in lines:
            f2.write(line)
            f2.write("\n")
    # Ausgabe, Speichern und Schließen der einzelen Diagramme. 
    print(df_data["kuerzel"].value_counts().plot(figsize=(8,8), kind="pie", legend=False, title=f"{filename}"))
    pplt.savefig(f'Visualisierungen/{file_title}.jpg')
    pplt.clf()
    pplt.cla()
    pplt.close()
    print(df_data["atnt"].value_counts().plot(figsize=(8,8), kind="bar", legend=False, title=f"{filename}", color=["blue", "red"]))
    pplt.savefig(f'Visualisierungen/{file_title}_atnt.jpg')
    pplt.clf()
    pplt.cla()
    pplt.close()
    print(df_data["kuerzel"].value_counts()[:5].plot(figsize=(8,8), kind="barh", legend=False, title=f"{filename}  @cRef)", color="purple"))
    pplt.savefig(f'Visualisierungen/{file_title}_cref.jpg')
    pplt.clf()
    pplt.cla()
    pplt.close()
    print(df_data["stelle"].value_counts()[:10].plot(figsize=(8,8), kind="barh", legend=False, title=f"{filename} nach Stellen", color="red"))
    pplt.savefig(f'Visualisierungen/{file_title}_nach_stellen.jpg')
    pplt.clf()
    pplt.cla()
    pplt.close()
    
    

In [3]:
def no_bible_quote(filename):
    '''Opens a file specified by the filename, including the directory and creates dataframe for the respective file. Subsequently, this function analyses the chapter with regard to our criteria'''
    file_title = filename.split(".")[0]
    with open(filename, 'r') as f:
        file = f.read() 
    ges = bs.BeautifulSoup(file, 'lxml')
    ges_soup = ges.select('div')
    print(ges_soup)
    notes = ges.find_all("note")
    supplied = ges.body.select("supplied")
    # Zählen der Gesamtlänge des Textes; ohne Leerzeilen und Zeilenumbrüche
    total_len=0
    for i in range(len(ges_soup)):
        total_len += len(ges_soup[i].getText().replace(" ", "").replace("\n", ""))
     
   # Berechnung der Länge des Textes aller note-Elemente 
       # Berechnung der Länge des Textes aller note-Elemente 
    no_len = 0
    for i in range(len(notes)):
        #print(len(notes[i].getText()))
        no_len += len(notes[i].getText().replace(" ","").replace("\n", ""))
    
    
    supplied_len = 0
    for i in range(len(supplied)):
        supplied_len += len(supplied[i].getText().replace(" ", "").replace("\n", ""))
    print(filename)
    total = total_len - no_len - supplied_len
    print("Gesamtlänge:" +str(total_len-no_len-supplied_len))

# Daten für das gesamte Interim

In [4]:

analyse('interim4.xml')


[<div><pb facs="#facs_6" n="1r" xml:id="img_0006"></pb>
<lb facs="#facs_6_r1l1" n="N001"></lb>Der Römischen
               <lb facs="#facs_6_r1l2" n="N002"></lb><rs ref="#karl_v" type="person">Keyserlichen Majestat</rs>
<lb facs="#facs_6_r1l3" n="N003"></lb>Erklärung, wie es der <w>Reli<pc>-</pc>
<lb></lb>gion</w> halben imm <rs ref="#heiliges_roemisches_reich" type="place">heyligen
               <lb facs="#facs_6_r1l5" n="N005"></lb>Reich</rs> biß zu Außtrag deß
               <lb facs="#facs_6_r1l6" n="N006"></lb>gemeinen Concilii gehalten
               <lb facs="#facs_6_r1l7" n="N007"></lb>werden sol, auff dem <w>Reichß<pc>-</pc>
<lb></lb>tag</w> zu <rs ref="#augsburg" type="place">Augspurg</rs> den XV. Maii im M. D. XLVIII.
               <lb facs="#facs_6_r1l9" n="N009"></lb>Jar publiciert unnd eröffnet unnd von <w>ge<pc>-</pc>
<lb></lb>meinen</w> Stenden angenommen.
            <p facs="#facs_6_TextRegion_1621431804573_61">
<lb facs="#facs_6_r1l11" n="N001"></lb>Christo Auspice

In [5]:
analyse('chapters/vorrede.xml')

ValueError: Length mismatch: Expected axis has 0 elements, new values have 4 elements

In [None]:
with open('interim.xml', 'r') as f:
    file = f.read()   
ges2 = bs.BeautifulSoup(file, 'lxml')
ges_soup2 = ges2.select('div')
print(len(ges_soup2[0]))
print(ges_soup2[0].getText(" ", strip=True))

In [None]:
for i in range(1,27):
    if i <10:
        print(f"analyse('chapters/0{i}.xml')")
    else:
        print(f"analyse('chapters/{i}.xml')")
    i +=1

In [None]:
analyse('chapters/01.xml')
analyse('chapters/02.xml')
analyse('chapters/03.xml')
analyse('chapters/04.xml')
analyse('chapters/05.xml')
analyse('chapters/06.xml')
analyse('chapters/07.xml')
analyse('chapters/08.xml')
analyse('chapters/09.xml')
analyse('chapters/10.xml')
analyse('chapters/11.xml')
analyse('chapters/12.xml')
analyse('chapters/13.xml')
#analyse('chapters/14.xml')
analyse('chapters/15.xml')
analyse('chapters/16.xml')
analyse('chapters/17.xml')
analyse('chapters/18.xml')
analyse('chapters/19.xml')
analyse('chapters/20.xml')
analyse('chapters/21.xml')
analyse('chapters/22.xml')
analyse('chapters/23.xml')
analyse('chapters/24.xml')
#analyse('chapters/25.xml')
analyse('chapters/26.xml')

In [None]:
no_bible_quote('chapters/vorrede.xml')

In [None]:
944

In [None]:
analyse('chapters/05.xml')

In [None]:
for i in range(1,27):
    if i < 10:        
        print(f"no_bible_quote('chapters/0{i}.xml')")
    else:
        print(f"no_bible_quote('chapters/{i}.xml')")

In [None]:
no_bible_quote('chapters/01.xml')
no_bible_quote('chapters/02.xml')
no_bible_quote('chapters/03.xml')
no_bible_quote('chapters/04.xml')
no_bible_quote('chapters/05.xml')
no_bible_quote('chapters/06.xml')
no_bible_quote('chapters/07.xml')
no_bible_quote('chapters/08.xml')
no_bible_quote('chapters/09.xml')
no_bible_quote('chapters/10.xml')
no_bible_quote('chapters/11.xml')
no_bible_quote('chapters/12.xml')
no_bible_quote('chapters/13.xml')
no_bible_quote('chapters/14.xml')
no_bible_quote('chapters/15.xml')
no_bible_quote('chapters/16.xml')
no_bible_quote('chapters/17.xml')
no_bible_quote('chapters/18.xml')
no_bible_quote('chapters/19.xml')
no_bible_quote('chapters/20.xml')
no_bible_quote('chapters/21.xml')
no_bible_quote('chapters/22.xml')
no_bible_quote('chapters/23.xml')
no_bible_quote('chapters/24.xml')
no_bible_quote('chapters/25.xml')
no_bible_quote('chapters/26.xml')

In [None]:
no_bible_quote('chapters/vorrede.xml')

In [None]:
no_bible_quote('interim4.xml')