## Loudness Level Labeling with a Dictionary Approach

In [1]:
# Import 
import os
import pandas as pd
import regex as re
from pathlib import Path
from collections import Counter
import csv

Read the saved csv as a pandas dataframe.

In [2]:
csv_file_path = '/Users/sguhr/Desktop/Diss_notebooks/Diss_data_notebooks_man_anno/se_predicted_subcorpora/20240501_sound_spans_output_Subcorpus_1848-60_predicted_revised1.csv'

# Read the CSV file into a Pandas DataFrame
diss_corpus_annotations = pd.read_csv(csv_file_path)

# Display the DataFrame
print(diss_corpus_annotations.head())


                                                File  \
0  Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...   
1                        Ring_Max_Vom_alten_Heim.xml   
2                                  Ring_Max_Sand.xml   
3              von_Kessel_Karl_Das_Waldbluemchen.xml   
4  Wildermuth_Ottilie_Onkel_Gottliebs_Jugendliebe...   

                                 Ambient Sound Spans  \
0  ['erzitterten', 'Glockenzug mit einer ganzen C...   
1                                                 []   
2  ['Beide stießen zuerst mit den klingenden Röme...   
3  ['Ein Schrei des Schreckens entfuhr dem jungen...   
4  ['das Kindlein,', 'Jubel begrüßt', 'Da erschol...   

                               Character Sound Spans  
0  ['und wurde bei allen Familienfesten gebeten',...  
1  ['Während er sprach', 'mit denen er sich unter...  
2  ['je länger sie mitsammen sprachen', 'wenn das...  
3  ['Auch führten sie fast allein die Unterhaltun...  
4  ['begann die kleine Hedwig bedenklich', 'rief ..

This code reads the CSV file, cleans the string representations of lists in each row, and then extracts the file name, ambient sound spans, and character sound spans. Finally, it prints the extracted data for verification.

In [3]:
import csv

# Define a function to clean the string representation of lists
def clean_list_string(list_string):
    # Remove leading and trailing whitespace
    cleaned = list_string.strip()
    # Remove leading and trailing square brackets
    cleaned = cleaned.strip("[]")
    # Split the string into a list using comma as separator
    cleaned_list = cleaned.split(", ")
    # Remove leading and trailing quotes from each element in the list
    cleaned_list = [element.strip("'\"") for element in cleaned_list]
    return cleaned_list

# Define a function to process each row of the CSV
def process_csv_row(row):
    file_name = row['File']
    ambient_sound_spans = clean_list_string(row['Ambient Sound Spans'])
    character_sound_spans = clean_list_string(row['Character Sound Spans'])
    return file_name, ambient_sound_spans, character_sound_spans

# Read the CSV file and process each row
#csv_file_path = 'your_csv_file.csv'  # Replace 'your_csv_file.csv' with the path to your CSV file
sound_data = []
with open(csv_file_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        sound_data.append(process_csv_row(row))

# Print the extracted data for verification
for file_name, ambient_sound_spans, character_sound_spans in sound_data:
    print("File:", file_name)
    print("Ambient Sound Spans:", ambient_sound_spans)
    print("Character Sound Spans:", character_sound_spans)
    print()


File: Wildermuth_Ottilie_Geschichten_aus_Schwaben_Das_erfolgreiche_Conzert.xml
Ambient Sound Spans: ['erzitterten', 'Glockenzug mit einer ganzen Chinesenfamilie', 'Tagelang erschallte die ganze Straße von den schmelzenden Tönen', 'in denen sie sich einübte', 'so wurde doch das Duett glücklich unter rauschendem Applaus zu Ende gesungen', 'Nach einem Solo des Provisors und einem Chor mit Echo', 'vorgetragen vom Liederkranz', 'wobei die Sänger', 'die das Echo vorstellten', 'sich unter das Bett im anstoßenden Schlafkabinet legten', "Elf Uhr schlug's"]
Character Sound Spans: ['und wurde bei allen Familienfesten gebeten', 'Therese schlug das Clavier', 'sang auch mit heller Stimme', 'wie der Papa sie in zärtlichen Stunden nannten zu allen häuslichen Geschäften angehalten', 'Therese sang', 'schlug den Pantalon', 'wenn er auch außer einigen allgemeinen Bemerkungen', 'als', 'dann mit hochklopfendem Herzen Nanettle', 'er unterhielt sich mit keiner der Schwestern', 'bloß mit der Tante und Nanettle

The following code reads the CSV file, processes each row to convert the spans into separate DataFrames for ambient and character sounds, and then concatenates all the DataFrames into a single DataFrame. The resulting DataFrame contains two columns: "annotation_span" and "annotation_class", where each row represents a single span and its corresponding class. Furthermore, it adds the filename of the file the each sound span had been extracted from.

In [4]:
#without lemmatization
import pandas as pd
import csv

# Define a function to convert the list of spans to a DataFrame
def spans_to_dataframe(spans, annotation_class, filename):
    df = pd.DataFrame({'annotation_span': spans, 'annotation_class': annotation_class, 'filename': filename})
    return df

# Define a function to process each row of the CSV
def process_csv_row(row):
    file_name = row['File']
    ambient_sound_spans = clean_list_string(row['Ambient Sound Spans'])
    character_sound_spans = clean_list_string(row['Character Sound Spans'])
    
    # Convert spans to DataFrame
    ambient_df = spans_to_dataframe(ambient_sound_spans, 'ambient_sound', file_name)
    character_df = spans_to_dataframe(character_sound_spans, 'character_sound', file_name)
    
    return ambient_df, character_df

# Read the CSV file and process each row
#csv_file_path = 'your_csv_file.csv'  # Replace 'your_csv_file.csv' with the path to your CSV file
sound_data = []
with open(csv_file_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ambient_df, character_df = process_csv_row(row)
        sound_data.append(ambient_df)
        sound_data.append(character_df)

# Concatenate the DataFrame for each row into a single DataFrame
final_df = pd.concat(sound_data, ignore_index=True)

# Print the resulting DataFrame
print(final_df)


                                         annotation_span annotation_class  \
0                                            erzitterten    ambient_sound   
1            Glockenzug mit einer ganzen Chinesenfamilie    ambient_sound   
2      Tagelang erschallte die ganze Straße von den s...    ambient_sound   
3                              in denen sie sich einübte    ambient_sound   
4      so wurde doch das Duett glücklich unter rausch...    ambient_sound   
...                                                  ...              ...   
45587                                     sie abzulehnen  character_sound   
45588                                 wie du gesagt hast  character_sound   
45589                               So hat sie Ja gesagt  character_sound   
45590  und einen stillen Einzug gehalten in das Pfarr...  character_sound   
45591                      Aber ihr Mann wurde gepriesen  character_sound   

                                                filename  
0      Wildermut

The spans_to_dataframe function includes the lemmatized spans in a separate column called "lemmatized_sound_span". Additionally, in the process_csv_row function it passes both the original spans and the lemmatized spans to the spans_to_dataframe function.
The spans_to_dataframe function accepts four arguments: original_spans, lemmatized_spans, annotation_class, and filename. We pass both the original and lemmatized spans to this function in the process_csv_row function.

In [6]:
#import pandas as pd
#import csv
import spacy

# Load the German medium model
nlp = spacy.load('de_core_news_md')

# Define a function to lemmatize the spans
def lemmatize_spans(spans):
    lemmatized_spans = []
    for span in spans:
        doc = nlp(span)
        lemmatized_span = ' '.join([token.lemma_ for token in doc])
        lemmatized_spans.append(lemmatized_span.lower())  # Convert to lowercase
    return lemmatized_spans

# Define a function to convert the list of spans to a DataFrame
def spans_to_dataframe(original_spans, lemmatized_spans, annotation_class, filename):
    df = pd.DataFrame({'filename': filename, 'sound_span': original_spans, 'annotation_class': annotation_class, 'lemmatized_sound_span': lemmatized_spans})
    return df

# Define a function to process each row of the CSV
def process_csv_row(row):
    file_name = row['File']
    ambient_sound_spans = clean_list_string(row['Ambient Sound Spans'])
    character_sound_spans = clean_list_string(row['Character Sound Spans'])
    
    # Lemmatize the spans
    lemmatized_ambient_spans = lemmatize_spans(ambient_sound_spans)
    lemmatized_character_spans = lemmatize_spans(character_sound_spans)
    
    # Convert spans to DataFrame
    ambient_df = spans_to_dataframe(ambient_sound_spans, lemmatized_ambient_spans, 'ambient_sound', file_name)
    character_df = spans_to_dataframe(character_sound_spans, lemmatized_character_spans, 'character_sound', file_name)
    
    return ambient_df, character_df

# Read the CSV file and process each row
#csv_file_path = 'your_csv_file.csv'  # Replace 'your_csv_file.csv' with the path to your CSV file
sound_data = []
with open(csv_file_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ambient_df, character_df = process_csv_row(row)
        sound_data.append(ambient_df)
        sound_data.append(character_df)

# Concatenate the DataFrame for each row into a single DataFrame
final_df = pd.concat(sound_data, ignore_index=True)

# Print the resulting DataFrame
print(final_df)


KeyboardInterrupt: 

The following code defines a function find_sound_words that takes a string of text as input, splits it into words, and then checks if each word is a key in the sound dictionary. If a word is found in the dictionary, it adds it to a list. Finally, it adds a new column to your dataframe called 'found_sound_words', which contains lists of found sound words for each lemmatized sound_span.

In [7]:

loudness_dict = {
'abbringen': 3, 'abgeschiedenheit': 1, 'ablehnen': 3, 'abmachen': 3, 'Abmahnung': 3, 'absprechen': 3, 'aburteilen': 3, 'accorde': 4, 'ächz': 4, 'ächzen': 4, 'ächzend': 4, 'aechzen': 4, 'anbieten': 3, 'anbrüllen': 4, 'andacht': 0, 'andeuten': 3, 'andichten': 3, 'anempfehlen': 3, 'Anempfehlung': 3, 'anfechten': 4, 'angeben': 3, 'anhören': 0, 'anklagen': 3, 'anlächeln': 0, 'anpöbeln': 4, 'anprangern': 4, 'anpreisen': 3, 'anraten': 3, 'anreden': 3, 'anrufen': 4, 'anschreien': 4, 'anschwärzen': 3, 'ansprechen': 3, 'antworteen': 3, 'antworten': 3, 'anweisung': 3, 'applaudieren': 4, 'Applaus': 4, 'Atem': 2, 'atem': 1, 'atemholen': 1, 'atemzug': 1, 'atemzüge': 1, 'atemzügen': 1, 'athem': 1, 'athemholen': 1, 'athemzug': 1, 'athmen': 1, 'athmend': 1, 'athmet': 1, 'atme': 1, 'atmeen': 1, 'atmen': 1, 'atmend': 1, 'atmeter': 1, 'aufathmen': 1, 'aufathmend': 1, 'aufatmen': 1, 'aufatmend': 1, 'auffordern': 3, 'aufhorchen': 1, 'aufruhr': 4, 'aufschluchzen': 2, 'aufschreien': 4, 'aufseufzen': 3, 'aufseufzend': 3, 'aufspiele': 3, 'aufstassen': 4, 'aufstöhnend': 3, 'auftrag': 3, 'auftreten': 3, 'aufwartung': 3, 'aufzuatmen': 1, 'auseinandersetzung': 4, 'auslachen': 4, 'auspacken': 3, 'auspfeifen': 4, 'ausrufen': 4, 'ausschreit': 4, 'äußern': 3, 'aussprechen': 3, 'ausstoßen': 4, 'ausstoßend': 3, 'auszustoßen': 4, 'axthieb': 5, 'bahnhofslärm': 4, 'ballern': 4, 'barsch': 4, 'baßtrompete': 4, 'Baulärm': 5, 'beachten': 3, 'beanstanden': 3, 'beantworten': 3, 'bedauern': 3, 'befrage': 3, 'befriedigen': 3, 'beglückwünschen': 3, 'begrüßen': 3, 'begrüßungsformalität': 3, 'behaupten': 3, 'beherrschen': 3, 'beichten': 3, 'beifall': 4, 'Beifall': 4, 'Beifallsäußerung': 4, 'beipflichten': 3, 'bejahen': 3, 'bejahend': 3, 'bejubeln': 4, 'bekenntniß': 3, 'beklagen': 3, 'beklatschen': 4, 'beklommen': 0, 'bekräftigen': 4, 'belächeln': 1, 'belauschen': 1, 'belehren': 3, 'bellen': 4, 'bellend': 4, 'bemäkeln': 3, 'bemerken': 3, 'beraten': 3, 'bereden': 3, 'berichten': 3, 'berichtigen': 3, 'bersten': 4, 'beruhigen': 3, 'beschämt': 0, 'bescheidener': 3, 'beschimpfen': 4, 'beschimpft': 4, 'beschuldigen': 4, 'Beschuss': 5, 'beschwatzen': 3, 'beschwichtigen': 3, 'beschwören': 3, 'besingen': 4, 'besprechen': 3, 'bestätigen': 3, 'bestimmen': 3, 'bestreiten': 4, 'bestürzt': 3, 'beteuern': 3, 'betonen': 4, 'bewilligen': 3, 'bezichtigen': 4, 'bitter': 4, 'blasen': 4, 'blasend': 4, 'Blasmusik': 4, 'blöken': 4, 'blökend': 4, 'bloßstellen': 3, 'bombe': 5, 'bösartigen': 4, 'Brandung': 4, 'brausen': 4, 'brauste': 4, 'Brise': 2, 'Brüllen': 4, 'brüllen': 4, 'brüllend': 4, 'brülln': 4, 'brüllte': 4, 'brummen': 2, 'brüskieren': 4, 'buchstabierte': 3, 'bumm': 5, 'chor': 4, 'dämpfen': 2, 'dampfpfeife': 5, 'darbracht': 3, 'declamirende': 4, 'denunzieren': 3, 'Detonation': 5, 'diffamieren': 4, 'diskret': 2, 'diskutieren': 4, 'Donner': 5, 'donner': 5, 'Donnerklang': 5, 'donnern': 5, 'donnernd': 5, 'Donnerschlag': 5, 'donnerschlag': 5, 'drängelen': 4, 'drehen': 3, 'Dröhnen': 5, 'dröhnen': 5, 'dröhnend': 5, 'Druckwelle': 5, 'dumpf': 2, 'dumpfen': 1, 'durchbrechen': 3, 'Echo': 3, 'echo': 4, 'echot': 4, 'einatmend': 1, 'eindreangen': 3, 'einflüstern': 2, 'eingeschlafen': 1, 'einholen': 3, 'einladen': 3, 'einlullende': 2, 'einräumen': 3, 'einsam': 0, 'einschlaufen': 1, 'einschlürfend': 2, 'einsilbig': 3, 'Einspruch': 4, 'einstimmen': 3, 'einstürzend': 5, 'eintönig': 3, 'Einwand': 4, 'Einwendung': 4, 'einwilligen': 3, 'Einwilligung': 3, 'Einwurf': 4, 'empfehlen': 3, 'entfuhr': 3, 'entgegenträumend': 0, 'entgegnen': 3, 'entschuldigend': 3, 'erdröhnen': 5, 'erfahren': 3, 'ergänzen': 3, 'erhobener': 4, 'erklären': 3, 'erklingen': 3, 'erlassen': 3, 'erlauben': 3, 'Ermahnung': 4, 'erniedrigen': 4, 'ernst': 3, 'erröten': 1, 'erröthen': 1, 'erschallen': 4, 'erscholl': 4, 'ersticken': 2, 'erstickt': 2, 'ersuchen': 3, 'ertönen': 3, 'erwideren': 3, 'erwidern': 3, 'erwiederen': 3, 'erwiedern': 3, 'erzählen': 3, 'erzählstn': 3, 'erzählung': 3, 'exkommunizieren': 3, 'explodieren': 5, 'Explosion': 5, 'explosion': 5, 'Fanfare': 4, 'fassen': 3, 'fauchen': 4, 'feiern': 4, 'feuern': 5, 'fiepen': 4, 'flatternd': 3, 'flehen': 3, 'flennen': 4, 'fließen': 3, 'floss': 3, 'flöten': 4, 'fluchen': 4, 'fluchn': 4, 'Fluglärm': 5, 'flüsteren': 2, 'Flüstern': 2, 'flüstern': 2, 'flüsternd': 2, 'flüstert': 2, 'flüsterte': 2, 'föppelt': 3, 'forte': 4, 'fortfahren': 3, 'fortfuhr': 3, 'fortissimo': 5, 'fragen': 3, 'fragt': 3, 'freundlicher': 3, 'friede': 0, 'Friedhofsruhe': 0, 'frohlocken': 3, 'frug': 3, 'fügen': 3, 'fußtritt': 3, 'gackern': 4, 'gähnen': 1, 'galoppiern': 4, 'Gardinenpredigt': 4, 'geäußert': 3, 'Gebärde': 1, 'Gebärden': 1, 'Gebärdensprache': 1, 'gebell': 4, 'gebet': 0, 'gebetsglocke': 4, 'Gebrüll': 4, 'gebrüll': 4, 'gedämpft': 2, 'Gedröhne': 5, 'Gedudel': 4, 'geflüstert': 2, 'Gegenrede': 4, 'Gegenstimme': 4, 'geheim': 2, 'geheul': 4, 'Geheul': 4, 'Gehupe': 5, 'Gejammer': 4, 'Gejohle': 4, 'Geklapper': 4, 'geklirr': 4, 'Geknatter': 4, 'gekrache': 5, 'Gelächter': 4, 'Geläute': 4, 'gellen': 4, 'gellend': 4, 'gellt': 4, 'geloben': 3, 'Gemurmel': 2, 'gemurmel': 2, 'genehmigen': 3, 'Geplapper': 4, 'Geplätscher': 4, 'Gepolter': 4, 'gepolter': 5, 'geprassel': 4, 'Gerassel': 4, 'gerassel': 4, 'Geräusch': 3, 'geräusch': 3, 'Geräusche': 3, 'geräuschlos': 1, 'geräuschvoll': 4, 'Gesang': 4, 'Geschnatter': 4, 'Geschrei': 4, 'geschrei': 4, 'geschrieen': 4, 'geschrien': 4, 'gespräch': 3, 'gesprächig': 3, 'gesungene': 4, 'getadelt': 4, 'Getöse': 4, 'getöse': 4, 'Gewitter': 4, 'Gezeter': 4, 'Glockenläuten': 5, 'glockenton': 4, 'glucksen': 2, 'Grabesstille': 0, 'grabesstille': 0, 'granate': 5, 'gratulieren': 3, 'gröhlen': 4, 'grollen': 4, 'grübelen': 0, 'grunzen': 4, 'gruß': 3, 'grüßen': 3, 'gurgeln': 3, 'gurgelnd': 3, 'gurgelton': 2, 'gurren': 3, 'halblaut': 3, 'halbleisen': 2, 'halbleise': 2, 'Hall': 4, 'hall': 4, 'hallen': 4, 'hämmer': 5, 'hämmern': 5, 'hammerschlag': 5, 'Händeklatschen': 4, 'Handschlag': 3, 'hauchen': 1, 'hauen': 4, 'hehehehehemeh': 4, 'heiben': 4, 'heiser': 2, 'heiseren': 2, 'hellhörig': 1, 'herauslach': 4, 'hervorheben': 4, 'hervorrufen': 4, 'Herzschlag': 1, 'herzschlag': 1, 'hetzen': 4, 'heulen': 4, 'heulend': 4, 'heuln': 4, 'heulte': 4, 'hieb': 4, 'hilferuf': 4, 'hinzufügen': 3, 'hinzusetzen': 3, 'ho-ho': 4, 'höhnen': 4, 'höhnisch': 4, 'höhnisches': 4, 'Höllenlärm': 5, 'hörbar': 2, 'horchen': 1, 'Hören': 1, 'hören': 1, 'hörte': 1, 'hufe': 4, 'hufschlag': 4, 'huldigen': 3, 'hülfe': 3, 'hupen': 5, 'hurra': 4, 'huschen': 3, 'hüstelen': 2, 'husten': 3, 'Hymne': 4, 'iah': 3, 'ignorieren': 1, 'intoniert': 4, 'isarrausch': 4, 'jammeren': 3, 'jammern': 3, 'jammernd': 3, 'jauchzen': 4, 'jauchzer': 4, 'jaulen': 4, 'jodeln': 4, 'johlen': 4, 'johlten': 4, 'jubeln': 4, 'kalt': 3, 'karikieren': 3, 'keifen': 4, 'keuchen': 3, 'keuchend': 3, 'kicheren': 3, 'kichern': 3, 'kichernd': 3, 'kikeriki': 4, 'Kinderlärm': 4, 'kläffen': 4, 'klage': 3, 'klageenden': 3, 'klagelaut': 4, 'klagen': 3, 'Klang': 3, 'Klänge': 3, 'klangvoll': 3, 'klappern': 4, 'Klappern': 4, 'klatschen': 4, 'klatschend': 4, 'kleinlaut': 2, 'klicken': 2, 'klingeln': 4, 'klingelzeichen': 3, 'klingen': 4, 'klingend': 3, 'klingender': 3, 'klirre': 4, 'klirren': 4, 'klirrend': 4, 'klopfen': 4, 'klopfte': 4, 'knabenstimm': 3, 'knacken': 2, 'Knall': 5, 'knall': 5, 'knallen': 5, 'knarren': 4, 'knarrend': 4, 'knatteren': 4, 'knattern': 4, 'knirschen': 2, 'knirschend': 2, 'knistern': 2, 'knurren': 4, 'knurrn': 4, 'kollern': 4, 'kommandieren': 4, 'kommandorufe': 4, 'kommandoworte': 4, 'konstatieren': 3, 'Kopfnicken': 1, 'Kopfschütteln': 1, 'Krach': 4, 'krach': 4, 'Krachen': 5, 'krachen': 5, 'krachend': 5, 'krächzen': 4, 'krächzend': 4, 'krähen': 4, 'kratzen': 3, 'kreischen': 4, 'kreischend': 4, 'kreischte': 4, 'kritisieren': 3, 'Kuckucksruf': 3, 'küßen': 2, 'lächelen': 1, 'lächelt': 1, 'Lachen': 4, 'lachen': 4, 'lachend': 4, 'lacht': 4, 'lachten': 4, 'laden': 3, 'lallen': 4, 'langsamer': 3, 'Lärm': 4, 'lärm': 4, 'Lärmbelästigung': 4, 'lärmen': 4, 'lärmend': 4, 'lästern': 3, 'lauschen': 1, 'Laut': 4, 'läuten': 4, 'lautlos': 0, 'Lautlos': 0, 'lautlose': 0, 'lautlosigkeit': 0, 'Leise': 2, 'leiser': 2, 'lesen': 3, 'leseprobe': 3, 'leutseliger': 3, 'liedchen': 4, 'lispelen': 2, 'loben': 3, 'lobpreisen': 3, 'luftschöpfen': 1, 'mahnen': 3, 'mäkeln': 3, 'männerschritte': 3, 'markerschütternd': 5, 'Marschmusik': 4, 'mäuschenstill': 0, 'meckern': 4, 'meinen': 3, 'miau': 3, 'miauen': 3, 'Missklang': 3, 'Misston': 3, 'mißton': 4, 'misstönen': 3, 'misstönend': 3, 'mißtrauen': 3, 'mithören': 1, 'mitleidigen': 3, 'mittagsstille': 0, 'mittheilen': 3, 'monoton': 3, 'mosern': 3, 'Motorenlärm': 5, 'Mucks': 2, 'mucksmäuschenstill': 0, 'murmelen': 2, 'Murmeln': 2, 'murmeln': 2, 'murmelnd': 2, 'murren': 2, 'Musik': 4, 'nachdenken': 0, 'nachdenklich': 0, 'nachdrücklich': 3, 'Nachhall': 3, 'nachhallend': 3, 'Nachklang': 3, 'nachklingen': 3, 'Nachrede': 3, 'nachsinnen': 0, 'nachsprach': 3, 'nachtruhe': 0, 'Nachtruhe': 1, 'nachtstille': 0, 'Nebelhorn': 5, 'Nebengeräusch': 3, 'Nebengeräusche': 3, 'nennen': 3, 'niesen': 4, 'nörgeln': 3, 'Notschrei': 4, 'öd': 0, 'oede': 0, 'offerieren': 3, 'ohrenbetäubend': 5, 'ohrfeige': 4, 'orgeln': 4, 'paddelnd': 3, 'pauken': 4, 'pause': 0, 'pausierte': 0, 'peitschenartig': 4, 'peitschenhieb': 4, 'Pfeifen': 4, 'pfeifen': 4, 'pfeifend': 4, 'Pfeifkonzert': 4, 'pfiffen': 4, 'pianissimo': 2, 'piano': 2, 'Piep': 2, 'piepen': 3, 'piepsen': 2, 'plappern': 3, 'plärren': 4, 'plätschern': 3, 'plätschernd': 3, 'platzen': 4, 'plauderen': 3, 'plaudern': 3, 'plumps': 4, 'Pochen': 4, 'pochen': 1, 'polteren': 4, 'poltern': 4, 'polternd': 4, 'präsentieren': 3, 'prasselen': 3, 'prasseln': 3, 'prasselnd': 3, 'predigen': 3, 'preisen': 3, 'prompt': 3, 'pst!': 3, 'Puff': 3, 'quaken': 4, 'quengeln': 4, 'quieken': 4, 'quietschen': 4, 'Radau': 4, 'rannen': 4, 'rapportieren': 3, 'rascheln': 2, 'raschelnd': 2, 'räsonnieren': 3, 'rassel': 4, 'rasseln': 4, 'rasselnd': 4, 'rasselten': 4, 'rastlos': 3, 'ratschen': 4, 'ratschlag': 3, 'rattern': 4, 'rauh': 3, 'Raunen': 3, 'raunen': 3, 'rauschen': 3, 'rauschend': 3, 'räusperen': 3, 'rechtfertigen': 4, 'rede': 3, 'Reden': 3, 'reden': 3, 'redend': 3, 'redestrom': 3, 'resignieren': 3, 'rieseln': 2, 'rieth': 3, 'Ringgeräusch': 4, 'röcheln': 2, 'röhren': 4, 'rolln': 4, 'ruck': 4, 'ruf': 4, 'Ruf': 4, 'rufen': 4, 'rufenwort': 4, 'rügen': 4, 'ruhe': 1, 'Ruhe': 1, 'Ruhestörung': 4, 'ruhig': 1, 'rumpeln': 4, 'rütteln': 3, 'sagen': 3, 'sagte': 3, 'sagts': 3, 'salutieren': 3, 'salve': 3, 'sanft': 2, 'sang': 4, 'sann': 0, 'satzfragment': 3, 'säuseln': 3, 'sausen': 4, 'sausend': 4, 'schaben': 4, 'schalen': 4, 'Schall': 3, 'schall': 4, 'schalldicht': 1, 'schallen': 4, 'schallend': 4, 'Schallwelle': 4, 'schalt': 4, 'schäumen': 3, 'schäumend': 3, 'scheiden': 3, 'schellen': 4, 'Schelte': 4, 'schelten': 4, 'schepperen': 4, 'scherzen': 3, 'schied': 3, 'schießen': 5, 'schimpfen': 4, 'schimpfirt': 4, 'schimpfwort': 4, 'schlafe': 1, 'schlägerei': 4, 'schlagwort': 3, 'schleichen': 2, 'schleifen': 4, 'schlief': 1, 'schließen': 3, 'schlotternde': 2, 'schluchze': 3, 'schluchzen': 3, 'schluchzte': 3, 'schlucken': 1, 'schlufen': 1, 'schlummer': 1, 'schlummern': 1, 'schlurren': 3, 'schlurrte': 3, 'schmatzen': 3, 'schmettern': 5, 'schmetternd': 5, 'schnalzen': 3, 'schnarchen': 3, 'schnarren': 3, 'schnattern': 3, 'schnauben': 3, 'schnaufen': 2, 'schnauzen': 4, 'schnüffeln': 2, 'schnuppern': 2, 'schnurren': 2, 'scholl': 4, 'schöpfte': 3, 'Schrei': 4, 'schrei': 4, 'schreien': 4, 'schreiend': 4, 'schreit': 4, 'schri': 4, 'schrie': 4, 'schrieen': 4, 'schrien': 4, 'schrill': 5, 'Schritt': 2, 'Schritte': 2, 'Schuss': 5, 'schütten': 3, 'schütternd': 5, 'schwach': 2, 'schwall': 4, 'schwatzen': 3, 'Schweigen': 0, 'schweigen': 0, 'schweigend': 0, 'schweigsam': 0, 'schwellend': 3, 'schwiegen': 0, 'schwirren': 3, 'schwören': 3, 'segnen': 3, 'seufzen': 3, 'seufzer': 3, 'seufzte': 3, 'signal': 5, 'singen': 4, 'singend': 4, 'Sirene': 5, 'Sirenengesänge': 4, 'sirren': 4, 'sonntagsstille': 0, 'sonor': 3, 'sorgenschwer': 3, 'spätnachmittagsstille': 0, 'Spektakel': 4, 'spiel': 4, 'Spott': 4, 'spotten': 4, 'spöttisch': 4, 'spöttischer': 4, 'Sprache': 3, 'sprächen': 3, 'Sprachklang': 3, 'sprachlos': 0, 'Sprachlosigkeit': 0, 'sprechen': 3, 'sprengen': 5, 'Sprengung': 5, 'sprichen': 3, 'stammeln': 3, 'stammelnd': 3, 'stampfen': 4, 'sterbend': 2, 'Stereoton': 3, 'stieß': 4, 'still': 0, 'Stille': 0, 'stille': 0, 'Stillschweigen': 0, 'stillschweigen': 0, 'Stimme': 3, 'stimmen': 3, 'Stimmengewirr': 4, 'stimmlos': 2, 'stimmung': 3, 'stocken': 0, 'stockend': 0, 'stöhnen': 4, 'stöhnend': 4, 'stöhnte': 4, 'stoßen': 4, 'stotteren': 3, 'Strafpredigt': 4, 'Strafrede': 4, 'Straßenlärm': 4, 'streichelen': 2, 'streiten': 4, 'stumm': 0, 'Stummheit': 1, 'sturm': 4, 'Sturm': 4, 'sturmgeläute': 4, 'stürmisch': 4, 'Sturzbach': 4, 'stürzen': 4, 'summen': 2, 'summn': 2, 'surren': 2, 'sympathisierend': 3, 'Tadel': 4, 'tadelen': 3, 'tadeln': 4, 'taktlos': 3, 'taktmäßig': 3, 'Tamtam': 4, 'tanzweisen': 4, 'täppischer': 3, 'tätschelen': 2, 'tauschen': 3, 'theilen': 3, 'ticken': 2, 'Tierlaut': 3, 'Tierlaute': 3, 'Tierstimme': 3, 'toast': 4, 'toben': 4, 'tobend': 4, 'todesruhe': 0, 'todtenstill': 0, 'todtenstille': 0, 'tone': 3, 'Töne': 4, 'töne': 4, 'tönen': 4, 'tosen': 4, 'tost': 4, 'totenstill': 0, 'Totenstille': 0, 'totenstille': 0, 'trampeln': 4, 'tremolieren': 4, 'trillern': 4, 'Trommeln': 4, 'Trommelschlag': 4, 'Trommelwirbel': 4, 'trommelwirbel': 5, 'trompet': 4, 'trompeten': 4, 'trompetend': 4, 'Trompetenstoß': 4, 'tröpfeln': 2, 'trubel': 4, 'trutzliedl': 4, 'Tumult': 4, 'tumult': 4, 'tumultuös': 4, 'tuschelen': 2, 'tuscheln': 2, 'überreden': 3, 'übertönen': 5, 'überzeugen': 3, 'umschlich': 2, 'umstimmen': 3, 'umwerben': 3, 'unartikuliert': 2, 'unerhört': 4, 'unterbrechen': 3, 'unterhalten': 3, 'Unterhaltungsmusik': 4, 'unterrichten': 3, 'urteilen': 3, 'verabschieden': 3, 'verfluchen': 4, 'verhallen': 2, 'verhalten': 0, 'verhandeln': 3, 'verharren': 0, 'verhauchen': 1, 'verhöhnen': 4, 'verkehren': 3, 'Verkehrslärm': 4, 'verkündet': 4, 'verkündigen': 4, 'verlachen': 4, 'vernehmen': 0, 'verneinen': 3, 'verschweigen': 0, 'versetzen': 3, 'versichern': 3, 'verspotten': 4, 'versprechen': 3, 'verständigen': 3, 'verstummen': 0, 'versunken': 0, 'verteidigen': 4, 'verunglimpfen': 4, 'verurteilen': 3, 'verwundertem': 3, 'verwünschung': 4, 'verzeihung': 3, 'verzerrt': 3, 'vielstimmig': 4, 'vorgesagen': 3, 'vorhalten': 4, 'vorlesen': 3, 'vorschlagen': 3, 'vorsprechen': 3, 'Vorstellung': 3, 'Vortrag': 4, 'vortragen': 4, 'vorwerfen': 4, 'Vorwurf': 4, 'Vorwürfe': 4, 'vorwurfsvoll': 4, 'waldesfrieden': 0, 'wau': 4, 'wauwau': 4, 'weinen': 3, 'weinend': 3, 'weinte': 3, 'weisen': 3, 'Wellenschlag': 4, 'wettern': 4, 'wetzen': 4, 'Widerhall': 3, 'widerhallen': 3, 'Widerlegung': 3, 'Widerrede': 4, 'widersprechen': 3, 'Widerspruch': 4, 'widerwillig': 3, 'wiederholen': 3, 'wiederholt': 3, 'wiederholte': 3, 'wiehern': 4, 'wiehernd': 4, 'willkommen': 3, 'wimmeren': 2, 'wimmern': 2, 'wimmert': 2, 'Windgeräusche': 3, 'windstoß': 4, 'winseln': 3, 'wirbeln': 3, 'wisperen': 2, 'wispern': 2, 'Wohlklang': 3, 'wollen': 3, 'worgeln': 4, 'Wort': 3, 'Worte': 3, 'wortlos': 0, 'wünschten': 3, 'würdigen': 3, 'wütend': 4, 'zanken': 4, 'zauderen': 0, 'Zeichensprache': 1, 'zerbersten': 4, 'zerbrechen': 4, 'zerknäulten': 3, 'zerplatzen': 4, 'zerspringen': 4, 'zerstreuen': 3, 'zetern': 4, 'zeterte': 4, 'zirpen': 3, 'zischelen': 2, 'zischeln': 2, 'Zischen': 2, 'zischen': 2, 'zitteren': 3, 'zittern': 1, 'zögeren': 0, 'zögern': 0, 'zögernd': 3, 'zubilligen': 3, 'zuflüsteren': 2, 'zuflüstern': 2, 'zugeben': 3, 'Zugeständnis': 3, 'zugestehen': 3, 'zuhören': 1, 'zujubeln': 4, 'zuprosten': 4, 'zureden': 3, 'zurufen': 4, 'Zusage': 3, 'zusagen': 3, 'zusammenstauchen': 4, 'zusammentrommeln': 4, 'zusichern': 3, 'zustimmen': 3, 'zustimmend': 3, 'zuzurufen': 4, 'Zwischenruf': 4, 'zwitschern': 3
}

# Function to find words in a text that are keys in the sound dictionary
def find_sound_words(text):
    sound_words = []
    for word in text.split():
        if word in loudness_dict:
            sound_words.append(word)
    return sound_words

# Add a column with the list of found sound words for each lemmatized sound_span
final_df['found_sound_words'] = final_df['lemmatized_sound_span'].apply(find_sound_words)


KeyError: 'lemmatized_sound_span'

In [14]:
final_df[:10]

Unnamed: 0,filename,sound_span,annotation_class,lemmatized_sound_span,found_sound_words
0,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,erzitterten,ambient_sound,erzittert,[]
1,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,Glockenzug mit einer ganzen Chinesenfamilie,ambient_sound,glockenzug mit ein ganz chinesenfamilie,[]
2,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,Tagelang erschallte die ganze Straße von den s...,ambient_sound,tagelang erschallen der ganz straße von der sc...,[erschallen]
3,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,in denen sie sich einübte,ambient_sound,in der sie sich einüben,[]
4,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,so wurde doch das Duett glücklich unter rausch...,ambient_sound,so werden doch der duett glücklich unter rausc...,"[rauschend, singen]"
5,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,Nach einem Solo des Provisors und einem Chor m...,ambient_sound,nach ein solo der provisor und ein chor mit echo,"[chor, echo]"
6,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,vorgetragen vom Liederkranz,ambient_sound,vortragen von liederkranz,[vortragen]
7,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,wobei die Sänger,ambient_sound,wobei der sänger,[]
8,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,die das Echo vorstellten,ambient_sound,der der echo vorstellen,[echo]
9,Wildermuth_Ottilie_Geschichten_aus_Schwaben_Da...,sich unter das Bett im anstoßenden Schlafkabin...,ambient_sound,sich unter der bett in anstoßend schlafkabinet...,[]


In [15]:
# Count empty and non-empty lists in the 'found_sound_words' column
empty_list_count = final_df['found_sound_words'].apply(lambda x: len(x) == 0).sum()
non_empty_list_count = final_df['found_sound_words'].apply(lambda x: len(x) > 0).sum()

print("Number of empty lists in 'found_sound_words' column:", empty_list_count)
print("Number of non-empty lists in 'found_sound_words' column:", non_empty_list_count)


Number of empty lists in 'found_sound_words' column: 14874


In [ ]:
The following code matches the found sound words with their loudness levels providing a list of values in the new column "listed_loudness_values".

In [ ]:
# Define a function to map sound words to their loudness levels
def map_to_loudness(sound_words):
    return [sound_dict[word] for word in sound_words if word in sound_dict]

# Apply the function to the 'found_sound_words' column and create the new column 'listed_loudness_values'
final_df['listed_loudness_values'] = final_df['found_sound_words'].apply(map_to_loudness)

# Print the DataFrame with the new column
print(final_df)

In [ ]:
# Define a function to calculate the average loudness value
def calculate_average_loudness(listed_loudness_values):
    if not listed_loudness_values:
        return None
    numeric_values = [value for value in listed_loudness_values if pd.notna(value)]
    if not numeric_values:
        return None
    # Round the average loudness value to one decimal place
    return round(sum(numeric_values) / len(numeric_values), 1)

# Apply the function to the 'listed_loudness_values' column and create the new column 'average_loudness_value'
final_df['average_loudness_value'] = final_df['listed_loudness_values'].apply(calculate_average_loudness)

# Print the DataFrame with the new column
print(final_df)


In [ ]:
# Filter out rows where "average_loudness_value" is NaN
filtered_df = final_df.dropna(subset=['average_loudness_value'])

# Extracting "sound_span" and "average_loudness_value" columns as a dictionary
sound_loudness_dict = filtered_df.set_index('sound_span')['average_loudness_value'].to_dict()

# Print the dictionary
print(sound_loudness_dict)



To add the defined average loudness value as a loudness attribute to the XML element surrounding the sound event span in the XML file, you'll need to parse the XML file, locate the relevant element, and add the attribute with the calculated average_loudness_value as its value.
The following code will update the XML file with the calculated loudness attribute value for the relevant XML element. Make sure to run this code for each XML file in your corpus folder and replace the loudness value with the calculated average loudness value for each file. 
In the following code:
The regex pattern now has two capturing groups: one for the opening tag of the XML element (<(?:ambient|character)_sound>\s*) and one for the content between the opening tag and the closing tag ({re.escape(sound_span)}\s*<).
The replacement string uses the first capturing group (\1) to preserve the opening tag, extends it with the loudness attribute, and uses the second capturing group (\2) to preserve the content of the xml element closed by the < beginning of the closing element.

In [ ]:
#The following finally works, even if I have to add another iteration over the corpus to subsequently delete the still present closing > of the xml opening element tag from the first regex group. This was a painful thing.


import os
import re

def process_xml_files(xml_folder, sound_loudness_dict):
    # Iterate over each XML file in the folder
    for filename in os.listdir(xml_folder):
        if filename.endswith('.xml'):
            xml_file_path = os.path.join(xml_folder, filename)
            # Read the XML file
            with open(xml_file_path, 'r', encoding='utf-8') as file:
                xml_content = file.read()

            # Iterate over keys in the sound_loudness_dict
            for sound_span, loudness_value in sound_loudness_dict.items():
                # Define the regex pattern with two capturing groups
                regex_pattern = fr'(<(?:ambient|character)_sound>\s*)({re.escape(sound_span)}\s*<)'
                
                # Define the replacement string with both capturing groups and loudness attribute
                replacement_string = fr'\1 loudness="{loudness_value}">\2'

                # Find and replace the regex match with the extended XML element
                xml_content = re.sub(regex_pattern, replacement_string, xml_content)

            # Write the modified XML content back to the file
            with open(xml_file_path, 'w', encoding='utf-8') as file:
                file.write(xml_content)

# Folder path containing XML files
folder_path = '/Users/sguhr/Desktop/Diss_notebooks/ner_prediction_sicherheitskopie_20240505_15h/20240501_Subcorpus_1848-55_predicted_for_loudness/revised'

# Call the function to process XML files in the folder with the sound_loudness_dict
process_xml_files(folder_path, sound_loudness_dict)



This code replaces with a simple reg ex the > of the opening xml element that received the attribute extension via the regex group solution above. 

The replace_regex_pattern_in_folder function takes two arguments: input_folder (the path to the input folder containing text files) and output_folder (the path to the output folder where modified files will be saved).
Inside the function, the regex patterns regex_pattern1 and regex_pattern2 are defined.
The function iterates over each file in the input folder using os.listdir and checks if the file ends with the .txt extension.
For each text file found, it reads the content, performs the replacement using re.sub, and writes the modified content to a new file in the output folder.

In [ ]:
# This code replaces with a simple reg ex the > of the opening xml element that received the attribute extension via the regex group solution above. 
# Hopefully I will find a better and direct way soon.

#import os
#import re

def postprocess_xml_files_after_regex(input_folder, output_folder):
    # Define the regex patterns
    regex_pattern1 = r'sound> loudness='
    regex_pattern2 = r'sound loudness='

    # Iterate over each file in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith('.xml'):
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, filename)

            # Read the input file
            with open(input_file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            # Perform the replacement
            modified_text = re.sub(regex_pattern1, regex_pattern2, text)

            # Write the modified text to the output file
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(modified_text)

# Input and output folder paths
input_folder_path = '/Users/sguhr/Desktop/Diss_notebooks/ner_prediction_sicherheitskopie_20240505_15h/20240501_Subcorpus_1848-55_predicted_for_loudness/revised'
output_folder_path = '/Users/sguhr/Desktop/Diss_notebooks/ner_prediction_sicherheitskopie_20240505_15h/20240501_Subcorpus_1848-55_predicted_for_loudness/revised'

# Call the function to perform the replacement for each file in the input folder
postprocess_xml_files_after_regex(input_folder_path, output_folder_path)
