Notebook for 1.) Case Study

In [1]:
import numpy as np
import pandas as pd
from IPython.display import Audio

In [2]:
# Path to the .npz file
DATASET_PATH = "../../MLPC2025_dataset"

ANNOTATIONS_PATH = DATASET_PATH + "/annotations.csv"
ANNOTATIONS_TEXT_EMBEDDINGS_PATH = DATASET_PATH + "/annotations_text_embeddings.npz"

metadata_path = DATASET_PATH + "/metadata.csv"
METADATA_TITLE_EMBEDDINGS_PATH = DATASET_PATH + "/metadata_title_embeddings.npz"
METADATA_KEYWORDS_EMBEDDINGS_PATH = DATASET_PATH + "/metadata_keywords_embeddings.npz"

AUDIO_PATHS = DATASET_PATH + "/audio"
AUDIO_FEATURES_PATHS = DATASET_PATH + "/audio_features"

In [3]:
annotations = pd.read_csv(ANNOTATIONS_PATH)
annotations.head()

Unnamed: 0,task_id,filename,annotator,text,onset,offset,filename_unsafe
0,161977861,560761.mp3,5945971035380930099053858595454362549806990533...,bird twitters nearby with a high pitch two times,16.552368,17.271435,560761_tufted titmouse calling.mp3
1,161987165,240538.mp3,4916274343929406460752597829190197300566135449...,A cow bell clinging repeatedly,12.265866,15.420303,240538_AMB - Cowbell stable Stereowav.mp3
2,161989998,660337.mp3,8960534646813062318784592632173589349806817458...,A child speaks with a high-pitched voice,12.493984,13.31568,660337_Aeroplane Takeoff From Cabinwav.mp3
3,161978339,725686.mp3,4540509031938851510799116530500792274849113758...,A high pitch meowing coming from a cat,16.018221,16.991704,725686_Black Cat Talking.mp3
4,161985729,97606.mp3,1085174475307080254018414698413953957857995126...,An extremely loud brushing sound on a construc...,0.0,1.626352,97606_jackhammer med distwav.mp3


In [4]:
# Remove unnecessary columns for this task
annotations = annotations.drop(columns=['task_id', 'onset', 'offset', 'filename_unsafe'])
annotations['original_index'] = annotations.index
annotations.head()

Unnamed: 0,filename,annotator,text,original_index
0,560761.mp3,5945971035380930099053858595454362549806990533...,bird twitters nearby with a high pitch two times,0
1,240538.mp3,4916274343929406460752597829190197300566135449...,A cow bell clinging repeatedly,1
2,660337.mp3,8960534646813062318784592632173589349806817458...,A child speaks with a high-pitched voice,2
3,725686.mp3,4540509031938851510799116530500792274849113758...,A high pitch meowing coming from a cat,3
4,97606.mp3,1085174475307080254018414698413953957857995126...,An extremely loud brushing sound on a construc...,4


In [5]:
metadata = pd.read_csv(metadata_path)
metadata.head()

Unnamed: 0,filename,keywords,freesound_id,sound_link,manufacturer,license,title,description,num_downloads,geotag,start_time_s,end_time_s
0,617030.mp3,"atmosphere, general-noise, suburban, ambiance,...",617030,https://freesound.org/people/klankbeeld/sounds...,klankbeeld,https://creativecommons.org/licenses/by/4.0/,suburb night NL 1041PM 210415_0279.wav,Night in a suburban city. All sounds far. Tra...,65,51.7090740261 5.30657821347,15.6,43.535
1,637408.mp3,"jackhammer, construction, reflected",637408,https://freesound.org/people/kyles/sounds/637408/,kyles,http://creativecommons.org/publicdomain/zero/1.0/,construction jackhammer reflected5.flac,construction jackhammer reflected5\n\nrecorded...,23,,209.8,232.356
2,615545.mp3,"river-side, field-recording, bell, fields, vil...",615545,https://freesound.org/people/klankbeeld/sounds...,klankbeeld,https://creativecommons.org/licenses/by/4.0/,4 church-bells 7AM at river NL 210718_0304.wav,Four differed church bells tell it is 7 o’cloc...,227,51.7405127581 5.23133654974,6.0,31.666
3,410867.mp3,"sports-crowd, applause, crowd, crowd-cheer, Ad...",410867,https://freesound.org/people/NobodyYouKnowOf/s...,NobodyYouKnowOf,http://creativecommons.org/publicdomain/zero/1.0/,crowd_the_hill_language.wav,Crowd making some noise at a SANFL Aussie rule...,308,-34.9157073897 138.596134186,19.8,48.569
4,65916.mp3,"bones, breaking, broken, chime, clatter, explo...",65916,https://freesound.org/people/BristolStories/so...,BristolStories,http://creativecommons.org/licenses/by-nc/3.0/,shells-tinkle.WAV,This is a shells wind chime. I used it for the...,7721,,23.6,40.797


In [6]:
# Remove unnecessary columns for this task
metadata = metadata.drop(columns=['freesound_id', 'sound_link', 'manufacturer', 'license', 'num_downloads', 'geotag', 'start_time_s', 'end_time_s'])
metadata.head()

Unnamed: 0,filename,keywords,title,description
0,617030.mp3,"atmosphere, general-noise, suburban, ambiance,...",suburb night NL 1041PM 210415_0279.wav,Night in a suburban city. All sounds far. Tra...
1,637408.mp3,"jackhammer, construction, reflected",construction jackhammer reflected5.flac,construction jackhammer reflected5\n\nrecorded...
2,615545.mp3,"river-side, field-recording, bell, fields, vil...",4 church-bells 7AM at river NL 210718_0304.wav,Four differed church bells tell it is 7 o’cloc...
3,410867.mp3,"sports-crowd, applause, crowd, crowd-cheer, Ad...",crowd_the_hill_language.wav,Crowd making some noise at a SANFL Aussie rule...
4,65916.mp3,"bones, breaking, broken, chime, clatter, explo...",shells-tinkle.WAV,This is a shells wind chime. I used it for the...


In [7]:
metadata_title_embeddings = np.load(METADATA_TITLE_EMBEDDINGS_PATH)['embeddings']
metadata_keywords_embeddings = np.load(METADATA_KEYWORDS_EMBEDDINGS_PATH)['embeddings']

annotations_text_embeddings = np.load(ANNOTATIONS_TEXT_EMBEDDINGS_PATH)['embeddings']

EMBEDDING_SIZE = metadata_title_embeddings.shape[1]

In [8]:
# Get list of filenames of files with multiple annotators
files_with_multiple_annotators = annotations.groupby('filename')['annotator'].nunique()
files_with_multiple_annotators = files_with_multiple_annotators[files_with_multiple_annotators > 1]

files_per_annotator_count = [(num_annotators, num_files) for num_annotators, num_files in files_with_multiple_annotators.value_counts().items()]

files_with_multiple_annotators = files_with_multiple_annotators.index.tolist()
print(f"Number of files with multiple annotators: {len(files_with_multiple_annotators)}\n")

for num_annotators, num_files in files_per_annotator_count:
    print(f"Number of files with {num_annotators} annotators: {num_files}")

Number of files with multiple annotators: 731

Number of files with 2 annotators: 725
Number of files with 3 annotators: 6


In [9]:
# Get annotations for these files and group by filename and annotator
annotations_for_files_with_multiple_annotators = annotations[annotations['filename'].isin(files_with_multiple_annotators)]
annotations_for_files_with_multiple_annotators = annotations_for_files_with_multiple_annotators.groupby(['filename', 'annotator']).agg(lambda x: list(x)).reset_index()

print(f"Total annotations for all files with multiple annotators: {annotations_for_files_with_multiple_annotators.apply(len).sum()}")

annotations_for_files_with_multiple_annotators.head()

Total annotations for all files with multiple annotators: 5872


Unnamed: 0,filename,annotator,text,original_index
0,102431.mp3,2825101440002704998553785231562509406931203443...,"[Baby crying, repeatedly, natural, indoors, ne...",[15463]
1,102431.mp3,7505829110384075687331616965056441784304296723...,[Baby making mid-pitched unrhythmic non-crying...,"[7568, 12072, 16641, 17965, 18846, 19487, 2683..."
2,102744.mp3,1145579747015607221221744067969991550764671773...,[Military person speaking clearly and distinct...,"[1679, 5193, 17562, 23710, 31903, 32066, 32482..."
3,102744.mp3,9467928724851080650561981608771911260861986192...,[Calm mature male voice telling coordinates an...,"[3022, 4952]"
4,106035.mp3,3816674505249688706999437810769297529658534646...,[Segments of a sound made by an electric guit...,[32103]


In [10]:
comparison_dict = {}

for filename in annotations_for_files_with_multiple_annotators['filename'].unique():
    # per file
    comparison_dict[filename] = {
        'metadata_title_embedding': None,
        'metadata_keywords_embedding': None,
        'annotation_embeddings': {}
    }

    # Get metadata embeddings
    metadata_idx = metadata[metadata['filename'] == filename].index[0]

    comparison_dict[filename]['metadata_title_embedding'] = metadata_title_embeddings[metadata_idx]
    comparison_dict[filename]['metadata_keywords_embedding'] = metadata_keywords_embeddings[metadata_idx]

    # Get annotation embeddings
    file_annotations = annotations_for_files_with_multiple_annotators[annotations_for_files_with_multiple_annotators['filename'] == filename]

    for annotator in file_annotations['annotator'].unique():
        annotation_indices = file_annotations[file_annotations['annotator'] == annotator]['original_index'].values[0]

        # Get the text embeddings for the annotations
        annotation_embeddings = np.zeros((len(annotation_indices), EMBEDDING_SIZE))
        for i, idx in enumerate(annotation_indices):
            annotation_embeddings[i] = annotations_text_embeddings[idx]

        # Average the embeddings for the annotator
        annotation_embeddings = np.mean(annotation_embeddings, axis=0)

        comparison_dict[filename]['annotation_embeddings'][annotator] = annotation_embeddings

In [11]:
print("Check if everything works correctly:\n")
for filename, embeddings in comparison_dict.items():
    print(f"Filename: {filename}")
    print(f"Metadata title embedding: {embeddings['metadata_title_embedding'].shape}")
    print(f"Metadata keywords embedding: {embeddings['metadata_keywords_embedding'].shape}")
    for annotator, embedding in embeddings['annotation_embeddings'].items():
        print(f"Annotator: {annotator}, Embedding shape: {embedding.shape}")
    print()
    break


Check if everything works correctly:

Filename: 102431.mp3
Metadata title embedding: (1024,)
Metadata keywords embedding: (1024,)
Annotator: 28251014400027049985537852315625094069312034433417461412837429504269879097216, Embedding shape: (1024,)
Annotator: 75058291103840756873316169650564417843042967235442422525023433266794403941324, Embedding shape: (1024,)



In [12]:
# Find files for which the multiple annotations are the most and the least similar
files_ordered_by_annotation_sim = []

for filename, embeddings in comparison_dict.items():

    annotation_embeddings = embeddings['annotation_embeddings']
    annotators = list(annotation_embeddings.keys())

    # Compute pairwise similarities
    for i in range(len(annotators)):
        for j in range(i + 1, len(annotators)):
            annotator_i = annotators[i]
            annotator_j = annotators[j]

            embedding_i = annotation_embeddings[annotator_i]
            embedding_j = annotation_embeddings[annotator_j]

            cos_sim = np.dot(embedding_i, embedding_j) / (np.linalg.norm(embedding_i) * np.linalg.norm(embedding_j))

            files_ordered_by_annotation_sim.append((filename, cos_sim))

files_ordered_by_annotation_sim.sort(key=lambda x: x[1])

print("Check if everything works correctly:\n")
print(files_ordered_by_annotation_sim[0], files_ordered_by_annotation_sim[-1])

Check if everything works correctly:

('568273.mp3', np.float64(-0.11429878976245067)) ('203149.mp3', np.float64(0.9488383560421622))


In [13]:
print("Files for which different annotations are the most dissimilar: ")
filename, similarity = files_ordered_by_annotation_sim[0]

filtered_meta = metadata[metadata['filename'] == filename]

file_annotations = annotations[annotations['filename'] == filename]
file_annotators = file_annotations['annotator'].unique().tolist()

filtered_annotations_texts = [[] for _ in range(len(file_annotators))]
for i, annotator in enumerate(file_annotators):
    filtered_annotations = file_annotations[file_annotations['annotator'] == annotator]
    for text in filtered_annotations['text']:
        filtered_annotations_texts[i].append(text)

pd.set_option('display.max_colwidth', None)

print(f"Filename: {filename}")
print(f"Similarity: {similarity}")
print(f"Metadata title: {filtered_meta['title'].values[0]}")
print(f"Metadata keywords: {filtered_meta['keywords'].values[0]}\n")

for i, annotator in enumerate(file_annotators):
    print(f"Annotator {i}: {annotator}")
    print(f"Annotations: {filtered_annotations_texts[i]}\n")

Audio(DATASET_PATH + "/audio/" + filename)

Files for which different annotations are the most dissimilar: 
Filename: 568273.mp3
Similarity: -0.11429878976245067
Metadata title: spectral violin drone processed through granulation and reverb
Metadata keywords: spectral, tonal, granulation, horror, drone, dark, avant-garde, ambient, violin, soundscape, experimental

Annotator 0: 38166745052496887069994378107692975296585346461051494344180693789505987975216
Annotations: ['A sharp, loud violin plays rapidly at a concert.']

Annotator 1: 67403457891108696056353927513825894277157080037650893722622884910463124172564
Annotations: ['A sustained ambient drone with granular and spectral textures. ']



In [14]:
print("Files for which different annotations are the most dissimilar: ")
filename, similarity = files_ordered_by_annotation_sim[-1]

filtered_meta = metadata[metadata['filename'] == filename]

file_annotations = annotations[annotations['filename'] == filename]
file_annotators = file_annotations['annotator'].unique().tolist()

filtered_annotations_texts = [[] for _ in range(len(file_annotators))]
for i, annotator in enumerate(file_annotators):
    filtered_annotations = file_annotations[file_annotations['annotator'] == annotator]
    for text in filtered_annotations['text']:
        filtered_annotations_texts[i].append(text)

pd.set_option('display.max_colwidth', None)

print(f"Filename: {filename}")
print(f"Similarity: {similarity}")
print(f"Metadata title: {filtered_meta['title'].values[0]}")
print(f"Metadata keywords: {filtered_meta['keywords'].values[0]}\n")

for i, annotator in enumerate(file_annotators):
    print(f"Annotator {i}: {annotator}")
    print(f"Annotations: {filtered_annotations_texts[i]}\n")

Audio(DATASET_PATH + "/audio/" + filename)

Files for which different annotations are the most dissimilar: 
Filename: 203149.mp3
Similarity: 0.9488383560421622
Metadata title: End of the afternoon in a field, in Nebraska
Metadata keywords: CD130519T018, felix, cows, singing, birds, end, usa, field, call, bird, evening, countryside, bulls, moo, bull, sing, mooing, blume, calls, cow, fields, calling, nebraska, afternoon

Annotator 0: 83798156097639272746679852169789049963467405372196161228635156944547831524295
Annotations: ['cows and bulls calling and mooing', 'Cows and bulls calling and mooing', 'Birds singing', 'birds singing in the country side', 'Birds singiing', 'Birds singing', 'Birds singing', 'Cows and bulls calling', 'Birds singing', 'Cows and bulls calling and mooing', 'cows and bulls calling and mooing', 'Cows and bulls calling and mooing', 'birds singing']

Annotator 1: 55340340813223415328630925528458732286311395013930503995352353850108948193755
Annotations: ['cows and bulls mooing', 'birds singing']

