# APLOSE Simple analysis

## How to get a first insight into an APLOSE annotation campaign


#### Import all needed Python modules

In [1]:
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
from collections import Counter

### Rearrange DCLDE annotations to match collaborative annotation format

In [2]:
# Path to the CSV file containing the DCLDE annotations for the subset analyzed in the first APLOSE annotation campaign
path_csv_annotations = '../csvs_dir/annota_campaign_50h.csv'
# Path to the CSV file containing the small wavs dates and names (provided by the administrator user to launch
# an annotation campaign).
path_files_start = '../csvs_dir/Dataset_files.csv'

# Predefined duration of small wavs
duration_small_wav = 320  # [s]


def dateparse(x):
    """Parse a DCLDE date string written in the '%Y-%m-%dT%H:%M:%S.%f' format."""
    # `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0:
    # rely on the standard-library `datetime` class imported at the top of the notebook.
    return datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')


# Read all annotations from DCLDE. Dates are converted after loading instead of through
# `read_csv(..., date_parser=...)`, which is deprecated since pandas 2.0.
df_annotations = pd.read_csv(path_csv_annotations)

# Rename columns (the file is expected to hold exactly these six columns, in this order)
df_annotations.columns = ['Deployment', 'Site', 'Specie', 'start', 'end', 'annotation']

# Convert string starting and ending times to pandas timestamps
df_annotations['start'] = df_annotations['start'].apply(dateparse)
df_annotations['end'] = df_annotations['end'].apply(dateparse)

# Drop unnecessary columns
df_annotations.drop(columns=['Deployment', 'Site', 'Specie'], inplace=True)

FileNotFoundError: [Errno 2] File b'../csvs_dir/annota_campaign_50h.csv' does not exist: b'../csvs_dir/annota_campaign_50h.csv'

In [None]:
# Read CSV file containing filenames and associated starting dates for the campaign
def dateparse(x):
    """Parse a date of the small wav CSV file; the format expects the value wrapped in double quotes."""
    # `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0:
    # rely on the standard-library `datetime` class imported at the top of the notebook.
    return datetime.strptime(x, '"%Y-%m-%d %H:%M:%S"')


# Read CSV. A multi-character separator is only supported by the python engine, so request it
# explicitly instead of relying on the silent fallback (which emits a ParserWarning).
df_file_start = pd.read_csv(path_files_start, sep=', ', usecols=['filename', 'audio_start', 'audio_end'],
                            engine='python')

# Convert audio starts and ends to timestamps (done after loading because `date_parser` is deprecated
# since pandas 2.0)
df_file_start['audio_start'] = df_file_start['audio_start'].apply(dateparse)
df_file_start['audio_end'] = df_file_start['audio_end'].apply(dateparse)

In [None]:
# Match small wav filenames with the original DCLDE annotations.
# row_function is not defined in this cell; it is called once per annotation row and its result
# fills the three new columns.
df_annotations[['filename', 'start_time', 'end_time']] = df_annotations.apply(
    lambda row: row_function(row, df_file_start, duration_small_wav),
    axis=1,
)
df_annotations = df_annotations.drop(columns=['start', 'end'])

# Move the first column to the last position to match the column order used in the collaborative campaign
cols = list(df_annotations.columns)
cols.append(cols.pop(0))
df_annotations = df_annotations[cols]

# Tag every row with its 'annotator', here the DCLDE experts
df_annotations['annotator'] = 'DCLDE_exp'
# Normalize labels: '40Hz' becomes '40-Hz' and every other label becomes 'Dcall'
df_annotations['annotation'] = df_annotations['annotation'].map(
    lambda label: '40-Hz' if label == '40Hz' else 'Dcall'
)
# The file/date table is no longer needed
del df_file_start

### Process annotation campaign results

In [None]:
# Identifier appended to the result file names: change it to analyze new results while keeping older ones
nb = 290420

# Path to collaborative annotations
# NOTE(review): the file name is spelled 'camapign' here - confirm it matches the file on disk
path_collaborative_annot = '../csvs_dir/DCLDE_LF camapign{}.csv'.format(nb)

# Load only the columns shared with the DCLDE annotation table
df_collab_annot = pd.read_csv(
    path_collaborative_annot,
    usecols=['filename', 'start_time', 'end_time', 'annotation', 'annotator'],
)

# Blacklisted annotators (names of annotators are provided to the administrator
# of the campaign, please refer to the user guide for more details)
blacklist = ['Sydney', 'Brest', 'Durban', 'Nice', 'Bali', 'Southampton', 'Cadiz']

# Keep only the rows whose annotator is not blacklisted (annotators that did not complete their campaign)
df_collab_annot = df_collab_annot.loc[~df_collab_annot['annotator'].isin(blacklist)]

### Merge all annotations (DCLDE experts and collaborative ones)

In [None]:
# Stack the DCLDE expert annotations and the collaborative ones into a single table
results = pd.concat((df_annotations, df_collab_annot), ignore_index=True)

# Save the merged table to a CSV file named after the campaign identifier
results.to_csv('../csvs_dir/result_dataframe{}.csv'.format(nb), header=True, index=False)

### Plot the number of annotations per call type for each annotator

In [None]:
# Leave the DCLDE experts out of this plot
results = results.loc[results['annotator'] != 'DCLDE_exp']
# Sorted names of the remaining annotators
annotat = np.unique(results['annotator'].values)

# Count the labels given by each annotator.
# The result is a dataframe with a two-level index ['annotator', 'annotation'] and one column: 'nb_labels'
counter_annotator = (
    results.groupby('annotator')['annotation']
    .apply(Counter)
    .to_frame(name='nb_labels')
)

In [None]:
# Print summary statistics of the label counts, with one column per label
# (the innermost index level, 'annotation', is pivoted into columns)
print(counter_annotator.unstack(level=-1).describe())

In [None]:
# Plot the number of labels of each annotator as grouped bars (one bar per label)
fig, ax = plt.subplots()
ax = counter_annotator.unstack().plot(kind='bar', ax=ax)
plt.ylabel('Nber of labels')
plt.xlabel('Annotator')
plt.xticks(rotation=0)
# Anonymize the annotators as A1..An. The DCLDE experts were removed from `results` before counting,
# so no bar may be named 'DCLDE_exp' (the former labelling gave that name to the first real annotator).
ax.set_xticklabels(['A' + str(i) for i in range(1, len(annotat) + 1)])
# NOTE(review): legend names are hard-coded - confirm they follow the column order of counter_annotator.unstack()
ax.legend(["D-calls", "40-Hz", "Unknown call"])
# plt.savefig('../figs/update' + str(nb) + '.png')
plt.show()

### Plot durations of a task

In [None]:
# Directory holding the CSV files
path_csv = '../csvs_dir'

# Identifier ('nb') of the annotation campaign results to analyze
nb = 290420

# Build the path of the merged result file matching this identifier (the file is not read here)
name_csv = 'result_dataframe{}.csv'.format(nb)
csv_file = os.path.join(path_csv, name_csv)

In [None]:
# CSV file holding the collaborative annotations with the duration of each task
# (this file is provided to the administrator)
path_collaborative_annot = 'DCLDE_LF campaign_avril2020.csv'
time_results = pd.read_csv(path_collaborative_annot)

# Annotators excluded from the duration analysis
blacklist = ['DCLDE_exp', 'Sydney', 'Brest', 'Nice', 'Durban', 'Bali', 'Southampton', 'Cadiz']

# Keep only the rows of the annotators that are not blacklisted
time_results = time_results.loc[~time_results['annotator'].isin(blacklist)]

# Sorted names of the annotators remaining in the collaborative campaign
annotators = time_results['annotator'].unique().tolist()
annotators.sort()

In [None]:
# Sum annotation times when an annotator labelled the same file several times
df = time_results.drop(['dataset', 'start_time', 'end_time'], axis=1).groupby(['filename', 'annotator']).sum()
# Drop the 'filename' index level so that rows are indexed by 'annotator' only
df.reset_index(drop=True, inplace=True, level=0)

# One box per annotator; return_type='both' gives access to the axes and to the drawn lines
ax_bp, bp = df.boxplot(column='duration',
                       by='annotator',
                       return_type='both', showmeans=True)['duration']
medians = [median.get_ydata()[0] for median in bp["medians"]]
for line in bp['medians']:
    # get position data for median line
    x, y = line.get_xydata()[1]  # right end of the median line
    # overlay median value; a bare `text` is not defined by the notebook imports, so draw through the axes
    ax_bp.text(x, y, '%.1f' % y,
               horizontalalignment='right')
ax_bp.set_ylim([0, 500])
fig = ax_bp.get_figure()
# Remove the titles added automatically by pandas
fig.suptitle('')
ax_bp.set_title('')
ax_bp.set_ylabel('Annotation time [s]')
ax_bp.set_xlabel('Annotators')
# Anonymize the annotators as A1..An
ax_bp.set_xticklabels(['A' + str(i) for i in range(1, len(annotators) + 1)])
# plt.savefig(os.path.join(figs_path, 'time_annotators.png'), bbox_inches='tight')

# Descriptive statistics (count, mean, std, quantiles) of the durations of each annotator
print(df.groupby('annotator').describe())

In [None]:
# How many of the 20 largest summed durations belong to each annotator
print(
    df.nlargest(n=20, columns='duration')
    .groupby('annotator')
    .count()
)
# The 30 largest summed durations themselves
print(df.nlargest(n=30, columns='duration'))