## IOU Testing

Uses the statistics.py file in the passive-acoustic-biodiversity repo.
The `time` library is used to time function calls.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from PyHa.IsoAutio import generate_automated_labels
# clip_IoU is the fastest one, others are there for posterity and comparison
from PyHa.statistics_iou import clip_IoU, clip_IoU_orig, clip_IoU_lin, clip_IoU_skip, matrix_IoU_Scores
from PyHa.statistics_iou import automated_labeling_statistics

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import time

In [3]:
# Input locations, relative to the notebook's working directory.
path = "../acoustic-id-test-data/TEST/"
path2 = "./ScreamingPiha_Manual_Labels.csv"

# Column name -> empty list; every column gets its own independent list object.
output_df = {
    column: []
    for column in ("FUNCTION", "TECHNIQUE", "DURATION", "ITERATIONS", "AVERAGE")
}

In [4]:
# Settings dictionary handed to generate_automated_labels below.
# NOTE(review): the meaning and valid range of each key is defined by PyHa, not by
# this notebook — consult the PyHa documentation before tuning these values.
isolation_parameters = {
    "model" : "microfaune",
    "technique" : "chunk",
    "threshold_type" : "median",
    "threshold_const" : 2.0,
    "threshold_min" : 0.0,
    "chunk_size" : 5.0,
    "window_size" : 2.0
}

# Labels read from the CSV at `path2`.
manual_df = pd.read_csv(path2)
# Labels produced by PyHa for the audio folder at `path` using the settings above.
automated_df = generate_automated_labels(path, isolation_parameters)

In [6]:
# compare the average time for 10 trials of the different clip_IoU methods for ~600 clips
def call_IoU(f, automated_df, manual_df):
    """Run ``f(automated_df, manual_df)`` once and report how long it took.

    Args:
        f: Callable invoked as ``f(automated_df, manual_df)``.
        automated_df: First positional argument forwarded to ``f``.
        manual_df: Second positional argument forwarded to ``f``.

    Returns:
        tuple: ``(matrix, tim)`` where ``matrix`` is whatever ``f`` returned and
        ``tim`` is the elapsed time in seconds, truncated to one decimal place.
    """
    # perf_counter() is monotonic and high-resolution, so the measurement is not
    # distorted by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    matrix = f(automated_df, manual_df)
    # int(...) truncates (does not round) to tenths of a second.
    tim = int((time.perf_counter() - start) * 10) / 10.0
    print('time:', tim, 'seconds')
    return matrix, tim

# double_scalars error comes from dividing two very small numbers
# this is handled by converting NaN values in the IoU matrix to 0 when it is returned
def _time_clip_IoU(heading, f, automated_df, manual_df, n_trials=10):
    """Time ``n_trials`` calls of one clip_IoU variant via ``call_IoU``.

    Prints ``heading``, then one timing line per trial (printed by ``call_IoU``),
    then the mean of the per-trial durations.

    Args:
        heading: Text printed before the trials start.
        f: Function handed to ``call_IoU`` for every trial.
        automated_df: Forwarded to ``call_IoU``.
        manual_df: Forwarded to ``call_IoU``.
        n_trials: Number of repetitions; also the divisor of the printed mean.

    Returns:
        tuple: ``(matrix, durations)`` — the result of the last trial and the
        list of per-trial durations in seconds.
    """
    print(heading)
    durations = []
    matrix = None
    for _ in range(n_trials):
        matrix, tim = call_IoU(f, automated_df, manual_df)
        durations.append(tim)
    print(sum(durations) / n_trials)
    return matrix, durations


# One benchmark per implementation; the result names match what later cells read.
orig, time_orig = _time_clip_IoU("orig", clip_IoU_orig, automated_df, manual_df)
skip, time_skip = _time_clip_IoU("\nskip", clip_IoU_skip, automated_df, manual_df)
lin, time_lin = _time_clip_IoU("\nlinalg", clip_IoU_lin, automated_df, manual_df)
linskip, time_linskip = _time_clip_IoU("\nlinalg + skip", clip_IoU, automated_df, manual_df)

#note that although it says screamingpiha1 (since clip_IoU is supposed to take in one clip), it is for the whole dataset

orig
time: 466.9 seconds
time: 502.0 seconds
time: 482.4 seconds
time: 481.1 seconds
time: 486.0 seconds
time: 477.7 seconds
time: 476.2 seconds
time: 525.9 seconds
time: 513.4 seconds
time: 470.0 seconds
488.15999999999997

skip
time: 274.1 seconds
time: 275.4 seconds
time: 307.4 seconds
time: 288.9 seconds
time: 278.8 seconds
time: 285.4 seconds
time: 252.4 seconds
time: 267.7 seconds
time: 270.5 seconds
time: 259.8 seconds
276.04

linalg
time: 190.2 seconds
time: 143.0 seconds
time: 134.4 seconds
time: 139.2 seconds
time: 143.2 seconds
time: 134.2 seconds
time: 153.1 seconds
time: 148.3 seconds
time: 127.1 seconds
time: 130.6 seconds
144.32999999999998

linalg + skip
time: 16.0 seconds
time: 11.7 seconds
time: 11.7 seconds
time: 11.5 seconds
time: 11.3 seconds
time: 11.4 seconds
time: 11.5 seconds
time: 10.8 seconds
time: 17.1 seconds
time: 13.5 seconds
12.65


In [7]:
print("avg across 10 trials for clip_IoU")
# Pair each row label with its list of per-trial timings, then print label and mean.
for row_label, trial_times in (
    ("original:\t", time_orig),
    ("skip:\t\t", time_skip),
    ("linalg:\t\t", time_lin),
    ("linalg+skip:\t", time_linskip),
):
    print(row_label, sum(trial_times) / 10)


avg across 10 trials for clip_IoU
original:	 488.15999999999997
skip:		 276.04
linalg:		 144.32999999999998
linalg+skip:	 12.65


## Testing automated_labeling_statistics

In [18]:
# call just once
def call_stats(automated_df, manual_df, stats_type="IoU", threshold=0.5):
    """Run ``automated_labeling_statistics`` once and print how long it took.

    Args:
        automated_df: Forwarded as the first positional argument.
        manual_df: Forwarded as the second positional argument.
        stats_type: Forwarded as the third positional argument; also printed as
            the label of the timing line.
        threshold: Forwarded as the fourth positional argument.

    Returns:
        The value returned by ``automated_labeling_statistics``.

    NOTE(review): ``call_stats`` is defined again in later cells with a different
    return shape; whichever definition ran last in the kernel is the one in effect.
    """
    # perf_counter() is monotonic and high-resolution, so the measurement is not
    # distorted by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    stats_df = automated_labeling_statistics(automated_df,manual_df,stats_type,threshold)
    # int(...) truncates (does not round) the elapsed time to tenths of a second.
    print(stats_type, 'time:\t', int((time.perf_counter() - start) * 10) / 10.0, 'seconds\n')
    return stats_df

# One timed run per stats_type; each result is kept under its own name.
stats_df_skip = call_stats(automated_df, manual_df, stats_type="IoU-skip")
stats_df_lin = call_stats(automated_df, manual_df, stats_type="IoU-lin")
stats_df_lin_skip = call_stats(automated_df, manual_df, stats_type="IoU-lin-skip")

IoU-skip time:	 50.4 seconds

IoU-lin time:	 35.0 seconds

IoU-lin-skip time:	 6.6 seconds



In [5]:
# compare the average time for 10 trials of automated_labeling_statistics using different clip_IoU methods for ~600 clips
def call_stats(automated_df, manual_df, stats_type="IoU", threshold=0.5):
    """Run ``automated_labeling_statistics`` once and return its result with the elapsed time.

    Args:
        automated_df: Forwarded as the first positional argument.
        manual_df: Forwarded as the second positional argument.
        stats_type: Forwarded as the third positional argument.
        threshold: Forwarded as the fourth positional argument.

    Returns:
        tuple: ``(stats_df, tim)`` where ``stats_df`` is the value returned by
        ``automated_labeling_statistics`` and ``tim`` is the elapsed time in
        seconds, truncated to one decimal place.
    """
    # perf_counter() is monotonic and high-resolution, so the measurement is not
    # distorted by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    stats_df = automated_labeling_statistics(automated_df,manual_df,stats_type,threshold)
    # int(...) truncates (does not round) to tenths of a second.
    tim = int((time.perf_counter() - start) * 10) / 10.0
    print('time:', tim, 'seconds')
    return stats_df, tim

# double_scalars error comes from dividing two very small numbers
# this is handled by converting NaN values in the IoU matrix to 0 when it is returned
def _time_stats(heading, stats_type, automated_df, manual_df, n_trials=10):
    """Time ``n_trials`` calls of ``call_stats`` for one ``stats_type``.

    Prints ``heading``, then one timing line per trial (printed by ``call_stats``),
    then the mean of the per-trial durations. Requires the ``call_stats``
    definition that returns ``(stats_df, tim)``.

    Args:
        heading: Text printed before the trials start.
        stats_type: Passed to ``call_stats`` as ``stats_type``.
        automated_df: Forwarded to ``call_stats``.
        manual_df: Forwarded to ``call_stats``.
        n_trials: Number of repetitions; also the divisor of the printed mean.

    Returns:
        tuple: ``(stats_df, durations)`` — the result of the last trial and the
        list of per-trial durations in seconds.
    """
    print(heading)
    durations = []
    stats_df = None
    for _ in range(n_trials):
        stats_df, tim = call_stats(automated_df, manual_df, stats_type=stats_type)
        durations.append(tim)
    print(sum(durations) / n_trials)
    return stats_df, durations


# The "IoU-orig" result gets its own name instead of being stored in stats_df_skip,
# which the next line overwrites anyway.
stats_df_orig, stats_time_orig = _time_stats("orig", "IoU-orig", automated_df, manual_df)
stats_df_skip, stats_time_skip = _time_stats("\nskip", "IoU-skip", automated_df, manual_df)
stats_df_lin, stats_time_lin = _time_stats("\nlinalg", "IoU-lin", automated_df, manual_df)
stats_df_linskip, stats_time_linskip = _time_stats("\nlinalg + skip", "IoU-lin-skip", automated_df, manual_df)
# The display cell further down reads `stats_df_lin_skip`; keep both spellings bound
# so that cell does not depend on another cell having been run.
stats_df_lin_skip = stats_df_linskip

#note that although it says screamingpiha1 (since clip_IoU is supposed to take in one clip), it is for the whole dataset

orig
time: 95.9 seconds
time: 94.5 seconds
time: 98.0 seconds
time: 84.9 seconds
time: 79.2 seconds
time: 91.0 seconds
time: 88.0 seconds
time: 83.4 seconds
time: 89.9 seconds
time: 88.0 seconds
89.28

skip
time: 41.9 seconds
time: 52.6 seconds
time: 41.5 seconds
time: 46.4 seconds
time: 49.7 seconds
time: 49.6 seconds
time: 45.3 seconds
time: 47.5 seconds
time: 54.9 seconds
time: 46.1 seconds
47.550000000000004

linalg
time: 29.4 seconds
time: 32.4 seconds
time: 27.3 seconds
time: 28.5 seconds
time: 30.8 seconds
time: 32.3 seconds
time: 37.3 seconds
time: 33.0 seconds
time: 30.3 seconds
time: 31.0 seconds
31.23

linalg + skip
time: 7.9 seconds
time: 7.5 seconds
time: 7.3 seconds
time: 5.7 seconds
time: 5.6 seconds
time: 5.9 seconds
time: 5.7 seconds
time: 5.7 seconds
time: 5.6 seconds
time: 8.3 seconds
6.5200000000000005


In [6]:
print("avg across 10 trials for automated_labeling_statistics")
# (row label, per-trial timings, decimals to round to — None leaves the mean unrounded)
for row_label, trial_times, decimals in (
    ("original:\t", stats_time_orig, None),
    ("skip:\t\t", stats_time_skip, 4),
    ("linalg:\t\t", stats_time_lin, None),
    ("linalg+skip:\t", stats_time_linskip, 4),
):
    mean_seconds = sum(trial_times) / 10
    if decimals is None:
        print(row_label, mean_seconds)
    else:
        print(row_label, round(mean_seconds, decimals))

avg across 10 trials for automated_labeling_statistics
original:	 89.28
skip:		 47.55
linalg:		 31.23
linalg+skip:	 6.52


In [None]:
# Render the three statistics results one after another, in this order.
display(stats_df_skip, stats_df_lin, stats_df_lin_skip)

### Importing labels
If you put a subset of files in a different folder, generate a new labels .csv, then read them into a DataFrame.

In [12]:
# Folder holding the subset of audio files (listed with os.listdir in the next cell).
path_to_audio_files = "../acoustic-id-test-data/subset/"
# CSV of labels read with pd.read_csv in the following cells.
path_to_ground_truth = "./025_DSC180_subset_manual.csv"

In [None]:
# generate a csv with just the labels for specified files
import os
files = os.listdir(path_to_audio_files)
dscdf = pd.read_csv(path_to_ground_truth)
# Keep only the rows whose "IN FILE" value is one of the files in the subset folder.
dscdf = dscdf[dscdf["IN FILE"].isin(files)]
# index=False: do not write the DataFrame index as an extra column. Without it the
# file gains an "Unnamed: 0" column every time it is written and read back.
# NOTE(review): with path_to_ground_truth as set in the cell above
# ("./025_DSC180_subset_manual.csv") this overwrites the file that was just read.
dscdf.to_csv("025_DSC180_subset_manual.csv", index=False)
path_to_ground_truth = "025_DSC180_subset_manual.csv"

In [None]:
# Reload the labels from the CSV at path_to_ground_truth (replaces the earlier manual_df).
manual_df = pd.read_csv(path_to_ground_truth)
# Same settings as the first isolation_parameters cell except chunk_size (3.0 here, 5.0 there).
# NOTE(review): the meaning of each key is defined by PyHa — consult its documentation.
isolation_parameters = {
    "model" : "microfaune",
    "technique" : "chunk",
    "threshold_type" : "median",
    "threshold_const" : 2.0,
    "threshold_min" : 0.0,
    "chunk_size" : 3.0,
    "window_size" : 2.0
}
# The two commented-out lines regenerate the automated labels and save them to CSV;
# as written, this cell instead loads a previously saved CSV, so that file must already exist.
#automated_df = generate_automated_labels(path_to_audio_files, isolation_parameters)
#automated_df.to_csv("025_DSC180_subset_automated.csv")
automated_df = pd.read_csv("025_DSC180_subset_automated.csv")

In [None]:
# Set the FOLDER column of every row to the subset audio folder (modifies manual_df in place).
manual_df["FOLDER"] = path_to_audio_files
# Bare last expression: the notebook renders the DataFrame.
manual_df

In [None]:
%reload_ext autoreload
def call_stats(automated_df, manual_df, stats_type="IoU", threshold=0.5):
    """Run ``automated_labeling_statistics`` once and print how long it took.

    Args:
        automated_df: Forwarded as the first positional argument.
        manual_df: Forwarded as the second positional argument.
        stats_type: Forwarded as the third positional argument.
        threshold: Forwarded as the fourth positional argument.

    Returns:
        The value returned by ``automated_labeling_statistics``.

    NOTE(review): this redefines ``call_stats``; an earlier cell defines a variant
    that returns ``(stats_df, tim)`` instead of just ``stats_df``.
    """
    # perf_counter() is monotonic and high-resolution, so the measurement is not
    # distorted by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    stats_df = automated_labeling_statistics(automated_df,manual_df,stats_type, threshold)
    # int(...) truncates (does not round) the elapsed time to tenths of a second.
    print('time:', int((time.perf_counter() - start) * 10) / 10.0, 'seconds')
    return stats_df

# Timed run on the subset data with stats_type "IoU-lin-skip".
stats_df_bn_lin_skip = call_stats(automated_df, manual_df, stats_type="IoU-lin-skip")

In [None]:
# Timed run on the subset data with stats_type "IoU-lin".
stats_df_bn_lin = call_stats(automated_df, manual_df, stats_type="IoU-lin")

In [None]:
# Bare expression: the notebook renders the "IoU-lin-skip" result.
stats_df_bn_lin_skip

In [None]:
# Bare expression: the notebook renders the "IoU-lin" result.
stats_df_bn_lin

## Verification
This section can mostly be ignored; it was used to check the shape of lists and/or NumPy arrays.

In [None]:
# Print "rows x columns" for each object, taking the column count from the first row.
# NOTE(review): `slight` is not assigned in any cell visible in this notebook — TODO confirm
# where it comes from; on a fresh kernel this line would raise NameError.
print(len(slight), 'x', len(slight[0]))
# `lin` is the result kept from the "linalg" timing loop.
print(len(lin), 'x', len(lin[0]))

In [None]:
# Plain-text dump of `slight`.
print(slight)

In [None]:
# NOTE(review): the value returned by .tolist() is discarded, so if `slight` is a NumPy
# array (presumably — TODO confirm) this line leaves `slight` unchanged and has no visible effect.
slight.tolist()
# Print each element of `slight` on its own line.
print(*slight, sep="\n")

In [None]:
# NOTE(review): the value returned by .tolist() is discarded, so if `lin` is a NumPy
# array (presumably — TODO confirm) this line leaves `lin` unchanged and has no visible effect.
lin.tolist()
# Print each element of `lin` on its own line.
print(*lin, sep="\n")

In [None]:
# prints the coords of any differences
import numpy as np

# Replace NaN entries in `lin` before comparing it with `slight`.
lin = np.nan_to_num(lin)
differences = slight - lin

# np.nonzero returns one index array per axis; only the first two axes are used.
d = np.nonzero(differences)
d0, d1 = d[0].tolist(), d[1].tolist()

# One line per differing position, printed as "<axis-0 index> <axis-1 index>".
for first_index, second_index in zip(d0, d1):
    print(first_index, second_index)