# Exploration of Sanaga set

## Create overviews of recordings

In [None]:
import os
import glob
import pandas as pd
import datetime
import re
import subprocess

def read_files_vocalizations(raven_path):
    """Summarise the Raven selection tables found in ``raven_path``.

    The working directory is changed to ``raven_path`` and every ``*.txt``
    file in it is inspected. The recorder ID is the first ``A<digits>``
    match in the file name; the number of 'Chimpanzee' rows comes from
    ``count_vocalizations``.

    Files whose name holds no recorder ID are skipped. Previously such a
    file raised ``AttributeError`` because ``re.search`` returned ``None``.

    Returns:
        DataFrame with the columns ``file``, ``recorder`` and
        ``vocalizations`` (one row per recognised annotation file).
    """
    # The chdir is kept: the glob pattern and count_vocalizations both work
    # with file names relative to raven_path.
    os.chdir(raven_path)
    recorder_pattern = re.compile("A[0-9]+")
    rows = []

    for file in glob.glob('*.txt'):
        match = recorder_pattern.search(file)
        if match is None:
            # Not an annotation file of a known recorder; nothing to count.
            continue
        rows.append((file,
                     match.group(),
                     count_vocalizations(file, 'Chimpanzee')))

    return pd.DataFrame(rows, columns=['file', 'recorder', 'vocalizations'])

def get_folder_stats(path, recorder):
    """Return the recording period covered by one recorder folder.

    The working directory is changed to ``path + recorder + '/'`` and the
    ``*.WAV`` files in it are sorted by name. The first and last file names
    (without the 4-character extension) are parsed as
    ``%Y%m%d_%H%M%S`` timestamps.

    Args:
        path: folder that holds the recorder folders, ending in a separator.
        recorder: name of the recorder sub-folder.

    Returns:
        ``[recorder, starttime, endtime, duration]`` where ``duration`` is
        ``endtime - starttime``, i.e. the span between the first and last
        file name, not including the length of the last file.

    Raises:
        ValueError: if the folder holds no ``*.WAV`` file, or a file name
            does not match the timestamp format.
    """
    folder = path + recorder + '/'
    os.chdir(folder)
    wav_files = sorted(glob.glob('*.WAV'))
    if not wav_files:
        # Previously this surfaced as a pandas "Length mismatch" ValueError.
        raise ValueError("no .WAV files found in " + folder)

    timestamp_format = "%Y%m%d_%H%M%S"
    starttime = datetime.datetime.strptime(wav_files[0][:-4], timestamp_format)
    endtime = datetime.datetime.strptime(wav_files[-1][:-4], timestamp_format)

    return [recorder, starttime, endtime, endtime - starttime]

def list_folder_stats(path):
    """Build an overview table of all recorder folders inside ``path``.

    Every entry returned by ``os.listdir(path)`` is handed to
    ``get_folder_stats``; the results become the rows of a DataFrame with
    the columns ``recorder``, ``start``, ``end`` and ``duration``.
    """
    rows = [get_folder_stats(path, entry) for entry in os.listdir(path)]
    return pd.DataFrame(rows, columns=['recorder', 'start', 'end', 'duration'])

def count_vocalizations(file, species):
    """Return how many rows ``process_raven`` keeps for ``species`` in ``file``."""
    n_rows, _ = process_raven(file, species).shape
    return n_rows

def process_raven(file, species):
    """Load a Raven selection table and keep the rows of one species.

    The table is read with ``pd.read_table`` and its column names are
    lower-cased. Rows whose ``class`` equals ``species`` are kept. The
    ``begin path`` / ``end path`` columns are renamed to ``begin_path`` /
    ``end_path`` and reduced to the bare file name: everything up to the
    last backslash and the ``.WAV`` extension are removed.

    Added columns:
        start_time: copy of ``file offset (s)``.
        end_time: ``file offset (s)`` plus the selection length
            (``end time (s) - begin time (s)``).
        timestamp_beginfile, timestamp_endfile, delta_t: added by
            ``df_file_to_timestamp``.

    Returns:
        The filtered and extended DataFrame.
    """
    df = pd.read_table(file)
    df.columns = df.columns.str.lower()
    # Work on a copy of the selection so the assignments below do not write
    # into a slice of the full table.
    df = df.loc[df['class'] == species].copy()
    df = df.rename(columns={'begin path': 'begin_path', 'end path': 'end_path'})
    for column in ('begin_path', 'end_path'):
        # Raw strings keep the regex escapes intact.
        df[column] = df[column].str.replace(r'.*\\', '', regex=True)
        df[column] = df[column].str.replace(r'\.WAV', '', regex=True)
    df['start_time'] = df['file offset (s)']
    df['end_time'] = (df['file offset (s)']
                      + df['end time (s)'] - df['begin time (s)'])
    return df_file_to_timestamp(df)

def read_processed_csv(path, recorder):
    """Load ``<path><recorder>.csv`` and parse its two datetime columns.

    ``start_datetime`` and ``end_datetime`` are converted with
    ``pd.to_datetime``; all other columns are returned as read.
    """
    csv_file = path + recorder + '.csv'
    df = pd.read_csv(csv_file)
    for column in ("start_datetime", "end_datetime"):
        df[column] = pd.to_datetime(df[column])
    return df

def df_file_to_timestamp(df):
    """Add file timestamps parsed from ``begin_path`` and ``end_path``.

    The frame is modified in place and also returned. New columns:
    ``timestamp_beginfile``, ``timestamp_endfile`` (file names parsed as
    ``%Y%m%d_%H%M%S``) and ``delta_t`` (end minus begin timestamp).
    """
    name_format = "%Y%m%d_%H%M%S"
    for source, target in (('begin_path', 'timestamp_beginfile'),
                           ('end_path', 'timestamp_endfile')):
        df[target] = pd.to_datetime(df[source], format=name_format)
    df['delta_t'] = df['timestamp_endfile'] - df['timestamp_beginfile']
    return df

def wav_list(base_wav_path, recorder):
    """Return the sorted recording names of one recorder, without extension.

    Only entries of ``base_wav_path + recorder + '/'`` whose name ends in
    ``.WAV`` are returned. The earlier substring test (``'WAV' in name``)
    also picked up unrelated files and then cut off their last four
    characters.
    """
    wav_path = base_wav_path + recorder + '/'
    suffix = '.WAV'
    return [name[:-len(suffix)]
            for name in sorted(os.listdir(wav_path))
            if name.endswith(suffix)]

def yoda_get(file, source, dest):
    """Run ``iget <source + file> <dest>`` and block until it has finished.

    The exit status of the command is not inspected.
    """
    command = ['iget', source + file, dest]
    process = subprocess.Popen(command)
    process.wait()

In [None]:
# Root folder with one sub-folder of recordings per recorder.
path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'
# Folder with the Raven selection tables (*.txt).
raven_path = '/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga/'
#path = '/home/jelle/Repositories/animalsounds/data/sanaga/'

# Recording period per recorder folder.
df_overview = list_folder_stats(path)
# Number of 'Chimpanzee' rows per annotation file.
df_vocalizations = read_files_vocalizations(raven_path)
# Left join on recorder; recorders without an annotation file end up with
# NaN values and are removed by dropna().
df_summary = df_overview.merge(df_vocalizations, how='left', on='recorder')
df_summary = df_summary.dropna()
# Store the counts as integers again after the NaN rows are gone.
df_summary['vocalizations'] = df_summary['vocalizations'].astype(int)
df_summary

In [None]:
# Lookup table: one row per recorder with its transect name and a
# 'Distance' value. The three lists are aligned by position.
# NOTE(review): 'Distance' presumably is the distance along the transect in
# metres - confirm the unit.
transects = {'recorder':['A6', 'A2', 'A22', 'A5', 'A11', 'A21', 'A1', 'A23', 'A3', 'A4', 'A38', 'A31', 'A26', 'A40'], 
             'transect':['Mintak', 'Mintak', 'Mintak', 'Mintak', 'Mintak', 'Jacky', 'Jacky', 'Jacky', 'Jacky', 'Jacky', 'Bikols', 'Bikols', 'Bikols', 'Bikols'],
             'Distance':[0, 200, 400, 600, 800, 0, 200, 400, 600, 800, 0, 200, 400, 600]}
df_transects = pd.DataFrame(transects)
df_transects

# Process raven files and save csv files of processed data
The output is one CSV file per recorder folder, listing for each vocalization the .WAV file name and the timestamps of its start and end.

In [None]:
# Folder with the Raven selection tables (*.txt) used by the cells below.
raven_path = '/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga/'

In [None]:
# Count the 'Chimpanzee' rows of a single selection table as a quick check.
file = raven_path + '20210218_153318.Table.1.selections A38_JZ.txt'
species = 'Chimpanzee'
num_vocal = count_vocalizations(file, species)
num_vocal


In [None]:
import librosa
# Create file list
def fix_multifile(df_out, filelist, wav_path, get_duration=None):
    """Split annotations that span several WAV files into one row per file.

    A row is a multi-file annotation when ``begin_path != end_path``. For
    each such row these rows are appended and the original row is dropped:

    * one row for the first file, from ``start_time`` to the file length;
    * one row for the last file, from ``0.0`` to ``end_time`` minus the
      length of the first file;
    * when ``delta_t`` exceeds 65 seconds, one full-length row for every
      file listed in ``filelist`` between the first and the last file.

    Appended rows only carry ``begin_path``, ``end_path``, ``start_time``
    and ``end_time``; the other columns are left empty.

    Fix: the rows for the in-between files used to be appended outside the
    inner loop, so only the last in-between file was kept, and when there
    was none the two split rows were appended a second time.

    Args:
        df_out: annotation frame with an integer index and the columns
            named above plus ``delta_t``.
        filelist: sorted recording names without extension.
        wav_path: folder of the recordings, ending in a separator.
        get_duration: optional callable mapping a WAV file path to its
            length in seconds; defaults to ``librosa.get_duration``.

    Returns:
        A new DataFrame; ``df_out`` itself is not modified.
    """
    if get_duration is None:
        # NOTE(review): newer librosa releases rename ``filename`` to
        # ``path`` - confirm against the installed version.
        get_duration = lambda wav_file: librosa.get_duration(filename=wav_file)

    part_columns = ['begin_path', 'end_path', 'start_time', 'end_time']
    row_i = list(df_out[df_out['begin_path'] != df_out['end_path']].index)

    for i in row_i:
        first_file = df_out.loc[i, 'begin_path']
        last_file = df_out.loc[i, 'end_path']
        filelength = get_duration(wav_path + first_file + '.WAV')

        parts = [[first_file, first_file,
                  df_out.loc[i, 'start_time'], filelength],
                 # NOTE(review): only the first file's length is subtracted,
                 # also when the annotation spans more than two files.
                 [last_file, last_file,
                  0.0, df_out.loc[i, 'end_time'] - filelength]]

        if df_out.loc[i, 'delta_t'].total_seconds() > 65:
            # Every file strictly between the first and the last one is
            # covered completely by the annotation.
            for j in range(filelist.index(first_file) + 1,
                           filelist.index(last_file)):
                parts.append([filelist[j], filelist[j], 0.0,
                              get_duration(wav_path + filelist[j] + '.WAV')])

        # Continue the integer index after the current last label.
        last_index = df_out.index[-1]
        df = pd.DataFrame(parts, columns=part_columns,
                          index=range(last_index + 1,
                                      last_index + 1 + len(parts)))
        df_out = pd.concat([df_out, df], ignore_index=False)
    return df_out.drop(row_i)

In [None]:
# drop irrelevant columns
def clean_up(df):
    """Reduce an annotation frame to file, offsets and absolute times.

    Keeps ``begin_path``, ``start_time`` and ``end_time``, removes rows that
    last less than 0.2 seconds, renumbers the index from zero and adds
    ``start_datetime`` / ``end_datetime``: the timestamp parsed from
    ``begin_path`` plus the respective offset in seconds.
    """
    kept = df.loc[:, ['begin_path', 'start_time', 'end_time']]
    too_short = kept['end_time'] - kept['start_time'] < 0.2
    kept = kept.drop(kept[too_short].index).reset_index(drop=True)

    file_start = pd.to_datetime(kept['begin_path'], format="%Y%m%d_%H%M%S")
    kept['start_datetime'] = file_start + pd.to_timedelta(kept['start_time'],
                                                          unit='seconds')
    kept['end_datetime'] = file_start + pd.to_timedelta(kept['end_time'],
                                                        unit='seconds')
    return kept

In [None]:
# Process every annotation file listed in df_summary: split multi-file
# annotations, clean up, write one CSV per recorder and collect all rows
# in df_all.
base_wav_path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'

species = 'Chimpanzee'
processed_frames = []

for f in df_summary['file']:

    file = raven_path + f
    recorder = re.search("A[0-9]+", f).group(0)
    # fix_multifile needs the folder of this recorder; it was previously
    # read from a variable that is only set in a later cell.
    wav_path = base_wav_path + recorder + '/'

    print(recorder)

    filelist = wav_list(base_wav_path, recorder)
    df = process_raven(file, species)
    df = fix_multifile(df, filelist, wav_path)

    df = clean_up(df)
    df['recorder'] = recorder
    df.to_csv('/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_processed/' + recorder  + '.csv')
    processed_frames.append(df)

# The earlier df_all.append(...) discarded its result, so df_all only ever
# held the first recorder.
df_all = (pd.concat(processed_frames, ignore_index=True)
          if processed_frames else pd.DataFrame())


In [None]:
# The cell above for a single recorder
# NOTE(review): `file` and `species` are not set here; they come from
# whichever cell ran before - confirm `file` is the A38 selection table.
base_wav_path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'
recorder = 'A38'
wav_path = base_wav_path + recorder + '/'
filelist = wav_list(base_wav_path, recorder)
df = process_raven(file, species)
# Split annotations that run across file boundaries into one row per file.
df = fix_multifile(df, filelist, wav_path)

df = clean_up(df)
df['recorder'] = recorder
# One CSV per recorder, named after the recorder.
df.to_csv('/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_processed/' + recorder  + '.csv')

In [None]:
# Fetch every recording referenced in `df` with yoda_get (iget).
# NOTE(review): both paths point at A6 while `df` holds whatever the
# previously run cell produced - confirm they refer to the same recorder.
yodapath = '/nluu6p/home/research-zwerts/data/sanaga/A6/'
destination = '/home/jelle/Repositories/animalsounds/data/sanaga/A6/'

# create Unique list 
for file in list(df['begin_path'].unique()):
    
    print(file + '.WAV')
    yoda_get(file + '.WAV', yodapath, destination)

# Find vocalizations in A6 & A22

In [None]:
## Change order, first fix multifile, then timestamps

## get 2 dataframes
# Folder with the per-recorder CSV files written by the processing cells.
processed_path = '/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_processed/'





# Processed annotations of A6 and A22, with parsed datetime columns.
df6 = read_processed_csv(processed_path, recorder = 'A6')
recorder = 'A22'
df22 = read_processed_csv(processed_path, recorder = 'A22')


In [None]:
## Calculate fraction found based on timestamps
### calculate df22 extent (create function cell 1)
path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'
recorder = 'A22'
# [recorder, start, end, duration] of the A22 recordings.
A22_overview = get_folder_stats(path, recorder)
A22_overview



In [None]:
from collections import deque

### create df6 subset
# Keep only A6 annotations that lie strictly inside the A22 recording
# period (A22_overview[1] is the start, A22_overview[2] the end).
df6_subset = df6[(df6["start_datetime"] > A22_overview[1]) & (df6["end_datetime"] < A22_overview[2])]

## sort dataframes
df22 = df22.sort_values(by='start_datetime')
df6_subset = df6_subset.sort_values(by='start_datetime')

# WITHIN
# Queues of (start_datetime, end_datetime) tuples, ordered by start time.
main = deque(list(df6_subset[["start_datetime", "end_datetime"]].itertuples(index=False, name=None)))
dist = deque(list(df22[["start_datetime", "end_datetime"]].itertuples(index=False, name=None)))

# Raises IndexError when df22 holds no rows.
dist_start, dist_end = dist[0]

dist[0][0]

In [None]:
# Temporarily raise the row limit to show df22, then lower it to 10.
pd.options.display.max_rows = 100
display(df22)
pd.options.display.max_rows = 10

In [None]:
# Sweep the sorted `main` (A6) and `dist` (A22) interval queues from left to
# right and accumulate, in seconds:
#   within_time - time during which a main and a dist interval coincide
#   new_time    - dist time that is not covered by the main interval at hand
#   not_found   - length of main intervals that end before the next dist
#   margins     - main time around a dist interval that it fully contains
# hit_types records one tuple per step:
#   (label, new_time, main_start, main_end, dist_start, dist_end)
#
# Fixes relative to the earlier version of this cell:
#   * the sweep stops when either queue is empty (it used to loop forever
#     once `main` ran out while `dist` still held intervals);
#   * several dist intervals inside one main interval are handled by the
#     regular sweep; the former special case read dist[2] without a length
#     check and added the first dist interval to within_time twice;
#   * intervals that merely touch count as 'main before dist' instead of
#     dropping the dist interval.
hit_types = []
new_time = 0.0
within_time = 0.0
not_found = 0.0
margins = 0.0

while len(dist) > 0 and len(main) > 0:
    dist_start, dist_end = dist[0]
    main_start, main_end = main[0]
    if len(hit_types) > 0:
        print(hit_types[-1])

    if main_end <= dist_start:
        #            <----- dist ----->
        # <-main->
        # main vocalization did not reach a dist vocalization yet
        # remove the current vocalization, move to the next
        hit_types.append(('main before dist', new_time, main_start,
            main_end, dist_start, dist_end))

        not_found += (main_end - main_start).total_seconds()
        main.popleft()

    elif main_start <= dist_start and main_end >= dist_end:
        #    <------- dist ------->       <--?dist?-->
        # <---------- main ----------------------------->
        # this is what we expect; the main interval stays in the queue so a
        # following dist interval is compared against it as well
        within_time += (dist_end - dist_start).total_seconds()
        # NOTE(review): with several dist intervals inside one main interval
        # the margins overlap each other - margins is not used further below.
        margins += (dist_start - main_start).total_seconds()
        margins += (main_end - dist_end).total_seconds()

        hit_types.append(('perfect', new_time, main_start,
                        main_end, dist_start, dist_end))

        dist.popleft()
        continue

    elif main_start <= dist_start and main_end <= dist_end:
        #          <----- dist ----->
        # <-----main-------->
        # this is bad
        within_time += (main_end - dist_start).total_seconds()
        new_time += (dist_end - main_end).total_seconds()
        hit_types.append(('distant vocalization ends too late', new_time, main_start,
                        main_end, dist_start, dist_end))
        # remove the current main interval, keep the dist interval
        main.popleft()

    elif main_start >= dist_start and main_end <= dist_end:
        #    <----- dist ----->
        #         <-main-> <----main->
        # this is bad
        within_time += (main_end - main_start).total_seconds()
        new_time += (main_start - dist_start).total_seconds()
        new_time += (dist_end - main_end).total_seconds()
        hit_types.append(('distant voc is longer on both sides', new_time, main_start,
                        main_end, dist_start, dist_end))
        # remove the current main interval, keep the dist interval
        main.popleft()

    elif main_start >= dist_start and main_start < dist_end and main_end >= dist_end:
        #    <----- dist ----->
        #                 <-main->
        # this is bad
        within_time += (dist_end - main_start).total_seconds()
        new_time += (main_start - dist_start).total_seconds()
        hit_types.append(('distant vocalization starts too soon',  new_time, main_start,
                        main_end, dist_start, dist_end))
        # remove the current dist interval, keep the main interval
        dist.popleft()
        continue
    else:
        #  <----- dist ----->
        #                     <----main----->
        # Not found before!
        hit_types.append(('dist before main', new_time, main_start,
            main_end, dist_start, dist_end))
        new_time += (dist_end - dist_start).total_seconds()
        dist.popleft()
        continue

    # Only reached after a main interval was removed.
    print(len(dist))
        
              


In [None]:
# Total annotated time in seconds for the A6 subset and for A22.
total_A6 = (df6_subset["end_datetime"] - df6_subset["start_datetime"]).sum().total_seconds()
total_A22 = (df22["end_datetime"] - df22["start_datetime"]).sum().total_seconds()

### What do we want to find??
## How much of the A6 vocalizations can also be heard in A22? >>> Within time
print("Within time = " + str(within_time))

## How much of the A6 vocalizations cannot be heard in A22? Total_A6 - within time
print("Not found = " + str(total_A6 - within_time))

## How much of A22 is new? Total_A22 - within time
print("New found = " + str(total_A22 - within_time))

In [None]:
#df6_subset["duration [s]"] = 
# Total annotated time in seconds; repeats the computation of the cell above.
total_A6 = (df6_subset["end_datetime"] - df6_subset["start_datetime"]).sum().total_seconds()
total_A22 = (df22["end_datetime"] - df22["start_datetime"]).sum().total_seconds()

In [None]:
# Report each position where a 'main before dist' step is directly followed
# by a 'dist before main' step.
for i, (previous_hit, current_hit) in enumerate(zip(hit_types, hit_types[1:]), start=1):
    missed_main = previous_hit[0] == 'main before dist'
    missed_dist = current_hit[0] == 'dist before main'
    if missed_main and missed_dist:
        print(str(i) + "is not found")

# Run for all recorders

In [None]:
from collections import deque

def read_processed_csv(path, recorder):
    """Load ``<path><recorder>.csv`` and parse its two datetime columns.

    ``start_datetime`` and ``end_datetime`` are converted with
    ``pd.to_datetime``; all other columns are returned as read.
    """
    csv_file = path + recorder + '.csv'
    df = pd.read_csv(csv_file)
    for column in ("start_datetime", "end_datetime"):
        df[column] = pd.to_datetime(df[column])
    return df

def overlap(main, dist):
    """Sum the time during which main and distant vocalizations coincide.

    Both arguments are deques of ``(start, end)`` pairs ordered by start.
    The values must be comparable and their difference must offer
    ``total_seconds()``. The deques are consumed: at every step the interval
    that cannot overlap anything further is popped from its queue.

    Fixes relative to the earlier version:

    * empty input returns ``(0.0, [])`` instead of raising IndexError;
    * the sweep stops when either queue is empty (it used to loop forever
      once ``main`` ran out while ``dist`` still held intervals);
    * several distant intervals inside one main interval are handled by the
      regular sweep; the former special case read ``dist[2]`` without a
      length check and added the first distant interval twice;
    * intervals that merely touch count as 'main before dist' instead of
      dropping the distant interval.

    Returns:
        ``(within_time, hit_types)``: the shared time in seconds and one
        tuple ``(label, main_start, main_end, dist_start, dist_end)`` per
        step of the sweep.
    """
    hit_types = []
    within_time = 0.0

    while main and dist:
        main_start, main_end = main[0]
        dist_start, dist_end = dist[0]

        if main_end <= dist_start:
            #            <----- dist ----->
            # <-main->
            label, shared, finished = 'main before dist', None, main
        elif main_start <= dist_start and main_end >= dist_end:
            #    <------- dist ------->
            # <---------- main ---------->
            # main stays queued so that a following distant interval is
            # compared against it as well
            label, shared, finished = 'perfect', dist_end - dist_start, dist
        elif main_start <= dist_start and main_end <= dist_end:
            #          <----- dist ----->
            # <-----main-------->
            label = 'distant vocalization ends too late'
            shared, finished = main_end - dist_start, main
        elif main_start >= dist_start and main_end <= dist_end:
            #    <----- dist ----->
            #         <-main->
            label = 'distant voc is longer on both sides'
            shared, finished = main_end - main_start, main
        elif main_start >= dist_start and main_start < dist_end and main_end >= dist_end:
            #    <----- dist ----->
            #                 <-main->
            label = 'distant vocalization starts too soon'
            shared, finished = dist_end - main_start, dist
        else:
            #  <----- dist ----->
            #                     <----main----->
            label, shared, finished = 'dist before main', None, dist

        if shared is not None:
            within_time += shared.total_seconds()
        hit_types.append((label, main_start, main_end, dist_start, dist_end))
        finished.popleft()

    return within_time, hit_types

def calculate_totals(df_main, df_dist, within_time, recorder=None):
    """Build a one-row summary of the overlap between two recorders.

    Args:
        df_main: annotations of the main recorder with ``start_datetime``
            and ``end_datetime`` columns.
        df_dist: annotations of the distant recorder, same columns.
        within_time: shared time in seconds, as returned by ``overlap``.
        recorder: label for the ``recorder`` column. When omitted the
            notebook-level variable ``rec_dist`` is used, which is what
            this function always read before the parameter existed.

    Returns:
        DataFrame with one row and the columns ``recorder``,
        ``total_main``, ``within_sec``, ``not_found``
        (``total_main - within_time``) and ``new_found`` (total distant
        time minus ``within_time``); all times in seconds.
    """
    if recorder is None:
        # Backward compatibility with callers that rely on the global.
        recorder = rec_dist

    total_main = (df_main["end_datetime"] - df_main["start_datetime"]).sum().total_seconds()
    total_dist = (df_dist["end_datetime"] - df_dist["start_datetime"]).sum().total_seconds()
    totals = {'recorder': [recorder],
              'total_main': total_main,
              'within_sec': within_time,
              'not_found': total_main - within_time,
              'new_found': total_dist - within_time}

    return pd.DataFrame(totals)


# For every main recorder, compare its annotations with those of the other
# recorders on the same transect and collect one summary row per pair.
processed_path = '/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_processed/'
path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'

# Main recorder -> distant recorders it is compared with.
transects_by_main = {'A6': ['A22', 'A5'],
                     'A21': ['A1', 'A3', 'A4'],
                     'A38': ['A26']}

totals_columns = ['recorder', 'total_main', 'within_sec', 'not_found', 'new_found']
# The rows are gathered in a list and concatenated once at the end;
# DataFrame.append is not available in current pandas releases.
totals_frames = []

for rec_main, transect in transects_by_main.items():

    df_main = read_processed_csv(processed_path, recorder = rec_main)
    main_folder_stats = get_folder_stats(path, rec_main)

    for rec_dist in transect:

        print(rec_dist)

        df_dist = read_processed_csv(processed_path, recorder = rec_dist)

        ## Calculate fraction found based on timestamps

        dist_folder_stats = get_folder_stats(path, rec_dist)

        ### restrict both sets to the period in which both recorders ran
        df_main_subset = df_main[(df_main["start_datetime"] > dist_folder_stats[1]) & (df_main["end_datetime"] < dist_folder_stats[2])]
        df_dist = df_dist[(df_dist["start_datetime"] > main_folder_stats[1]) & (df_dist["end_datetime"] < main_folder_stats[2])]

        ## sort dataframes
        df_dist = df_dist.sort_values(by='start_datetime')
        df_main_subset = df_main_subset.sort_values(by='start_datetime')

        main = deque(df_main_subset[["start_datetime", "end_datetime"]].itertuples(index=False, name=None))
        dist = deque(df_dist[["start_datetime", "end_datetime"]].itertuples(index=False, name=None))
        if main and dist:
            within_time, hit_types = overlap(main, dist)
        else:
            # Nothing to compare: no shared time.
            within_time, hit_types = 0.0, []

        # calculate_totals labels the row with the current value of rec_dist.
        df = calculate_totals(df_main_subset, df_dist, within_time)

        totals_frames.append(df)

if totals_frames:
    df_totals = pd.concat(totals_frames)
else:
    df_totals = pd.DataFrame(columns=totals_columns)
df_totals


In [None]:
# Share of the main recorder's annotated time that coincides with
# annotations of the distant recorder.
df_totals['fraction_found'] = df_totals['within_sec']/df_totals['total_main'] 

# Inner join of summary and transect table on their shared column(s), then
# left join the totals; recorders without totals get NaN.
test = pd.merge(df_summary, df_transects)
df_final = pd.merge(test, df_totals, how='left')
df_final

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
 
# Scatter plot of fraction_found against Distance, one colour and marker
# per transect (no regression line).
# give a list to the marker argument
sns.lmplot( x="Distance", y="fraction_found", data=df_final, fit_reg=False, hue='transect', legend=False, markers=["o", "x", "1"], scatter_kws={"s": 120, "alpha": 0.5})
 
# Move the legend to an empty part of the plot
plt.legend(loc='lower right')

plt.show()


## create test set

In [None]:
# Prepare the inputs for translating annotations of the main recorder
# (A21) into file/offset pairs of the distant recorder (A4).
processed_path = '/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_processed/'
path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'
rec_main = 'A21'
rec_dist = 'A4'

df_main = read_processed_csv(processed_path, recorder = rec_main)

df_dist = read_processed_csv(processed_path, recorder = rec_dist)

dist_folder_stats = get_folder_stats(path, rec_dist)

# Main annotations that lie strictly inside the distant recording period.
df_main_subset = df_main[(df_main["start_datetime"] > dist_folder_stats[1]) & (df_main["end_datetime"] < dist_folder_stats[2])]
base_wav_path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'

# Recordings of the distant recorder with the start time parsed from the
# file name.
filelist = pd.DataFrame(wav_list(base_wav_path, rec_dist), columns=['filename'])
filelist['date_time'] = pd.to_datetime(filelist["filename"], format="%Y%m%d_%H%M%S")
filelist.tail(20)

In [None]:
#wav_path = base_wav_path+rec_dist+'/'
#filelengths = []
#for idx, row in filelist.iterrows():
#    print(row)
#    filelengths.append(librosa.get_duration(filename=wav_path + row['filename'] + '.WAV'))

In [None]:
#filelist['filelength'] = filelengths

In [None]:
# translate main annotation in dist annotations
# NOTE(review): this cell is an unfinished draft and does not parse:
# `newrow =` has no right-hand side and two branches have no body. It also
# reads filelist.iloc[i, 2], a file-length column that is only created by
# code that is commented out above. The following cell implements the same
# idea in runnable form.

df_new = pd.DataFrame(columns=["file", "offset", "duration"])
# create loop
for idx, row in df_main_subset.iterrows():
    print(row['start_datetime'])
    
    # Advance to the last file that starts before the annotation does.
    i = 0
    while filelist.iloc[i+1, 1]< row['start_datetime']:
         i += 1
        
    print(filelist.iloc[i,1])
    # Offsets of the annotation relative to the start of that file, in seconds.
    start = (row['start_datetime'] - filelist.iloc[i,1]).total_seconds()
    end_seconds = (row['end_datetime'] - filelist.iloc[i,1]).total_seconds()
    print(start)
            
    file = wav_path + filelist.iloc[i,0] + '.WAV'
    # check if file exists and is not corrupt (seconds to filestart < filelength)
    if start < filelist.iloc[i,2] and os.path.getsize(file) > 0:
        if end_seconds < filelist.iloc[i,2]:
            # create and append annotation row
            duration = end_seconds - start
            newrow = 
        else:
            
    else:
        #continue for loop
        print("file error")
#4 check if end time is < filelength

# if #4 is yes, check if next file exists
# create annotation row
# create and append rows

In [None]:
# for each main annotation check preceding filename
# Translate every main-recorder annotation into (file, offset, duration)
# rows for the distant recorder `rec_dist`. Annotations that cannot be
# mapped (empty file, recorder gap) are noted in df_main_emptylist.
import librosa
import os
#main = deque(list(df_main_subset[["start_datetime", "end_datetime"]].itertuples(index=False, name=None)))
#dist = deque(list(df_dist[["start_datetime", "end_datetime"]].itertuples(index=False, name=None)))
base_wav_path = '/run/user/1000/gvfs/dav:host=science.data.uu.nl,ssl=true/research-zwerts/data/sanaga/'

filelist = pd.DataFrame(wav_list(base_wav_path, rec_dist), columns=['filename'])
filelist['date_time'] = pd.to_datetime(filelist["filename"], format="%Y%m%d_%H%M%S")

# Rows are gathered in a list and turned into df_new once at the end;
# DataFrame.append is not available in current pandas releases.
new_rows = []
df_main_emptylist = []

for j in list(df_main_subset.index):
    starttime = df_main_subset.loc[j,'start_datetime']
    endtime = df_main_subset.loc[j,'end_datetime']

    i=0
    # The pair (i, i+1) is inspected, so i may run up to len - 2. The former
    # bound `len(filelist)-2` never looked at the last pair of files.
    while i < len(filelist)-1:

        if (filelist.iloc[i, 1] <= starttime and filelist.iloc[i+1, 1] > starttime):
            print("good")
            print(filelist.iloc[i, 0])
            print(df_main_subset.loc[j, 'begin_path'])


            # ADD check whether file exists (files

            # Check file length
            file_size = os.path.getsize(base_wav_path + rec_dist + '/' + filelist.iloc[i, 0] + '.WAV')

            if file_size == 0:
                print("file is empty")
                # remove from prediction set
                filelist = filelist.drop([filelist.index[i]])
                df_main_emptylist.append(j)
                break
            else:
                # NOTE(review): newer librosa releases rename `filename` to
                # `path` - confirm against the installed version.
                filelength = librosa.get_duration(filename=base_wav_path + rec_dist + '/' + filelist.iloc[i, 0] + '.WAV')
                print(filelength)
                # Calculate offset
                offset = (starttime - filelist.iloc[i, 1]).total_seconds()
                print("offset is " + str(offset) + " seconds")
                duration = (endtime - starttime).total_seconds()
                if offset > filelength:
                    print("recorder gap")
                    # What to do
                    df_main_emptylist.append(j)
                    break

                if (offset+duration)>filelength:
                    print("Multifile")
                    while (offset+duration)>filelength and duration > 0.0 and i<len(filelist)-1:
                        ## Check if next file exists
                        file_size = os.path.getsize(base_wav_path + rec_dist + '/' + filelist.iloc[i+1, 0] + '.WAV')
                        if file_size == 0:
                            print("second file in multifile annotation is empty")
                            # This part should be removed from df_main
                            break

                        # Part of the annotation that lies in the current file.
                        duration_1 = filelength-offset

                        new_rows.append({"file": filelist.iloc[i, 0] + '.WAV',
                                         "offset":  offset,
                                         "duration": duration_1,
                                         })
                        # Continue with what is left in the next file.
                        i+=1
                        starttime_newfile = filelist.iloc[i,1]
                        duration = (endtime - starttime_newfile).total_seconds()
                        filelength = librosa.get_duration(filename=base_wav_path + rec_dist + '/' + filelist.iloc[i, 0] + '.WAV')
                        offset=0.0

                # Single-file annotation, or the last part of a multi-file one.
                new_rows.append({"file": filelist.iloc[i, 0] + '.WAV',
                                 "offset":  offset,
                                 "duration": duration,
                                 })

            break

        i+=1

df_new = pd.DataFrame(new_rows, columns=["file", "offset", "duration"])
df_new

In [None]:
# Write the translated annotations to '<output_path><folder>.csv'.
folder = 'A4'
output_path = '/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_test/chimpanzee/'
df_new.to_csv(output_path + folder + '.csv')

In [None]:
# Download the recordings referenced by the test annotations of `folder`
# and write every annotation to its own .wav file.
# soundfile is needed for sf.write below; elsewhere in the notebook it is
# only imported by a later cell.
import soundfile as sf

df = pd.read_csv('/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_test/chimpanzee/' + folder + '.csv')
df.head()

yodapath = '/nluu6p/home/research-zwerts/data/sanaga/' + folder + '/'
destination = '/home/jelle/Repositories/animalsounds/data/sanaga/' + folder + '/'

# create Unique list 
for file in list(df['file'].unique()):

    print(file)
    yoda_get(file, yodapath, destination)

annotations_path = '/home/jelle/Repositories/animalsounds/data/sanaga/' + folder + '/'
output_path = '/home/jelle/Repositories/animalsounds/data/sanaga_test/chimps/' + folder + '/'

for index, row in df.iterrows():
    # Open: load only the annotated part, resampled to 48 kHz
    filepath = annotations_path + row['file']
    y, sr = librosa.load(filepath, sr=48000,
                             offset=row['offset'],
                             duration=row['duration'])
    # Output name: <row index>_sanaga_<recording name>_<offset>.wav
    outfile = (output_path + str(index) + '_sanaga_' + row['file'][0:-4] + '_' + str(row['offset']) + '.wav')
    print(outfile)
    sf.write(outfile, y, sr)

In [None]:
# Write every annotation in `df` to its own .wav file.
import librosa
import soundfile as sf

annotations_path = '/home/jelle/Repositories/animalsounds/data/sanaga/' + folder + '/'
output_path = '/home/jelle/Repositories/animalsounds/data/sanaga_test/chimps/' + folder + '/'

for index, row in df.iterrows():
    # Open: load only the annotated part, resampled to 48 kHz
    filepath = annotations_path + row['file']
    y, sr = librosa.load(filepath, sr=48000,
                             offset=row['offset'],
                             duration=row['duration'])
    # Output name: <row index>_sanaga_<recording name>_<offset>.wav
    outfile = (output_path + str(index) + '_sanaga_' + row['file'][0:-4] + '_' + str(row['offset']) + '.wav')
    print(outfile)
    sf.write(outfile, y, sr)


In [None]:
# for Sanaga processed csv format (only run for main recorders)

# Load the processed annotations of one main recorder as written by the
# processing cells (datetime columns stay unparsed here).
folder = 'A38'
file = '/home/jelle/Repositories/animalsounds/data/raven_annotations/sanaga_processed/' + folder + '.csv'
df = pd.read_csv(file)

In [None]:
# Fetch every recording referenced in `df` for the recorder in `folder`.
yodapath = '/nluu6p/home/research-zwerts/data/sanaga/' + folder + '/'
destination = '/home/jelle/Repositories/animalsounds/data/sanaga/' + folder + '/'

# create Unique list 
for file in list(df['begin_path'].unique()):
    
    print(file)
    # begin_path holds the name without extension.
    yoda_get(file + '.WAV', yodapath, destination)

In [None]:
import librosa
import soundfile as sf

# Write every processed annotation in `df` to its own .wav file.
annotations_path = '/home/jelle/Repositories/animalsounds/data/sanaga/' + folder + '/'
output_path = '/home/jelle/Repositories/animalsounds/data/sanaga_test/chimps/' + folder + '/'

for index, row in df.iterrows():
    # Open: load only the annotated part, resampled to 48 kHz
    filepath = annotations_path + row['begin_path'] + '.WAV'
    # Length of the annotation in seconds.
    duration = row['end_time']-row['start_time']
    y, sr = librosa.load(filepath, sr=48000,
                             offset=row['start_time'],
                             duration=duration)
    # Output name: <row index>_sanaga_<recording name>_<start offset>.wav
    outfile = (output_path + str(index) + '_sanaga_' + row['begin_path'] + '_' + str(row['start_time']) + '.wav')
    print(outfile)
    sf.write(outfile, y, sr)