In [1]:
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np
import datetime

In [2]:
# Plan:
# Read in all of the metadata (check)
# Reduce the metadata by filtering out the faulty clips (check)
# For each faulty AudioMoth (AM), subtract its start time from each clip time
# Multiply the resulting elapsed times by four
# Add the start time back in
# Convert back to standard UTC time

In [3]:
# Location of the metadata CSV (relative to this notebook); going by its name,
# it already carries the Microfaune baseline global scores.
file_path = (
    "../../acoustic-id-test-data/"
    "MadreDeDios_Audiomoth_Dataset_Metadata_"
    "with_Microfaune_Baseline_Global_Scores.csv"
)

In [4]:
# Load the complete metadata table, one row per recorded clip.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in a single pass instead of chunk by chunk, so a column cannot end up with a
# different Python type in different chunks.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a DtypeWarning about mixed-type columns - confirm it is gone.
metadata_df = pd.read_csv(file_path, low_memory=False)
# Number of clips in the raw metadata
len(metadata_df)

  metadata_df = pd.read_csv(file_path)


98638

In [5]:
# Maps the name of every faulty Audiomoth to its recorded turn-on time.
# Each tuple below is (year, month, day, hour, minute, second) and is expanded
# into datetime.datetime(year, month, day, hour, minute, second).
# Background on differences between datetimes:
# https://www.geeksforgeeks.org/python-difference-between-two-dates-in-minutes-using-datetime-timedelta-method/
# The former TODO - readjust these to use the "Programming Time" column rather
# than the "File" column - is DONE.
bad_am_dict = {
    device: datetime.datetime(*turn_on)
    for device, turn_on in (
        ("AM-20", (2019, 6, 9, 9, 23, 0)),
        ("AM-3", (2019, 6, 9, 23, 8, 0)),
        ("AM-4", (2019, 6, 9, 23, 16, 0)),
        ("AM-5", (2019, 6, 10, 0, 10, 0)),
        ("AM-6", (2019, 6, 9, 22, 6, 0)),
        ("AM-7", (2019, 6, 9, 7, 13, 0)),
        ("AM-8", (2019, 6, 9, 23, 23, 0)),
        ("AM-9", (2019, 6, 9, 23, 11, 0)),
        ("AM-10", (2019, 6, 10, 0, 5, 0)),
        ("AM-11", (2019, 6, 10, 0, 45, 0)),
        ("AM-12", (2019, 6, 10, 0, 43, 0)),
        ("AM-13", (2019, 6, 10, 0, 48, 0)),
        ("AM-14", (2019, 6, 10, 0, 36, 0)),
        ("AM-15", (2019, 6, 10, 1, 3, 0)),
        ("AM-16", (2019, 6, 10, 0, 50, 0)),
        ("AM-17", (2019, 6, 10, 1, 8, 0)),
        ("AM-18", (2019, 6, 9, 23, 25, 0)),
        # Two other faulty recorders, "AM-23" and "WWF-2", are not listed
        # because no initial timestamp has been received for them. The first
        # time on the deployment date could potentially be used instead, TBD.
    )
}
# Names of the faulty devices, in the order they are listed above
bad_am_list = [*bad_am_dict]

In [6]:
# Drop the faulty clips: those that had errors while recording or that failed
# to receive a Microfaune score.
# (Restricting the table to the devices in bad_am_list is currently disabled:
#  metadata_df_reduced = metadata_df[metadata_df['AudioMothCode'].isin(bad_am_list)])
metadata_df_reduced = metadata_df.loc[
    metadata_df['Duration'].ge(60)
    & metadata_df['Error'].isna()
    & metadata_df['FileSize'].ge(46080000)
]
# A Global_Score of -1 marks a clip without a Microfaune score
metadata_df_reduced = metadata_df_reduced.loc[metadata_df_reduced["Global_Score"].ne(-1)]

In [7]:
# Build the "ExtractedTime" column from the "Comment" column: the third and
# fourth whitespace-separated tokens of each comment are parsed as
# "<HH:MM:SS> <DD/MM/YYYY>".
metadata_df_reduced['ExtractedTime'] = pd.to_datetime(
    metadata_df_reduced['Comment'].map(
        lambda comment: datetime.datetime.strptime(
            ' '.join(comment.split()[2:4]),
            '%H:%M:%S %d/%m/%Y',
        )
    )
)
# Renumber the rows 0..n-1 now that the faulty clips have been removed
metadata_df_reduced = metadata_df_reduced.reset_index(drop=True)

In [8]:
# Correct the clip timestamps of the faulty Audiomoths and gather every device
# into one DataFrame (corrected_df) with the corrected times in shared columns.
#
# The metadata says that the faulty devices were recording 1 minute every
# 10 minutes; in reality they were recording 1 minute every 40 minutes. The
# time elapsed since a faulty device was turned on is therefore multiplied by
# this factor.
ELAPSED_TIME_SCALE = 4

# Dictionary of parallel lists holding one entry per corrected clip, kept as a
# long-term record of the corrections that were applied.
timing_correction_df = {
    "AudioMoth"            : [],
    "FileName"             : [],
    "IncorrectDate"        : [],
    "IncorrectElapsedTime" : [],
    "CorrectElapsedTime"   : [],
    "CorrectDate"          : []
}

# One frame per device, concatenated once after the loop (calling pd.concat
# inside the loop would re-copy the accumulated rows on every iteration).
per_device_frames = []
for am in metadata_df_reduced["AudioMothCode"].unique():
    # .copy() gives an independent frame, so the column assignments below are
    # plain writes that do not depend on chained assignment (which is a
    # silent no-op when pandas Copy-on-Write is active).
    am_df = metadata_df_reduced[metadata_df_reduced["AudioMothCode"] == am].copy()
    if am in bad_am_dict:
        turn_on_time = bad_am_dict[am]
        # Whole-column arithmetic replaces the former row-by-row loop
        incorrect_elapsed_time = am_df["ExtractedTime"] - turn_on_time
        corrected_elapsed_time = incorrect_elapsed_time * ELAPSED_TIME_SCALE

        # Same column order as before: incorrect values first, then corrected
        am_df["IncorrectDate"] = incorrect_elapsed_time + turn_on_time
        am_df["IncorrectElapsedTime"] = incorrect_elapsed_time
        am_df["CorrectElapsedTime"] = corrected_elapsed_time
        am_df["CorrectDate"] = corrected_elapsed_time + turn_on_time

        # Inserting into dictionary for long-term storage
        timing_correction_df["AudioMoth"].extend(am_df["AudioMothCode"].tolist())
        timing_correction_df["FileName"].extend(am_df["FileName"].tolist())
        timing_correction_df["IncorrectDate"].extend(am_df["IncorrectDate"].tolist())
        timing_correction_df["IncorrectElapsedTime"].extend(am_df["IncorrectElapsedTime"].tolist())
        timing_correction_df["CorrectElapsedTime"].extend(am_df["CorrectElapsedTime"].tolist())
        timing_correction_df["CorrectDate"].extend(am_df["CorrectDate"].tolist())
    else:
        # Non-faulty devices keep their extracted times; their elapsed time is
        # measured from the device's first clip in the table (not from a
        # turn-on time, which is only known for the faulty devices).
        am_df["CorrectElapsedTime"] = am_df["ExtractedTime"] - am_df["ExtractedTime"].iloc[0]
        am_df["CorrectDate"] = am_df["ExtractedTime"]

    per_device_frames.append(am_df)

# Adding Audiomoths to final DataFrame; pd.concat raises on an empty list, so
# fall back to an empty DataFrame when no clips survived the filtering.
corrected_df = pd.concat(per_device_frames) if per_device_frames else pd.DataFrame()
display(corrected_df)

Unnamed: 0,AudioMothCode,AudioMothID,SourceFile,Directory,FileName,FileSize,Encoding,NumChannels,SampleRate,AvgBytesPerSec,...,FileCreateDate,FileType,FileTypeExtension,MIMEType,Global_Score,ExtractedTime,CorrectElapsedTime,CorrectDate,IncorrectDate,IncorrectElapsedTime
0,AM-1,243B1F055B2BEAB8,GRABADOR-SDZG-AM-1/20190616_150000.WAV,GRABADOR-SDZG-AM-1,20190616_150000.WAV,46080360,1.0,1.0,384000.0,768000.0,...,2019:12:20 03:13:11-08:00,WAV,WAV,audio/x-wav,0.024959,2019-06-16 15:00:00,0 days 00:00:00,2019-06-16 15:00:00,,
1,AM-1,243B1F055B2BEAB8,GRABADOR-SDZG-AM-1/20190616_151000.WAV,GRABADOR-SDZG-AM-1,20190616_151000.WAV,46080360,1.0,1.0,384000.0,768000.0,...,2019:12:20 03:13:13-08:00,WAV,WAV,audio/x-wav,0.777727,2019-06-16 15:10:00,0 days 00:10:00,2019-06-16 15:10:00,,
2,AM-1,243B1F055B2BEAB8,GRABADOR-SDZG-AM-1/20190616_152000.WAV,GRABADOR-SDZG-AM-1,20190616_152000.WAV,46080360,1.0,1.0,384000.0,768000.0,...,2019:12:20 03:13:15-08:00,WAV,WAV,audio/x-wav,0.685252,2019-06-16 15:20:00,0 days 00:20:00,2019-06-16 15:20:00,,
3,AM-1,243B1F055B2BEAB8,GRABADOR-SDZG-AM-1/20190616_153000.WAV,GRABADOR-SDZG-AM-1,20190616_153000.WAV,46080360,1.0,1.0,384000.0,768000.0,...,2019:12:20 03:13:16-08:00,WAV,WAV,audio/x-wav,0.949008,2019-06-16 15:30:00,0 days 00:30:00,2019-06-16 15:30:00,,
4,AM-1,243B1F055B2BEAB8,GRABADOR-SDZG-AM-1/20190616_154000.WAV,GRABADOR-SDZG-AM-1,20190616_154000.WAV,46080360,1.0,1.0,384000.0,768000.0,...,2019:12:20 03:13:18-08:00,WAV,WAV,audio/x-wav,0.036467,2019-06-16 15:40:00,0 days 00:40:00,2019-06-16 15:40:00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93685,5,,GRABADOR-WWF-5/5D2FC490.WAV,GRABADOR-WWF-5,5D2FC490.WAV,46080192,1.0,1.0,384000.0,768000.0,...,2019:12:07 17:19:33-08:00,WAV,WAV,audio/x-wav,0.041479,2019-07-18 01:00:00,1 days 06:50:00,2019-07-18 01:00:00,,
93686,5,,GRABADOR-WWF-5/5D2FCB98.WAV,GRABADOR-WWF-5,5D2FCB98.WAV,46080192,1.0,1.0,384000.0,768000.0,...,2019:12:07 17:19:38-08:00,WAV,WAV,audio/x-wav,0.024111,2019-07-18 01:30:00,1 days 07:20:00,2019-07-18 01:30:00,,
93687,5,,GRABADOR-WWF-5/5D2FCDF0.WAV,GRABADOR-WWF-5,5D2FCDF0.WAV,46080192,1.0,1.0,384000.0,768000.0,...,2019:12:07 17:19:43-08:00,WAV,WAV,audio/x-wav,0.011412,2019-07-18 01:40:00,1 days 07:30:00,2019-07-18 01:40:00,,
93688,5,,GRABADOR-WWF-5/5D2FD750.WAV,GRABADOR-WWF-5,5D2FD750.WAV,46080192,1.0,1.0,384000.0,768000.0,...,2019:12:07 17:19:49-08:00,WAV,WAV,audio/x-wav,0.042102,2019-07-18 02:20:00,1 days 08:10:00,2019-07-18 02:20:00,,


In [9]:
# Renumber the combined rows 0..n-1, then write the corrected metadata to a
# CSV file without the index column.
corrected_df = corrected_df.reset_index(drop=True)
corrected_df.to_csv(
    "FaultyTimingAMs_Corrected_Full.csv",
    index=False,
)