In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Input CSV that every stratify_* call in this notebook reads from.
path = "Scripps_Microfaune_Global_Scores.csv"

In [3]:
def stratify_datafile(filepath, destination_filepath, count=1, random_state=None):
    """Write a random sample of recordings stratified by device and hour.

    Reads the CSV at ``filepath`` and keeps rows that have an ``AudioMothCode``
    and a ``Comment``, a ``Duration`` of at least 60.0 and a ``FileSize`` of at
    least 40000000.  Up to ``count`` of the remaining rows are then drawn at
    random from every (``AudioMothCode``, hour) group and written to
    ``destination_filepath`` without the index and without the helper ``hour``
    column.

    The hour is parsed from the third and fourth whitespace-separated tokens
    of ``Comment``, which must read ``HH:MM:SS DD/MM/YYYY``.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read.
    destination_filepath : str or path-like
        CSV file the sample is written to.
    count : int, default 1
        Maximum number of rows drawn per group.  A group holding fewer rows
        contributes all of them instead of raising.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``; set it to make the sample
        reproducible.  ``None`` (the default) draws a fresh sample each call.

    Returns
    -------
    bool
        ``True`` if a non-empty sample was written, ``False`` if no row
        survived the filters (in which case no file is written).

    Raises
    ------
    ValueError
        If a retained ``Comment`` does not carry a timestamp in the expected
        format.
    """
    df = pd.read_csv(filepath)
    keep = ((df['AudioMothCode'].notna()) &
            (df['Duration'].notna()) &
            (df['Comment'].notna()) &
            (df['FileSize'].notna()) &
            (df['Duration'] >= 60.0) &
            (df['FileSize'] >= 40000000))
    # Copy so the 'hour' column is added to an independent frame, not a slice.
    df = df[keep].copy()

    # Bail out before parsing: an empty object column has no .dt accessor.
    if df.empty:
        return False

    recorded_at = df['Comment'].apply(
        lambda text: datetime.strptime(' '.join(text.split()[2:4]),
                                       '%H:%M:%S %d/%m/%Y'))
    df['hour'] = pd.to_datetime(recorded_at).dt.hour

    # Shuffle once, then keep the first `count` rows of each stratum.  This is
    # a uniform draw without replacement that tolerates strata smaller than
    # `count`, and it does not depend on groupby.apply retaining the grouping
    # columns.
    strata = ['AudioMothCode', 'hour']
    sampled = df.sample(frac=1, random_state=random_state)\
                .groupby(strata, sort=False)\
                .head(count)\
                .sort_values(strata)\
                .drop(columns='hour')

    if sampled.size > 0:
        sampled.to_csv(destination_filepath, index=False)
        return True
    return False

In [4]:
# One clip per (device, hour) stratum.
stratify_datafile(filepath=path,
                  destination_filepath="Stratified_Random_Sample.csv")
# Four clips per stratum: a larger pool that a later cell narrows to dawn/dusk hours.
stratify_datafile(filepath=path,
                  destination_filepath="Stratified_Random_Sample_DawnDusk.csv",
                  count=4)

True

In [5]:
def stratify_datafile_skewed(filepath, destination_filepath, count=1,
                             random_state=None, min_score=0.50):
    """Write a stratified random sample restricted to high-scoring recordings.

    Reads the CSV at ``filepath`` and keeps rows that have an ``AudioMothCode``
    and a ``Comment``, a ``Duration`` of at least 60.0, a ``FileSize`` of at
    least 40000000 and a ``GlobalScore`` of at least ``min_score``.  Up to
    ``count`` of the remaining rows are then drawn at random from every
    (``AudioMothCode``, hour) group and written to ``destination_filepath``
    without the index and without the helper ``hour`` column.

    The hour is parsed from the third and fourth whitespace-separated tokens
    of ``Comment``, which must read ``HH:MM:SS DD/MM/YYYY``.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read.
    destination_filepath : str or path-like
        CSV file the sample is written to.
    count : int, default 1
        Maximum number of rows drawn per group.  A group holding fewer rows
        contributes all of them instead of raising.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``; set it to make the sample
        reproducible.  ``None`` (the default) draws a fresh sample each call.
    min_score : float, default 0.50
        Lowest ``GlobalScore`` a row may have to be eligible.

    Returns
    -------
    bool
        ``True`` if a non-empty sample was written, ``False`` if no row
        survived the filters (in which case no file is written).

    Raises
    ------
    ValueError
        If a retained ``Comment`` does not carry a timestamp in the expected
        format.
    """
    df = pd.read_csv(filepath)
    keep = ((df['AudioMothCode'].notna()) &
            (df['Duration'].notna()) &
            (df['Comment'].notna()) &
            (df['FileSize'].notna()) &
            (df['Duration'] >= 60.0) &
            (df['FileSize'] >= 40000000) &
            (df['GlobalScore'] >= min_score))
    # Copy so the 'hour' column is added to an independent frame, not a slice.
    df = df[keep].copy()

    # Bail out before parsing: an empty object column has no .dt accessor.
    if df.empty:
        return False

    recorded_at = df['Comment'].apply(
        lambda text: datetime.strptime(' '.join(text.split()[2:4]),
                                       '%H:%M:%S %d/%m/%Y'))
    df['hour'] = pd.to_datetime(recorded_at).dt.hour

    # Shuffle once, then keep the first `count` rows of each stratum.  This is
    # a uniform draw without replacement that tolerates strata smaller than
    # `count`, and it does not depend on groupby.apply retaining the grouping
    # columns.
    strata = ['AudioMothCode', 'hour']
    sampled = df.sample(frac=1, random_state=random_state)\
                .groupby(strata, sort=False)\
                .head(count)\
                .sort_values(strata)\
                .drop(columns='hour')

    if sampled.size > 0:
        sampled.to_csv(destination_filepath, index=False)
        return True
    return False

In [6]:
# First skewed sample: one clip per (device, hour) stratum, drawn only from
# recordings that pass the GlobalScore cut-off inside stratify_datafile_skewed.
stratify_datafile_skewed(
    filepath=path,
    destination_filepath="Scripps_Microfaune_Global_Scores_Stratified_Skewed.csv",
)

True

In [7]:
# We define dawn as starting at 5:50 AM ==> 5.83 and ending at 9:40 AM ==> 9.67
# We define dusk as starting at 5:40 PM ==> 17.67 and ending at 8:00 PM ==> 20.00

In [9]:
# Skewed + DawnDusk: three clips per (device, hour) stratum.
stratify_datafile_skewed(
    filepath=path,
    destination_filepath="Scripps_Microfaune_Global_Scores_Stratified_DawnDusk.csv",
    count=3,
)

True

In [10]:
# Vanilla + dawn/dusk post-processing: keep only rows whose local time falls
# inside the dawn window (5.8-9.7) or the dusk window (17.6-20.1), bounds inclusive.
dawndusk_df1 = pd.read_csv("Stratified_Random_Sample_DawnDusk.csv")
test1 = dawndusk_df1[
    dawndusk_df1["Time (PDT)"].between(5.8, 9.7)
    | dawndusk_df1["Time (PDT)"].between(17.6, 20.1)
]

In [12]:
# This subset is good to go; it only needs to be sampled down to 240 rows from
# the ~260 it currently holds. Persist it here, because the final down-sampling
# cell reads "Stratified_Random_Sample_Vanilla_DawnDusk.csv" back from disk and
# no other cell writes that file.
test1.to_csv("Stratified_Random_Sample_Vanilla_DawnDusk.csv",index=False)

In [13]:
# Skewed + dawn/dusk post-processing: load the one-clip-per-stratum skewed
# sample and keep only rows whose local time falls inside the dawn window
# (5.8-9.7) or the dusk window (17.6-20.1), bounds inclusive.
dawndusk_df2 = pd.read_csv("Scripps_Microfaune_Global_Scores_Stratified_Skewed.csv")
test2 = dawndusk_df2[
    dawndusk_df2["Time (PDT)"].between(5.8, 9.7)
    | dawndusk_df2["Time (PDT)"].between(17.6, 20.1)
]

In [14]:
# Show the dawn/dusk subset of the skewed sample for a visual check.
test2

Unnamed: 0,AudioMothCode,AudioMothID,SourceFile,Directory,FileName,FileSize,SampleRate,Bitrate,BitsPerSample,Duration,Comment,Latitude,Longitude,GlobalScore,Time (PDT)
1,OFF0,AudioMoth 242A260460372176,OFF0/20210820013000.WAV,OFF0,20210820013000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 01:30:00 20/08/2021 (UTC) by Audio...,32.87618,117.24717,0.981866,18.500000
2,OFF0,AudioMoth 242A260460372176,OFF0/20210813023000.WAV,OFF0,20210813023000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 02:30:00 13/08/2021 (UTC) by Audio...,32.87618,117.24717,0.975981,19.500000
3,OFF0,AudioMoth 242A260460372176,OFF0/20210816030000.WAV,OFF0,20210816030000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 03:00:00 16/08/2021 (UTC) by Audio...,32.87618,117.24717,0.630479,20.000000
13,OFF0,AudioMoth 242A260460372176,OFF0/20210819134000.WAV,OFF0,20210819134000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 13:40:00 19/08/2021 (UTC) by Audio...,32.87618,117.24717,0.963063,6.666667
14,OFF0,AudioMoth 242A260460372176,OFF0/20210813143000.WAV,OFF0,20210813143000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 14:30:00 13/08/2021 (UTC) by Audio...,32.87618,117.24717,0.993789,7.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,OTH1,AudioMoth 242A2604603736FD,OTH1/20210814024000.WAV,OTH1,20210814024000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 02:40:00 14/08/2021 (UTC) by Audio...,32.87637,117.24926,0.994294,19.666667
227,OTH1,AudioMoth 242A2604603736FD,OTH1/20210821134000.WAV,OTH1,20210821134000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 13:40:00 21/08/2021 (UTC) by Audio...,32.87637,117.24926,0.988743,6.666667
228,OTH1,AudioMoth 242A2604603736FD,OTH1/20210817145000.WAV,OTH1,20210817145000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 14:50:00 17/08/2021 (UTC) by Audio...,32.87637,117.24926,0.670358,7.833333
229,OTH1,AudioMoth 242A2604603736FD,OTH1/20210816152000.WAV,OTH1,20210816152000.WAV,46080488,384000.0,6000.0,16,60.0,Recorded at 15:20:00 16/08/2021 (UTC) by Audio...,32.87637,117.24926,0.992499,8.333333


In [19]:
# We define dawn as starting at 5:50 AM ==> 5.83 and ending at 9:40 AM ==> 9.67
# We define dusk as starting at 5:40 PM ==> 17.67 and ending at 8:00 PM ==> 20.00
def stratify_datafile_skewed_dawndusk(filepath, destination_filepath, count=1,
                                      random_state=None, min_score=0.50):
    """Write a stratified random sample of high-scoring dawn/dusk recordings.

    Reads the CSV at ``filepath`` and keeps rows that have an ``AudioMothCode``
    and a ``Comment``, a ``Duration`` of at least 60.0, a ``FileSize`` of at
    least 40000000, a ``GlobalScore`` of at least ``min_score`` and a
    ``Time (PDT)`` inside either 5.8-9.7 or 17.6-20.01 (bounds inclusive).
    Up to ``count`` of the remaining rows are then drawn at random from every
    (``AudioMothCode``, hour) group and written to ``destination_filepath``
    without the index and without the helper ``hour`` column.

    The grouping hour is parsed from the third and fourth whitespace-separated
    tokens of ``Comment`` (``HH:MM:SS DD/MM/YYYY``); it is not derived from
    ``Time (PDT)``.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read.
    destination_filepath : str or path-like
        CSV file the sample is written to.
    count : int, default 1
        Maximum number of rows drawn per group.  A group holding fewer rows
        contributes all of them instead of raising.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``; set it to make the sample
        reproducible.  ``None`` (the default) draws a fresh sample each call.
    min_score : float, default 0.50
        Lowest ``GlobalScore`` a row may have to be eligible.

    Returns
    -------
    bool
        ``True`` if a non-empty sample was written, ``False`` if no row
        survived the filters (in which case no file is written).

    Raises
    ------
    ValueError
        If a retained ``Comment`` does not carry a timestamp in the expected
        format.
    """
    df = pd.read_csv(filepath)
    local_time = df["Time (PDT)"]
    # NOTE(review): the bounds sit slightly outside the nominal 5:50-9:40 and
    # 17:40-20:00 windows, presumably so fractional-hour values at the window
    # edges are not lost to rounding - confirm.
    in_dawn = (local_time >= 5.8) & (local_time <= 9.7)
    in_dusk = (local_time >= 17.6) & (local_time <= 20.01)
    keep = ((df['AudioMothCode'].notna()) &
            (df['Duration'].notna()) &
            (df['Comment'].notna()) &
            (df['FileSize'].notna()) &
            (df['Duration'] >= 60.0) &
            (df['FileSize'] >= 40000000) &
            (df['GlobalScore'] >= min_score) &
            (in_dawn | in_dusk))
    # Copy so the 'hour' column is added to an independent frame, not a slice.
    df = df[keep].copy()

    # Bail out before parsing: an empty object column has no .dt accessor.
    if df.empty:
        return False

    recorded_at = df['Comment'].apply(
        lambda text: datetime.strptime(' '.join(text.split()[2:4]),
                                       '%H:%M:%S %d/%m/%Y'))
    df['hour'] = pd.to_datetime(recorded_at).dt.hour

    # Shuffle once, then keep the first `count` rows of each stratum.  This is
    # a uniform draw without replacement that tolerates strata smaller than
    # `count`, and it does not depend on groupby.apply retaining the grouping
    # columns.
    strata = ['AudioMothCode', 'hour']
    sampled = df.sample(frac=1, random_state=random_state)\
                .groupby(strata, sort=False)\
                .head(count)\
                .sort_values(strata)\
                .drop(columns='hour')

    if sampled.size > 0:
        sampled.to_csv(destination_filepath, index=False)
        return True
    return False

In [21]:
stratify_datafile_skewed_dawndusk(path,"Stratified_skewed_Dawndusk2.csv",2)

True

In [22]:
# Breaking down the skewed dawn-dusk sample into 240 clips.
# NOTE(review): no cell visible in this notebook writes "Skewed_and_DawnDusk.csv";
# confirm where this file comes from before relying on a clean re-run.
skewed_dawndusk_df = pd.read_csv("Skewed_and_DawnDusk.csv")

In [23]:
# Draw 240 rows at random, without replacement; this raises if fewer than 240 rows exist.
skewed_dawndusk_reduced_df = skewed_dawndusk_df.sample(n=240)

In [25]:
# Write the 240-row skewed dawn/dusk sample to disk without the index column.
skewed_dawndusk_reduced_df.to_csv("Skewed_Dawndusk_Final.csv",index=False)

In [26]:
# Reduce the vanilla dawn/dusk subset to 240 clips and save the result.
dawndusk_df = pd.read_csv("Stratified_Random_Sample_Vanilla_DawnDusk.csv")
# Random draw without replacement; this raises if the file holds fewer than 240 rows.
dawndusk_reduced_df = dawndusk_df.sample(n=240)
dawndusk_reduced_df.to_csv("Dawndusk_Final.csv", index=False)