In [1]:
import gcsfs
import pandas as pd

The full 50 GB dataset is mostly composed of 2-hour-long recordings. Unfortunately, `warbleR` (like many R packages) seems to have problems with memory management, which makes parallel processing of these files on a 32 GB machine impossible. For this reason the files have been split into 10-minute-long pieces, and each piece has been given an integer index ranging from `1` to `N`. For instance, file `STHELENA-02_20140516_200000` would be divided into:

```
STHELENA-02_20140516_200000_001.wav
STHELENA-02_20140516_200000_002.wav
...

STHELENA-02_20140516_200000_012.wav
```

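The side effect is that `warbleR` will give us the `start` and `end` of each call relative to the split piece rather than the original recording, so all call times will fall between `0` and `600` s. We need to fix this by adding an offset of `(index - 1) * 600` seconds, where `index` is the piece number at the end of the file name.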

In [2]:
# Open the detected-calls table stored in the project's Google Cloud Storage
# bucket and load it into a DataFrame.
fs = gcsfs.GCSFileSystem(project='birdman-project')
with fs.open('storm-petrels/full-set-50GB/calls.csv') as f:
    df = pd.read_csv(f)

In [3]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,sound.files,selec,start,end
0,STHELENA-02_20140516_200000_004,1,17.415189,17.753002
1,STHELENA-02_20140516_200000_004,2,17.815502,17.985752
2,STHELENA-02_20140516_200000_004,3,19.453127,19.904315
3,STHELENA-02_20140516_200000_004,4,19.91219,20.490565
4,STHELENA-02_20140516_200000_004,5,20.595752,21.009127


In [4]:
def adjust_time(row, segment_seconds=600):
    """Shift a call's start/end from piece-relative to recording-relative time.

    The piece index is the text after the last underscore in
    ``row['sound.files']`` and is treated as 1-based: piece ``001`` gets no
    offset, piece ``002`` is shifted by one ``segment_seconds``, and so on.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'sound.files'``, ``'start'`` and ``'end'``.
    segment_seconds : int or float, default 600
        Length of one piece in seconds (600 s corresponds to 10-minute pieces).

    Returns
    -------
    pandas.Series
        Two values, positionally ``(start, end)``, with the offset added.

    Raises
    ------
    ValueError
        If the text after the last underscore is not an integer.
    """
    # rpartition yields the whole name as the tail when there is no underscore,
    # which matches slicing from rfind('_') + 1.
    piece_index = int(row['sound.files'].rpartition('_')[2])
    offset = (piece_index - 1) * segment_seconds
    return pd.Series((row['start'] + offset, row['end'] + offset))

In [5]:
# Run adjust_time on every row (axis=1) and store the two returned values as
# new columns, leaving the original start/end columns untouched.
df[['start_adjusted', 'end_adjusted']] = df.apply(adjust_time, axis=1)

In [6]:
# Preview the table again to check the newly added adjusted columns.
df.head()

Unnamed: 0,sound.files,selec,start,end,start_adjusted,end_adjusted
0,STHELENA-02_20140516_200000_004,1,17.415189,17.753002,1817.415189,1817.753002
1,STHELENA-02_20140516_200000_004,2,17.815502,17.985752,1817.815502,1817.985752
2,STHELENA-02_20140516_200000_004,3,19.453127,19.904315,1819.453127,1819.904315
3,STHELENA-02_20140516_200000_004,4,19.91219,20.490565,1819.91219,1820.490565
4,STHELENA-02_20140516_200000_004,5,20.595752,21.009127,1820.595752,1821.009127
