# Preprocessing

In [1]:
import configparser
import librosa
import noisereduce as nr
import numpy as np
import pandas as pd
import soundfile as sf
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from tqdm import tqdm_notebook

from src.utils import SegmentWithSlidingWindow, GetSortedSpeciesCode, OneHotEncoding

In [2]:
# Project settings live in "<pwd>/../setting/".
settingDir = Path.cwd().parent.joinpath('setting')
configPath = settingDir.joinpath('config.ini')

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config is missing.
if not config.read(str(configPath)):
  raise FileNotFoundError(f'Cannot read configuration file: {configPath}')

# Sliding-window length; it is compared against durations in seconds below.
WIN_LEN = config['Window'].getint('Length')
# Step between consecutive windows: the non-overlapping fraction of one window.
HOP_LEN = WIN_LEN * (1 - config['Window'].getfloat('Overlap'))
# Species codes used as the classification targets.
TARGET_SPECIES = GetSortedSpeciesCode(settingDir.joinpath('SPECIES.csv'))

## Reducing noise

1. 取得所有未降噪音檔路徑
2. 讀取未降噪音檔, 標準化音檔, 最後使用 [noisereduce](https://github.com/timsainb/noisereduce) 進行降噪 
3. 輸出降噪後檔案於 "_pwd/../data/NrAudio/_"
4. 刪除未降噪檔案 (僅刪除 "data/Audio" 下的自錄音檔; "data/OpenSource" 下的開源音檔予以保留)

In [3]:
def reduceNoise(filePaths:list, remove:bool=True):
  """Normalize and denoise audio files, writing the results to "<pwd>/../data/NrAudio".

  Each file is loaded at its native sample rate (``sr=None``), normalized with
  ``librosa.util.normalize``, denoised with ``noisereduce.reduce_noise`` and
  saved as "<stem>.wav" with the ``PCM_24`` subtype.

  Args:
    filePaths: Iterable of ``pathlib.Path`` objects pointing at the input audio files.
    remove: When True, delete each input file once its denoised copy is written.

  Note:
    The output name is built from the file stem only, so inputs sharing a stem
    (e.g. "a.mp3" and "a.wav") write to the same output file.
  """
  # Resolve the output directory once and make sure it exists before writing.
  outputDir = Path.cwd().parent.joinpath('data', 'NrAudio')
  outputDir.mkdir(parents=True, exist_ok=True)

  for filePath in tqdm_notebook(filePaths):
    audio, sr = librosa.load(str(filePath), sr=None)
    audio = librosa.util.normalize(audio.T)
    nrAudio = nr.reduce_noise(y=audio, sr=sr, use_tqdm=True)
    sf.write(
      outputDir.joinpath(f'{filePath.stem}.wav'),
      data=nrAudio, samplerate=sr, subtype='PCM_24'
    )
    if remove:
      # Destructive: the raw recording cannot be recovered after this point.
      filePath.unlink()

In [4]:
# Collect every recording that still has to be denoised.
dataRoot = Path.cwd().parent / 'data'
openSourceDir = dataRoot / 'OpenSource'

rawAudioPaths = sorted((dataRoot / 'Audio').glob('*.wav'))
openAudioPaths = sorted(openSourceDir.glob('*.mp3')) + sorted(openSourceDir.glob('*.wav'))

# Self-recorded files are deleted after denoising; open-source files are kept.
reduceNoise(rawAudioPaths, remove=True)
reduceNoise(openAudioPaths, remove=False)

0it [00:00, ?it/s]

  0%|          | 0/544 [00:00<?, ?it/s]

## Generating Auto-encoder Classifier dataset

將人工標記檔案細切，以取得乾淨、較高品質資料

1. 去除非屬song-type之鳥音
2. 將剩餘鳥音以適當演算法進行切割
3. 切除後鳥音如大於 WIN_LEN 則進行滑移視窗處理, 如否則不處理
4. 僅保留時長大於 0.5 秒的標記 (程式以 `> 0.5` 判斷, 恰為 0.5 秒者不保留)

In [5]:
def segmentSignalLabel(df:pd.DataFrame):
  """Cut possibly-overlapping labelled intervals into consecutive segments.

  Every interval contributes a start ('L') and an end ('R') event. The events
  are swept in time order, and for each stretch between two consecutive events
  during which at least one interval is open, one segment is emitted carrying
  the set of labels open during that stretch.

  Args:
    df: DataFrame with 'start time', 'end time' and 'label' columns. Rows are
      tracked by position, so the index does not need to be 0..n-1.

  Returns:
    A list of ``[start, end, labels]`` entries where ``labels`` is a ``set``.
    Zero-length segments are emitted when two events share a timestamp while
    an interval is open; callers are expected to filter them by duration.

  Raises:
    ValueError: If a row ends before it starts.
  """
  events = []
  labels = []
  rows = zip(df['start time'], df['end time'], df['label'])
  for pos, (start, end, label) in enumerate(rows):
    if end < start:
      raise ValueError(f'Row {pos} ends ({end}) before it starts ({start})')
    events.append((start, 'L', pos))
    events.append((end, 'R', pos))
    labels.append(label)
  # Sort by time; on ties 'L' sorts before 'R', so touching intervals overlap briefly.
  events.sort()

  active = set()      # positions of the intervals that are currently open
  results = []
  previousTime = None
  for time, kind, pos in events:
    # Close the stretch since the previous event if any interval was open in it.
    if active:
      results.append([previousTime, time, {labels[k] for k in active}])
    if kind == 'L':
      active.add(pos)
    else:
      active.remove(pos)
    previousTime = time
  return results

In [6]:
def generateClassifierDataset(filePaths:list, outputFilename:str, minDuration:float=0.5):
  """Build a classifier dataset CSV from tab-separated label files.

  For every label file: keep only labels containing '-S' (the song-type
  marker), reduce them to the part before the first '-', cut overlapping
  intervals with ``segmentSignalLabel``, clamp end times to the duration of the
  matching denoised audio, slide a ``WIN_LEN`` window with step ``HOP_LEN`` over
  segments longer than ``WIN_LEN``, and keep segments longer than
  ``minDuration``. The result is written to "<pwd>/../data/<outputFilename>.csv".

  Args:
    filePaths: Iterable of ``pathlib.Path`` objects for the label files. Each
      one needs a matching "<pwd>/../data/NrAudio/<stem>.wav".
    outputFilename: Name of the output CSV without its extension.
    minDuration: Segments must be strictly longer than this to be kept.

  Raises:
    ValueError: If ``HOP_LEN`` is not positive, which would make the sliding
      window loop never terminate.
  """
  columns = ['file', 'start time', 'end time', 'label']
  if HOP_LEN <= 0:
    raise ValueError(f'HOP_LEN must be positive, got {HOP_LEN}')

  dataDir = Path.cwd().parent.joinpath('data')
  frames = []
  for filePath in tqdm_notebook(filePaths):
    # Preprocess
    singleDF = pd.read_csv(filePath, sep='\t', names=['start time', 'end time', 'label'])
    # .str.replace strips spaces inside each label; Series.replace would only
    # swap values that are exactly ' '.
    singleDF['label'] = singleDF['label'].str.upper().str.replace(' ', '', regex=False)
    isSong = singleDF['label'].str.contains('-S+', regex=True, na=False)
    singleDF = singleDF[isSong].reset_index(drop=True)
    if singleDF.empty:
      continue
    singleDF['label'] = singleDF['label'].apply(lambda x: str(x).split('-')[0])

    # Segment signal interval
    segmentedDF = pd.DataFrame(segmentSignalLabel(singleDF), columns=['start time', 'end time', 'label'])
    # Only the duration is needed, so close the audio file right away.
    with sf.SoundFile(dataDir.joinpath('NrAudio', f'{filePath.stem}.wav')) as source:
      audioLength = source.frames / source.samplerate
    segmentedDF['end time'] = segmentedDF['end time'].clip(upper=audioLength)
    segmentedDF['file'] = Path('NrAudio', f'{filePath.stem}.wav')
    segmentedDF = segmentedDF[columns]

    # Sliding window: segments up to WIN_LEN are kept whole, longer ones are windowed
    spans = segmentedDF['end time'] - segmentedDF['start time']
    parts = [segmentedDF[spans <= WIN_LEN]]
    windows = []
    for _, x in segmentedDF[spans > WIN_LEN].iterrows():
      st, et = x['start time'], x['end time'] - WIN_LEN
      while st <= et:
        windows.append([x['file'], np.around(st, decimals=6), np.around(st + WIN_LEN, decimals=6), x['label']])
        st += HOP_LEN
    if windows:
      parts.append(pd.DataFrame(windows, columns=columns))
    fileDF = pd.concat(parts, ignore_index=True)

    # Filter: .copy() detaches the slice so the label assignment below is safe
    fileDF = fileDF[fileDF['end time'] - fileDF['start time'] > minDuration].copy()
    fileDF = fileDF.sort_values(by=['file', 'start time', 'end time'])
    fileDF['label'] = fileDF['label'].apply(lambda x: sorted(x))
    if not fileDF.empty:
      frames.append(fileDF)

  # Concatenate once instead of growing a DataFrame inside the loop
  df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

  # Save results
  df['label'] = df['label'].apply(lambda x: ','.join(x))
  df.to_csv(dataDir.joinpath(f'{outputFilename}.csv'), header=True, index=False)

In [7]:
# Label files: manually annotated recordings and the open-source collection.
labelRoot = Path.cwd().parent / 'data'
selfLabelPaths = sorted((labelRoot / 'Label').glob('*.txt'))
openLabelPaths = sorted((labelRoot / 'OpenSource').glob('*.txt'))

# One dataset CSV per label source, written in the same order as before.
for labelPaths, datasetName in (
  (selfLabelPaths, 'manual-dataset'),
  (openLabelPaths, 'opensource-dataset'),
):
  generateClassifierDataset(labelPaths, datasetName)

  0%|          | 0/1243 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

## Generating Auto-encoder dataset

以所有已降噪音訊製作 auto-encoder 資料集

1. 讀取音檔並取得其播放長度
2. 以滑移視窗的方式切割出資料集

In [8]:
def generateAEDataset(filePaths:list):
  """Build the auto-encoder dataset CSV from denoised audio files.

  Each file's duration is read from its header, window start times are obtained
  from ``SegmentWithSlidingWindow`` using ``WIN_LEN`` and ``HOP_LEN``, and one
  row ``(file, start time, end time)`` is produced per start time, with
  ``end time = start time + WIN_LEN``. The table is written to
  "<pwd>/../data/ae-dataset.csv".

  Args:
    filePaths: Iterable of ``pathlib.Path`` objects for the denoised audio files.
  """
  ae = []
  for filePath in tqdm_notebook(filePaths):
    # Close each file as soon as its duration is known; otherwise one handle
    # per input file stays open until garbage collection.
    with sf.SoundFile(filePath) as source:
      audioLength = source.frames / source.samplerate
    # NOTE(review): presumably returns the window start times - confirm in src.utils
    stList = SegmentWithSlidingWindow(
      length=audioLength,
      windowLength=WIN_LEN, hopLength=HOP_LEN
    )
    relativePath = Path('NrAudio', f'{filePath.stem}.wav')
    ae.extend([relativePath, st, st + WIN_LEN] for st in stList)

  df = pd.DataFrame(ae, columns=['file', 'start time', 'end time'])
  df.to_csv(Path.cwd().parent.joinpath('data', 'ae-dataset.csv'), header=True, index=False)

In [9]:
# Every denoised recording feeds the auto-encoder dataset.
nrAudioDir = Path.cwd().parent / 'data' / 'NrAudio'
nrAudioPaths = sorted(nrAudioDir.glob('*.wav'))
generateAEDataset(nrAudioPaths)

  0%|          | 0/19120 [00:00<?, ?it/s]

## Separating dataset for model

共 3 個資料集, 第一個為自動標記資料集, 下一個為人工標記資料集, 最後是開源資料集。  
目前將所有資料集丟入 Model 進行訓練, 唯開源資料集僅用於訓練, 不進入 Model 的驗證與測試

1. 決定切割比例
2. 將 label 以 onehot 模式表示
3. 切割資料集為 train, validate, test 三群 (Note: auto-encoder 無 test)
4. 儲存資料進暫時資料夾 "_pwd/../data/tmp/_"

In [10]:
# Split fractions for the classifier datasets.
classifierRatio = (
  0.8,  # train
  0.1,  # validate
  0.1,  # test
)
# Split fractions for the auto-encoder dataset (it has no test part).
autoencoderRatio = (
  0.9,  # train
  0.1,  # validate
)

In [11]:
def filterTarget(df:pd.DataFrame):
  """Keep rows that contain at least one target species and one-hot encode them.

  The comma-separated 'label' string of each row is reduced to the codes found
  in ``TARGET_SPECIES``. Rows left without any target code are dropped, as are
  rows with a missing value in any column. The remaining labels are encoded
  with ``OneHotEncoding``.

  Args:
    df: DataFrame with 'file', 'start time', 'end time' and 'label' columns.
      It is not modified; the function works on a copy.

  Returns:
    A new DataFrame with the columns 'file', 'start time', 'end time', 'onehot'.
  """
  def keepTargets(rawLabel):
    # An empty label field is read back from CSV as NaN (a float), not a string.
    if not isinstance(rawLabel, str):
      return []
    return [code for code in rawLabel.split(',') if code in TARGET_SPECIES]

  filtered = df.copy()
  filtered['label'] = filtered['label'].apply(keepTargets)
  # Rows without a target species get NA so that dropna() removes them below.
  filtered['onehot'] = filtered['label'].apply(
    lambda codes: OneHotEncoding(codes, TARGET_SPECIES) if codes else pd.NA
  )
  filtered = filtered.dropna()
  return filtered[['file', 'start time', 'end time', 'onehot']]

In [12]:
# Fixed seed so the shuffle, and therefore the split, is the same on every run.
CLASSIFIER_SPLIT_SEED = 42

def _shuffleAndSplit(df:pd.DataFrame, ratio:tuple, seed:int):
  """Shuffle ``df`` and cut it into (train, validate, test) parts by ``ratio``."""
  shuffled = df.sample(frac=1, random_state=seed)
  trainEnd = int(ratio[0] * len(shuffled))
  validateEnd = int((ratio[0] + ratio[1]) * len(shuffled))
  # Positional slicing replaces np.split, which relies on the deprecated
  # DataFrame.swapaxes when it is handed a DataFrame.
  return shuffled.iloc[:trainEnd], shuffled.iloc[trainEnd:validateEnd], shuffled.iloc[validateEnd:]

dataDir = Path.cwd().parent.joinpath('data')
tmpDir = dataDir.joinpath('tmp')
tmpDir.mkdir(parents=True, exist_ok=True)   # to_csv does not create missing directories

# NOTE(review): auto-dataset.csv is not written by any cell visible in this
# notebook - presumably produced by a separate step; confirm it exists first.
autoDF = filterTarget(pd.read_csv(dataDir.joinpath('auto-dataset.csv'), header=0))
selfDF = filterTarget(pd.read_csv(dataDir.joinpath('manual-dataset.csv'), header=0))
openDF = filterTarget(pd.read_csv(dataDir.joinpath('opensource-dataset.csv'), header=0))

# Split <train, validate, test> = <0.8 : 0.1 : 0.1>
#          |------------|--------------|----------|
#          0   train   0.8  validate  0.9  test  1.0
#          |     8      :      1       :     1    |
# 8 : 1 : 1 -> cut point 0.8 * total length, and 0.9 * total length

autoTrainDF, autoValidateDF, autoTestDF = _shuffleAndSplit(autoDF, classifierRatio, CLASSIFIER_SPLIT_SEED)
selfTrainDF, selfValidateDF, selfTestDF = _shuffleAndSplit(selfDF, classifierRatio, CLASSIFIER_SPLIT_SEED)

# The open-source data is used for training only, never for validation or test.
aecTrainDF = pd.concat([autoTrainDF, selfTrainDF, openDF], ignore_index=True)
aecValidateDF = pd.concat([autoValidateDF, selfValidateDF], ignore_index=True)
aecTestDF = pd.concat([autoTestDF, selfTestDF], ignore_index=True)

aecTrainDF.to_csv(tmpDir.joinpath('aec-train.csv'), header=True, index=False)
aecValidateDF.to_csv(tmpDir.joinpath('aec-validate.csv'), header=True, index=False)
aecTestDF.to_csv(tmpDir.joinpath('aec-test.csv'), header=True, index=False)

In [13]:
# Fixed seed so the train/validate split is the same on every run.
AE_SPLIT_SEED = 42

aeDataDir = Path.cwd().parent.joinpath('data')
aeTmpDir = aeDataDir.joinpath('tmp')
aeTmpDir.mkdir(parents=True, exist_ok=True)   # to_csv does not create missing directories

aeDF = pd.read_csv(aeDataDir.joinpath('ae-dataset.csv'), header=0)

# Shuffle, then cut by position; this avoids np.split, which relies on the
# deprecated DataFrame.swapaxes when it is handed a DataFrame.
aeShuffledDF = aeDF.sample(frac=1, random_state=AE_SPLIT_SEED)
aeTrainEnd = int(autoencoderRatio[0] * len(aeShuffledDF))
aeTrainDF, aeValidateDF = aeShuffledDF.iloc[:aeTrainEnd], aeShuffledDF.iloc[aeTrainEnd:]

aeTrainDF.to_csv(aeTmpDir.joinpath('ae-train.csv'), header=True, index=False)
aeValidateDF.to_csv(aeTmpDir.joinpath('ae-validate.csv'), header=True, index=False)