In [None]:
import configparser
import numpy as np
import pandas as pd
import soundfile as sf
import sys
import torch
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from scipy.signal import find_peaks
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

from src.dataset import BirdsongDataset
from src.network import AutoEncoderClassifier
from src.utils import GetSortedSpeciesCode

In [None]:
# Project settings (config.ini and SPECIES.csv) sit in <repo>/setting, two levels above the working directory.
settingDir = Path.cwd().parent.parent / 'setting'

config = configparser.ConfigParser()
config.read(str(settingDir / 'config.ini'))

# Window length and the hop derived from the configured overlap ratio.
# HOP_LEN is a float because the overlap is read with getfloat().
windowCfg = config['Window']
WIN_LEN = windowCfg.getint('Length')
HOP_LEN = WIN_LEN * (1 - windowCfg.getfloat('Overlap'))

TARGET_SPECIES = GetSortedSpeciesCode(settingDir / 'SPECIES.csv')
# Comma-separated thresholds kept as strings; they are indexed in TARGET_SPECIES order
# and converted with float() at the point of use.
THRESHOLD = config['Application']['Threshold'].split(',')

# Fixed seed, then pick the configured CUDA device when one is available (CPU otherwise).
torch.manual_seed(42)
DEVICE = torch.device('cpu')
if torch.cuda.is_available():
  DEVICE = torch.device(f'cuda:{config["Model"]["Classifier_Device"]}')
  torch.backends.cudnn.benchmark = True

In [None]:
def getProbabilityResults(weightPath:Path):
  """
    Run the classifier over data/tmp/oneMin-test.csv and return sigmoid probabilities.

    Args:
      weightPath: path of the saved state dict loaded into AutoEncoderClassifier.

    Returns:
      numpy array reshaped to (-1, len(TARGET_SPECIES)); rows follow the dataset
      order because the loader is not shuffled.
  """
  model = AutoEncoderClassifier(len(TARGET_SPECIES)).to(DEVICE)
  # NOTE(review): torch.load unpickles the file, so only load weights from a trusted source.
  model.load_state_dict(torch.load(weightPath, map_location=DEVICE))

  datasetCsv = Path.cwd().parent.parent.joinpath('data', 'tmp', 'oneMin-test.csv')
  allDataloader = DataLoader(
    # NOTE(review): meaning of the two False flags is defined by BirdsongDataset — confirm there.
    BirdsongDataset(datasetCsv, False, False),
    batch_size=4, shuffle=False, num_workers=4,
    # Pinned host memory only helps host-to-GPU copies; on CPU it just triggers a warning.
    pin_memory=(DEVICE.type == 'cuda')
  )

  predicts = []
  model.eval()
  with torch.no_grad():
    # The second element of each batch is not needed for inference.
    for inputs, _ in tqdm(allDataloader):
      inputs = inputs.to(DEVICE)
      # torch.sigmoid is the same function as F.sigmoid without the deprecation
      # warning that some PyTorch releases emit for the functional alias.
      outputs = torch.sigmoid(model(inputs))
      predicts.extend(outputs.cpu().numpy())
  return np.reshape(predicts, (-1, len(TARGET_SPECIES)))

In [None]:
def countFileLabels(filePaths, targetSpecies=None):
  """
    Count, for each manually annotated label file, the song labels of every target species.

    Each file is read as a headerless tab-separated table with columns
    (start, end, label). A label is upper-cased, stripped of spaces, kept only
    if it matches the '-S+' pattern, and reduced to the text before the first
    '-'; codes that are not in the target species list are ignored.

    Args:
      filePaths: iterable of pathlib.Path objects pointing at the label files.
      targetSpecies: species codes to count; defaults to the module-level
        TARGET_SPECIES when omitted.

    Returns:
      DataFrame indexed by 'file' (Path('NrAudio', '<stem>.wav')) with one
      integer column per species code; files without matching labels get zeros.
  """
  if targetSpecies is None:
    targetSpecies = TARGET_SPECIES
  speciesCodes = list(targetSpecies)

  records = []
  for filePath in filePaths:
    # Force the label column to text so .str works even when it is all-NaN or numeric.
    labelDF = pd.read_csv(
      filePath, sep='\t', names=['st', 'et', 'species'], dtype={'species': str}
    )
    # .str.replace removes spaces inside each label; plain Series.replace would
    # only touch values that are exactly ' ' and leave 'ABC -S' untouched.
    labels = labelDF['species'].str.upper().str.replace(' ', '', regex=False)
    labels = labels[labels.str.contains('-S+', regex=True, na=False)]
    codes = labels.str.split('-').str[0]
    counts = codes[codes.isin(speciesCodes)].value_counts()

    record = {'file': Path('NrAudio', f'{filePath.stem}.wav')}
    record.update({code: int(counts.get(code, 0)) for code in speciesCodes})
    records.append(record)

  # Build the frame once from records instead of growing it cell by cell.
  df = pd.DataFrame(records, columns=['file'] + speciesCodes)
  return df.set_index('file')

In [None]:
# Keep only the dataset rows whose recording has a manual label file, rewrite the
# 'file' column to NrAudio/<stem>.wav, and save the subset for the test DataLoader.
dataDir = Path.cwd().parent.parent.joinpath('data')
labelPaths = sorted(dataDir.joinpath('Label').glob('*.txt'))
labelStems = {p.stem for p in labelPaths}

# Read in chunks to bound peak memory while parsing the CSV.
chunk = pd.read_csv(dataDir.joinpath('ae-dataset.csv'), header=0, chunksize=100000)
aeDF = pd.concat(chunk)
aeDF['file'] = aeDF['file'].apply(lambda x: Path(x).stem)
# .copy() so the column assignment below acts on an independent frame, not a filtered slice.
aeDF = aeDF[aeDF['file'].isin(labelStems)].copy()
aeDF['file'] = aeDF['file'].apply(lambda x: Path('NrAudio', f'{x}.wav'))

# Create data/tmp if needed; to_csv does not create missing directories.
oneMinCsv = dataDir.joinpath('tmp', 'oneMin-test.csv')
oneMinCsv.parent.mkdir(parents=True, exist_ok=True)
aeDF.to_csv(oneMinCsv, header=True, index=False)

In [None]:
# Select model weight manually
weightPath = Path.cwd().parent.parent / 'model' / 'AEClassifier20220626.pth'
# One row per dataset sample, one column per species in TARGET_SPECIES order.
predicts = np.array(getProbabilityResults(weightPath)).reshape((-1, len(TARGET_SPECIES)))

In [None]:
# Probabilities keyed by audio file, so each file's rows can be scanned for peaks separately.
probabilityDF = pd.DataFrame(predicts, index=aeDF['file'], columns=TARGET_SPECIES)
groupData = probabilityDF.groupby(by='file')

# Predicted count per file and species = number of probability peaks at or above
# that species' threshold.
predictsDF = pd.DataFrame(0, index=aeDF['file'].unique(), columns=TARGET_SPECIES)
for audioFile, fileProbs in tqdm(groupData):
  for spIdx, speciesCode in enumerate(TARGET_SPECIES):
    minHeight = float(THRESHOLD[spIdx])
    peakIdx = find_peaks(fileProbs[speciesCode], height=minHeight)[0]
    predictsDF.loc[audioFile, speciesCode] = len(peakIdx)

# Ground-truth counts from the manual label files.
actualsDF = countFileLabels(labelPaths)

In [None]:
# Pair actual and predicted counts by file key rather than by row position:
# actualsDF follows the sorted label files, while predictsDF follows the order in
# which files appear in the dataset and may lack files that have no dataset rows.
# Files without any prediction rows are reported with a predicted count of 0.
predictsAligned = predictsDF.reindex(index=actualsDF.index, columns=TARGET_SPECIES, fill_value=0)

count = []
for i in range(len(labelPaths)):
  actualRow = actualsDF.iloc[i, :].to_list()
  predictRow = predictsAligned.iloc[i, :].to_list()
  # Interleave as actual, predict, actual, predict, ... to match the column MultiIndex below.
  count.append([value for pair in zip(actualRow, predictRow) for value in pair])

countDFCol = pd.MultiIndex.from_product([TARGET_SPECIES, ['actual', 'predict']])
countDF = pd.DataFrame.from_records(count, index=[p.stem for p in labelPaths], columns=countDFCol)

# Create report/table if needed; to_csv does not create missing directories.
songCountCsv = Path.cwd().parent.parent.joinpath('report', 'table', 'songCount.csv')
songCountCsv.parent.mkdir(parents=True, exist_ok=True)
countDF.to_csv(songCountCsv, header=True, index=True)