# Auto-encoder Classifier

In [None]:
import configparser
import numpy as np
import pandas as pd
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F

from datetime import datetime
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from sklearn.metrics import precision_score, recall_score, fbeta_score
from torch.utils.data import DataLoader
from tkinter import *
from tkinter.filedialog import askopenfilename
from tqdm.notebook import tqdm

from src.dataset import BirdsongDataset
from src.network import AutoEncoderClassifier
from src.utils import CalculateImbalanceWeight, GetSortedSpeciesCode

In [None]:
# Every project resource used below lives two levels above the working directory.
projectRoot = Path.cwd().parent.parent

# Parse the shared settings file `setting/config.ini`.
config = configparser.ConfigParser()
config.read(str(projectRoot / 'setting' / 'config.ini'))

# Training settings, all read from the [Model] section.
modelSection = config['Model']
EPOCHS = modelSection.getint('Epochs')
BATCH_SIZE = modelSection.getint('BatchSize')
LEARNING_RATE = modelSection.getfloat('LearningRate')
EARLY_STOP = modelSection.getint('EarlyStop')

# Fix the torch RNG seed, then select the configured CUDA device when one is
# available and fall back to the CPU otherwise.
torch.manual_seed(42)
useCuda = torch.cuda.is_available()
DEVICE = torch.device(f'cuda:{modelSection["Classifier_Device"]}' if useCuda else 'cpu')
if useCuda:
  torch.backends.cudnn.benchmark = True

# Species codes define the classifier outputs; the imbalance weight is derived
# from the training split using the 'ens' weight type.
TARGET_SPECIES = GetSortedSpeciesCode(projectRoot / 'setting' / 'SPECIES.csv')
IMBALANCE_WEIGHT = CalculateImbalanceWeight(
  projectRoot / 'data' / 'tmp' / 'aec-train.csv', weightType='ens'
)

## Training

In [None]:
# Input: pre-trained encoder weights. Output: classifier checkpoint stamped with today's date.
encoderWeightPath = Path.cwd().parent.parent.joinpath('model', 'AE20220706_encoder.pth')  # Manual change weight path
modelWeightPath = Path.cwd().parent.parent.joinpath('model', f'AEClassifier{datetime.now().strftime("%Y%m%d")}.pth')

# @AutoEncoderClassifier needs @numberOfClass as input
model = AutoEncoderClassifier(numberOfClass=len(TARGET_SPECIES)).to(DEVICE)
model.encoder.load_state_dict(torch.load(encoderWeightPath, map_location=DEVICE))
# Freeze the pre-trained encoder so that only the remaining parameters are updated
for param in model.encoder.parameters():
  param.requires_grad = False
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)
# Binary cross entropy computed directly on the logits: the sigmoid is fused
# into the loss, which is numerically more stable than sigmoid followed by BCE.
# NOTE(review): IMBALANCE_WEIGHT is passed positionally, so it is used as the
# `weight` argument, not `pos_weight` - confirm this is the intended weighting.
criterion = nn.BCEWithLogitsLoss(IMBALANCE_WEIGHT).to(DEVICE)

# Early-stopping state. `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
bestLoss = np.inf
earlyCount = 0

In [None]:
# Training split: augmented and reshuffled every epoch.
aecTrainDataloader = DataLoader(
  BirdsongDataset(Path.cwd().parent.parent.joinpath('data', 'tmp', 'aec-train.csv'), needAugment=True, needLabel=True),
  batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True
)
# Validation split: no augmentation and a fixed order. The validation loss is
# a mean of per-batch means, so a stable batch composition keeps it comparable
# from one epoch to the next (it drives early stopping and checkpointing).
aecValidateDataloader = DataLoader(
  BirdsongDataset(Path.cwd().parent.parent.joinpath('data', 'tmp', 'aec-validate.csv'), needAugment=False, needLabel=True),
  batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True
)

In [None]:
# Train for at most EPOCHS epochs. The weights with the lowest validation loss
# are saved to @modelWeightPath, and training stops once the validation loss
# has failed to improve for EARLY_STOP epochs in a row.
for epoch in tqdm(range(EPOCHS)):
  # Train
  model.train()
  trainingLoss = 0.0
  for inputs, labels in tqdm(aecTrainDataloader):
    inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    trainingLoss += loss.item()
  # Mean of the per-batch losses
  trainingLoss /= len(aecTrainDataloader)

  # Validate
  model.eval()
  validationLoss = 0.0
  with torch.no_grad():
    for inputs, labels in tqdm(aecValidateDataloader):
      inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      validationLoss += loss.item()
  validationLoss /= len(aecValidateDataloader)

  # Check loss: keep the best weights, otherwise count a non-improving epoch
  improved = validationLoss < bestLoss
  if improved:
    bestLoss = validationLoss
    earlyCount = 0
    torch.save(model.state_dict(), modelWeightPath)
  else:
    earlyCount += 1

  # Print results (done before the early-stop check so that the final epoch is reported too)
  print(f"""
    >> [{epoch + 1} / {EPOCHS}] ~~ ~~ AutoEncodeClassifer
    >> {"Best V Loss :":>16} {bestLoss} + [{earlyCount}]
    >> {"Current T Loss :":>16} {trainingLoss:6f}
    >> {"Current V Loss :":>16} {validationLoss:6f}
  """)

  # Early stop
  if not improved and earlyCount >= EARLY_STOP:
    break

## Testing

In [None]:
# Ask the user which trained weight file to evaluate. The hidden Tk root only
# exists to host the dialog and is destroyed even if the dialog raises.
root = Tk()
root.withdraw()
try:
  selectedWeightFile = askopenfilename(
    title='Choose The File Of Model Weight',
    initialdir=Path.cwd().parent.parent.joinpath('model')
  )
finally:
  root.destroy()

# The dialog returns an empty value when it is cancelled
if not selectedWeightFile:
  raise FileNotFoundError('No model weight file was selected.')
# Keep the selection as a Path: the report file name is later built from its `.stem`
modelWeightPath = Path(selectedWeightFile)

model = AutoEncoderClassifier(numberOfClass=len(TARGET_SPECIES)).to(DEVICE)
model.load_state_dict(torch.load(modelWeightPath, map_location=DEVICE))

In [None]:
# Test split: labelled, not augmented, and read in file order.
aecTestDataset = BirdsongDataset(
  Path.cwd().parent.parent.joinpath('data', 'tmp', 'aec-test.csv'),
  needAugment=False,
  needLabel=True,
)
aecTestDataloader = DataLoader(
  aecTestDataset,
  batch_size=BATCH_SIZE,
  shuffle=False,
  num_workers=4,
  pin_memory=True,
)

In [None]:
# Run the model over the test split, collecting one row of sigmoid scores
# (@predicts) and one row of ground-truth labels (@actuals) per sample.
predicts, actuals = [], []
model.eval()
with torch.no_grad():
  for inputs, labels in tqdm(aecTestDataloader):
    # `torch.sigmoid` is equivalent to `F.sigmoid`, which some PyTorch
    # releases flag with a deprecation warning.
    outputs = torch.sigmoid(model(inputs.to(DEVICE)))
    predicts.extend(outputs.cpu().numpy())
    # Labels are only stored here, so they are not copied to the device first
    actuals.extend(labels.cpu().numpy())

In [None]:
# Build a table of per-species metrics (rows: species x metric) for every
# decision threshold from 0.00 to 0.99 (columns), then export it transposed.
statisticDFIndex = pd.MultiIndex.from_product([TARGET_SPECIES, ['precision', 'recall', 'f0.5', 'f1', 'f2']])
thresList = np.around(np.arange(0, 1, 0.01), decimals=2)
statisticDF = pd.DataFrame(columns=thresList, index=statisticDFIndex)

# @predicts and @actuals are reshaped to 2-D arrays in which each column is one of our @TARGET_SPECIES.
# Both reshapes are independent of the threshold, so they are done once, outside the loop.
trueLabel = np.array(np.reshape(actuals, (-1, len(TARGET_SPECIES))), dtype=int)
predictScore = np.reshape(predicts, (-1, len(TARGET_SPECIES)))

# Results are rounded to four decimals
for thres in thresList:
  predictLabel = np.array(predictScore >= thres, dtype=int)
  for i, sp in enumerate(TARGET_SPECIES):
    yTrue, yPred = trueLabel[:, i], predictLabel[:, i]
    scores = {
      'precision': precision_score(y_pred=yPred, y_true=yTrue, zero_division=0),
      'recall': recall_score(y_pred=yPred, y_true=yTrue, zero_division=0),
      'f0.5': fbeta_score(y_pred=yPred, y_true=yTrue, zero_division=0, beta=0.5),
      'f1': fbeta_score(y_pred=yPred, y_true=yTrue, zero_division=0, beta=1),
      'f2': fbeta_score(y_pred=yPred, y_true=yTrue, zero_division=0, beta=2),
    }
    for metric, value in scores.items():
      statisticDF.loc[(sp, metric), thres] = np.round(value, decimals=4)

# If there is na in our results, replace it with 0
statisticDF.fillna(0, inplace=True)

# @modelWeightPath may be a plain string (the file dialog returns one), so it is
# wrapped in Path before its stem is used as the report file name.
reportDir = Path.cwd().parent.parent.joinpath('report', 'table')
reportDir.mkdir(parents=True, exist_ok=True)
statisticDF.T.to_csv(
  reportDir.joinpath(f'{Path(modelWeightPath).stem}.csv'), header=True, index=True
)