In [1]:
%load_ext autoreload
%autoreload 2

from abc import ABC
import warnings
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from signal_dataset import SignalDataset
from sklearn.metrics import classification_report
import re
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y
import yaml
import re
from pathlib import Path
import abc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

import yaml
from typing import Callable
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import trange
import seaborn as sns
import networks
from signal_dataset import SignalDataset


In [2]:
sr = 1562500
signal_data_dir = "/home/petr/Documents/Motor_project/AE_PETR_motor/"
bin_setup = [{"label": i.stem, "channels": len(list(i.glob('*.bin'))), "interval": [0, 15 * sr], "bin_path": list(i.glob('*.bin'))[0]} for i in
             Path(signal_data_dir).glob('WUP*') if re.search(r'\d$', i.stem)]

sd = SignalDataset(step=1000, window_size=1000, bin_setup=bin_setup, device="cpu", source_dtype="float32")

In [3]:
train_data, test_data = random_split(sd, [0.8, 0.2])

In [4]:

DEVICE = "cuda"


class SignalModel:
    def __init__(self, config_path: Path):
        self.config_path = config_path
        self._load_config()
        self._model = self.init_model()
        self._transform = self.init_transform()
        self._load_config()
        
    @abc.abstractmethod
    def _load_config(self):
        pass
    
    @abc.abstractmethod
    def init_model(self):
        pass

    @abc.abstractmethod
    def train(self, train_dataset: Dataset, test_dataset: Dataset) -> None:
        pass

    @abc.abstractmethod                   # raw outputs, targets
    def _evaluate(self, dataset: Dataset) -> (np.ndarray, np.ndarray):
        pass
    
    @abc.abstractmethod
    def predict(self, x: np.ndarray) -> np.ndarray:
        """
        Predicts the label of one or multiple signals.
        """
        pass
    
    @abc.abstractmethod
    def save(self, path: Path):
        pass
    
    def init_transform(self) -> Callable:

        def transform(x: np.ndarray) -> np.ndarray:
            return x

        return transform
    
    # def _evaluate(self, x: np.ndarray) -> np.ndarray:
    #     x = self._transform(x)
    #     return self.inference(x)

    def plot_confusion_matrix(self, x_input: np.ndarray, y_true: np.ndarray):
        y_pred = self._model.predict(x_input)
        class_num = 9  # in future will not be hard coded
        cm = confusion_matrix(y_true, y_pred, labels=np.arange(class_num))
        plt.figure(figsize=(10, 10))
        plt.imshow(cm, cmap='Greens')
        for i in range(class_num):
            for j in range(class_num):
                plt.text(j, i, cm[i, j], ha="center", va="bottom", color='gray')
                plt.text(j, i, str(j), ha="center", va="top", color='gray')

    def accuracy(self, x_input: np.ndarray, y_true: np.ndarray) -> float:
        y_pred = self._model.predict(x_input)
        return accuracy_score(y_true, y_pred)
    
    def precision_and_recall(self, x_input: np.ndarray, y_true: np.ndarray) -> (float, float):
        y_pred = self._model.predict(x_input)
        cr = classification_report(y_true, y_pred, zero_division=0, output_dict=True)
        return cr["macro avg"]["precision"], cr["macro avg"]["recall"]
    
    def classification_report(self, x_input: np.ndarray, y_true: np.ndarray) -> dict:
        y_pred = self._model.predict(x_input)
        return classification_report(y_true, y_pred, zero_division=0, output_dict=True)

In [5]:
# x_test = np.asarray([data[0] for data in dataloader])
# y_test = np.asarray([data[1] for data in dataloader])
class SklearnModel(SignalModel, ABC):
    def __init__(self, config_path: Path):
        super().__init__(config_path)
        self.config_path = config_path
        self._model = self.init_model()
    def _load_config(self):
        with self.config_path.open(mode="r") as yaml_file:
            self.config = yaml.load(yaml_file, Loader=yaml.SafeLoader)
            self.model_config = self.config["model"]
    def init_model(self):
        match self.model_config["type"]:
            case "DummyClassifier":
                return DummyClassifier()
            case "RandomForestClassifier":
                return RandomForestClassifier()
    def train(self, train_dataset: Dataset, test_dataset: Dataset) -> None:
        x_train = np.asarray([data[0] for data in train_dataset])
        y_train = np.asarray([data[1] for data in train_dataset])
        self._model.fit(x_train, y_train)    
        
    def predict(self, x: np.ndarray) -> np.ndarray:
        return self._model.predict(x)
    
    def _evaluate(self, dataset: Dataset) -> np.ndarray:
        x = np.asarray([data[0] for data in dataset])        
        return self.predict(x)
        

In [6]:
class NeuroNet(SignalModel, ABC):

    def __init__(self, config_path: Path, tensorboard: bool = False):
        super().__init__(config_path)
        self.config_path = config_path
        self.tensorboard = tensorboard

        self.criterion = nn.CrossEntropyLoss()
        self.writer = SummaryWriter(
            comment=f"_{config_path.stem}_{self.config['eval_params']['batch_size']}")
        
        self.train_loss = []
        self.val_loss = []
        self.val_accuracy = []
        self.total_batch_id = 1
        self.epoch_trained = 0
        self.train_set: bool
        self.pretrained = False 
        
    def _load_config(self) -> None:
        with self.config_path.open(mode="r") as yaml_file:
            self.config = yaml.load(yaml_file, Loader=yaml.SafeLoader)
            self.model_config = self.config["model"]

        if isinstance(self.model_config["kwargs"]["layers"], list):
            self.layers_configs = []
            for layer_config in self.model_config["kwargs"]["layers"]:
                self.layers_configs.append(layer_config)
        else:
            self.layers_configs = {}
            for name, kwargs in self.model_config["kwargs"]["layers"].items():
                self.layers_configs[name] = kwargs

    def init_model(self):
        # TODO: make in_channels as parameter
        match self.model_config["type"]:
            case "MLP":
                return networks.MLP(self.layers_configs)
            case "Inception time" | "Inception" | "Inception_time":
                return networks.InceptionTime(self.layers_configs)
            case "LSTM" | "GRU":
                # return networks.RNN(self.layers_configs)
                return networks.RNN(self.layers_configs, attention=self.model_config["attention"])
            case "CNN":
                return networks.CNNOld()
            case "LSTM-FCN" | "lstm_fcn":
                return networks.RnnFcn(self.layers_configs)

    def train(self, train_dataset: Dataset, test_dataset: Dataset) -> None:

        self._load_config()
        self._model.to(DEVICE)

        optimizer = optim.AdamW(self._model.parameters(), self.config["lr"])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=self.config["training_params"][
                                                                   "epoch_num"])

        train_dataloader = DataLoader(train_dataset,
                                      **self.config["training_params"].get("dataloader_params", {}))
        test_dataloader = DataLoader(test_dataset, **self.config["eval_params"])

        epochs = trange(self.config["training_params"]["epoch_num"], ncols=100)  # , desc='Epoch #', leave=True)

        for epoch in epochs:
            for (inputs, targets) in train_dataloader:
                self._model.train()
                inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
                optimizer.zero_grad()
                outputs = self._model(inputs)
                loss = self.criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                self.train_loss.append(loss)
                self.writer.add_scalar(tag='Loss/train', scalar_value=loss, global_step=self.total_batch_id)
                
                if self.tensorboard:
                    self.train_set = False
                    self.calculate_metrics(test_dataset)
                    self.train_set = True
                    self.calculate_metrics(train_dataset)

                epochs.set_description(f"Epoch #{self.epoch_trained + 1}")
                self.total_batch_id += 1
                
            last_lr = scheduler.get_last_lr()[0]
            self.writer.add_scalar(tag='learning rate', scalar_value=last_lr, global_step=self.epoch_trained)
            self.epoch_trained += 1
            scheduler.step()

            # epochs.refresh()
            #
            # self.eval_model(testing_data, writer, )

            self.pretrained = True
            if self.tensorboard:
                warnings.warn("Tensorboard summary writer is not closed")
            
    def predict(self, x: np.ndarray) -> np.ndarray:
        x = torch.from_numpy(x).to(DEVICE)
        if x.ndim == 1:
            x = torch.reshape(x, (1, -1))
        # self._model.to("cpu")
        output = self._model(x)
        prediction = torch.argmax(output, dim=1)
        return prediction.cpu().numpy()

    def _evaluate(self, dataset: Dataset) -> (np.ndarray, np.ndarray):
        self._model.eval()
        outputs = []
        targets = []
        if self.train_set:
            dataloader = DataLoader(dataset, **self.config["training_params"].get("dataloader_params", {}))
        else:
            dataloader = DataLoader(dataset, **self.config["eval_params"])
        
        with torch.no_grad():
            for i, (input, target) in enumerate(dataloader):
                input, target = input.to(DEVICE), target.to(DEVICE)
                output = self._model(input)
                outputs.extend(output.cpu().numpy())  # appends list of predictions for one batch to predictions over selected set
                targets.extend(target.cpu().numpy())
        # shapes: (num_of_examples, 9), (num_of_examples,)
        return np.asarray(outputs), np.asarray(targets)

    def calculate_metrics(self, dataset: Dataset) -> None:
        if (self.total_batch_id % self.config["metrics"]["confusion_matrix"] == 0 or 
            self.total_batch_id % self.config["metrics"]["classification_report"] == 0 or
            self.total_batch_id % self.config["metrics"]["validation_loss"] == 0):
            
            class_num = 9
            tag = "train" if self.train_set else "val"
            outputs, targets = self._evaluate(dataset)
            predictions = np.argmax(outputs, axis=1)  # makes the correct predictions
            
            if not self.train_set and self.total_batch_id % self.config["tensorboard_params"]["validation_loss"] == 0:
                val_loss = self.criterion(torch.tensor(outputs),torch.tensor(targets))
                self.writer.add_scalar(tag=f'Loss/{tag}', scalar_value=val_loss, global_step=self.total_batch_id)

            if self.total_batch_id % self.config["metrics"]["classification_report"] == 0:
                cr = classification_report(targets, predictions, labels=np.arange(class_num), output_dict=True, zero_division=0)
                
                self.writer.add_scalar(tag=f'Accuracy/{tag}', 
                                       scalar_value=cr["accuracy"], global_step=self.total_batch_id)
                self.writer.add_scalar(tag=f'Precision/{tag}', 
                                       scalar_value=cr["macro avg"]["precision"], global_step=self.total_batch_id)
                self.writer.add_scalar(tag=f'Recall/{tag}', 
                                       scalar_value=cr["macro avg"]['recall'], global_step=self.total_batch_id)
                self.writer.add_scalar(tag=f'F1-score/{tag}', 
                                       scalar_value=cr["macro avg"]["f1-score"], global_step=self.total_batch_id)
                
    
            if self.total_batch_id % self.config["tensorboard_params"]["confusion_matrix"] == 0:
                plt.figure(figsize=(12, 7))
                cm = confusion_matrix(targets, predictions, labels=np.arange(class_num))
                df_cm = pd.DataFrame(cm / np.sum(cm, axis=1)[:, None], index=[i for i in range(class_num)],
                                     columns=[i for i in range(class_num)])
                self.writer.add_figure(tag=f"Confusion matrix/{tag}",
                                       figure=sns.heatmap(df_cm, annot=True, fmt=".1f").get_figure(),
                                       global_step=self.total_batch_id)
            
    def save(self, path: str) -> None:
        torch.save(self._model, Path(path))

    def close_writer(self):
        if self.tensorboard:
            self.writer.close()
            print("Tensorboard summary writer is closed")
        else: print("No Tensorboard summary writer found")

In [15]:

# dummy_classifier = SklearnModel(Path("non_dl_yaml_configs/dummy_classifier.yaml"))
neuro_net = NeuroNet(Path("nn_yaml_configs/LSTM-FCN.yaml"))


In [16]:
neuro_net.train(train_data, test_data)

Epoch #1: 100%|███████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.98s/it]


In [32]:
neuro_net.close_writer()

No Tensorboard summary writer found


In [17]:
dummy_classifier = SklearnModel(Path("non_dl_yaml_configs/dummy_classifier.yaml"))

In [7]:
x_test = np.asarray([data[0] for data in test_data]) 
y_test = np.asarray([data[1] for data in test_data]) 

In [8]:
x_test.shape

(42186, 1000)

In [19]:

dummy_classifier.train(train_data, test_data)



In [20]:
models = [dummy_classifier, neuro_net]

In [21]:
for model in models:
    y_pred = model.predict(x_test)
    print(f"{model}: {model.accuracy(y_test, y_pred)}")

<__main__.SklearnModel object at 0x7c9c57fa9650>: 1.0


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.01 GiB. GPU 0 has a total capacty of 7.79 GiB of which 409.62 MiB is free. Including non-PyTorch memory, this process has 7.37 GiB memory in use. Of the allocated memory 6.47 GiB is allocated by PyTorch, and 706.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [19]:
cr = classification_report(y_test, y_pred, zero_division=0, output_dict=True)

In [15]:
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4704
           1       0.00      0.00      0.00      4756
           2       0.00      0.00      0.00      4708
           3       0.11      1.00      0.20      4590
           4       0.00      0.00      0.00      4737
           5       0.00      0.00      0.00      4616
           6       0.00      0.00      0.00      4663
           7       0.00      0.00      0.00      4657
           8       0.00      0.00      0.00      4755

    accuracy                           0.11     42186
   macro avg       0.01      0.11      0.02     42186
weighted avg       0.01      0.11      0.02     42186



In [16]:
cr["accuracy"]

0.10880386858199402

In [23]:
type(cr["macro avg"]["precision"])

float