In [None]:
import sys
# Delete all global variables when re-running the notebook.
# `this` is the module object of the current namespace, so `delattr(this, name)`
# removes the global called `name`.
this = sys.modules[__name__] # type: ignore
# dir() is evaluated once, before the loop starts, so deleting names while
# iterating is safe. Note that the list includes `sys` itself (imported above),
# which is therefore deleted here as well.
for n in dir():
    # Keep the module handle and the Drive-mount flag across re-runs.
    if n in ['this', 'was_mounted']: continue
    # Names starting with an underscore are left untouched.
    if n[0]!='_': delattr(this, n)


# Keep the value of `was_mounted` from a previous run of this cell, or start
# with False on a fresh kernel. Only NameError (the name does not exist yet)
# is expected here; anything else should surface instead of being swallowed.
try:
    was_mounted = was_mounted
except NameError:
    was_mounted = False


import os
import subprocess
import sys  # re-imported locally: the namespace reset at the top of this cell removes `sys`

# A non-empty COLAB_RELEASE_TAG environment variable is treated as "running on Colab".
is_running_on_colab = bool(os.getenv("COLAB_RELEASE_TAG"))

if is_running_on_colab:
    packages_to_install = ['pandas==2.1.3', 'neurokit2', 'wfdb']

    for package in packages_to_install:
        # Install through the kernel's own interpreter so the package lands in
        # the environment this notebook imports from. Installation stays
        # best-effort (a failure does not abort the cell) but is now reported.
        pip_result = subprocess.run(
            [sys.executable, '-m', 'pip', 'install', package], check=False)
        if pip_result.returncode != 0:
            print(f'WARNING: pip install {package} exited with code {pip_result.returncode}')

    from google.colab import drive, files

    drive_mount_point = '/content/gdrive'
    # Absolute path: this block changes the working directory below, so a path
    # relative to the cwd would no longer resolve when the cell is run again
    # and os.path.samefile would raise FileNotFoundError.
    code_directory = os.path.join(drive_mount_point, 'MyDrive/TCC/ectopic_beats_detection')

    # Mount Google Drive only once per kernel session.
    if not was_mounted:
        drive.mount(drive_mount_point)
    was_mounted = True

    # Work from the project directory so the local modules can be imported.
    if not os.path.samefile(os.getcwd(), code_directory):
        os.chdir(code_directory)

from utils import create_compare_df, create_dict_results, plot_results, calculate_metrics, resolve_relative_path
from globals import *
import sys
import neurokit2 as nk
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pyarrow as pa
from os.path import join
import pyarrow as pa
from typing import Any, Dict, Iterable, List, Tuple, Union
from numpy import typing as npt
from utils import Processor, Processors, load_df_multi_analysis, load_record, apply_processors, correct_peaks
from datetime import datetime
import pickle
from multiprocessing import Pool, cpu_count
from timeit import default_timer as timer
import glob

# Display configuration: never truncate columns, show at most 40 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 40

# Parquet tables stored under `dataframes_directory`
# (presumably provided by the star import from `globals` - verify).
df_record_lead_ann = pd.read_parquet(
    join(dataframes_directory, 'df_record_lead_ann.parquet')
)
df_lead_ann_summery = pd.read_parquet(
    join(dataframes_directory, 'df_lead_ann_summery.parquet')
)
df_ann_summery = pd.read_parquet(
    join(dataframes_directory, 'df_ann_summery.parquet')
)
df_code_description = pd.read_parquet(
    join(dataframes_directory, 'df_code_description.parquet')
)

# Every pickle whose name starts with 'dict_multi_analysis' is handed to the
# loader; this pattern also matches the '..._interrupted_...' files.
df_multi_analysis = load_df_multi_analysis(
    glob.glob(join(dataframes_directory, 'dict_multi_analysis*.pickle'))
)

In [None]:
# Show rows whose (record_num, processor, method) combination already appeared
# in an earlier row of the table.
df_multi_analysis.loc[
    df_multi_analysis.duplicated(subset=['record_num', 'processor', 'method'])
]

In [None]:
# The five records with the fewest result rows.
(
    df_multi_analysis
    .groupby(['record_num'])['record_num']
    .count()
    .sort_values()
    .head(5)
)

# Multiple record analysis

In [None]:
# Run every peak-detection method on a fixed time window of each record that
# has an 'MLII' channel, once per pre-processor, and collect the metrics in
# `dict_multi_analysis` (record -> processor name -> metrics per method).
# Combinations already present in `df_multi_analysis` are skipped, and the new
# results are pickled at the end (or when the run is interrupted).

# ---- Run configuration ------------------------------------------------------
was_interrupted = False
total_time = 300 # seconds

offset = 500 # seconds
discard_start_sec = discard_end_sec = 2

methods = ['neurokit', 'pantompkins1985', 'hamilton2002', 'martinez2004', 'christov2004',
               'gamboa2008', 'elgendi2010', 'engzeemod2012', 'kalidas2017', 'rodrigues2020']

# Annotation symbols that are kept (the desired annotations).
derised_anns = LIST_BEATS_1
list_processors = [
    Processor(None),
    Processor('detrend', method = 'polynomial', order = 0),
    Processor('detrend', method = 'polynomial', order = 1),
]

dict_multi_analysis = {}
print(f'Total time = {total_time} seconds')

# Timestamp used in the name of the output pickle.
time_str = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# Number of (record, processor) combinations actually computed in this run.
count_processed = 0
try:
    for idx, row in df_record_lead_ann.iterrows():
        # Pick the channel named 'MLII'; records without one are skipped.
        if row['upper_signal'] == 'MLII':
            signal_track = 0
        elif row['lower_signal'] == 'MLII':
            signal_track = 1
        else:
            continue

        record_num = row['record']

        # Start measuring time
        start_time = timer()

        print(f'Processing record {record_num}')

        # Load record
        record, ann = load_record(record_num)
        fs = int(record.fs)  # type: ignore

        # Analysis window expressed in samples: [start_samples, end_samples)
        samples = int(total_time * fs)
        start_samples = int(offset * fs)
        end_samples = start_samples + samples

        dict_multi_analysis[record_num] = {}
        for processor in list_processors:
            processor_name = processor.processor_name
            print(f'Processor =  {processor_name}')

            # Methods already stored for this exact record / channel / window / processor.
            methods_done = df_multi_analysis[
                (df_multi_analysis.record_num == record_num)
                & (df_multi_analysis.signal_track == signal_track)
                & (df_multi_analysis.start_samples == start_samples)
                & (df_multi_analysis.end_samples == end_samples)
                & (df_multi_analysis.processor == processor_name)
            ].method.to_list()

            methods_missing = [method for method in methods if method not in methods_done]

            if len(methods_missing) == 0:
                print('All methods already processed')
                continue

            # ECG signal
            ecg = record.p_signal[:, signal_track][start_samples:end_samples]  # type: ignore

            if processor.processor_name:
                ecg = apply_processors(ecg, processor)

            # Index the signal by absolute sample number so that it lines up
            # with the annotation sample positions.
            ecg = pd.Series(ecg, dtype=ECG_TYPE)
            ecg.index += start_samples
            ann_beat_indexes = pd.Series(ann.sample, dtype=INDEX_TYPE)
            ann_beat_symbols = pd.Series(ann.symbol, dtype=ANN_TYPE)

            # Mask for time window and desired annotations
            mask_derised_ann = ann_beat_symbols.isin(derised_anns)

            # We are only interested in samples in the time window
            mask_time_window = (ann_beat_indexes >= start_samples) & (
                ann_beat_indexes < end_samples)
            mask_used_ann = mask_time_window & mask_derised_ann

            # Apply mask
            ann_beat_indexes = ann_beat_indexes[mask_used_ann].reset_index(drop=True)
            ann_beat_symbols = ann_beat_symbols[mask_used_ann].reset_index(drop=True)

            df_beats = correct_peaks(ecg, ann_beat_indexes, fs)

            # Attach the annotation symbol to each beat (exactly one per peak index).
            df_beats = df_beats.rename(columns={'index': 'peak_index', 'local_max': 'cor_peak_index'}).merge(
                pd.DataFrame({'peak_index': ann_beat_indexes, 'symbol': ann_beat_symbols}), on='peak_index', how='left', validate='one_to_one')

            # If the peak is not corrected, use the original peak index
            df_beats.loc[df_beats.cor_peak_index.isna(
            ), 'cor_peak_index'] = df_beats.peak_index

            # Bounds of the part of the window that is kept for the metrics.
            first_used_sample = start_samples + discard_start_sec * fs
            last_used_sample = end_samples - discard_end_sec * fs

            # One task per missing method. The context manager shuts the worker
            # processes down once the results are in, so pools no longer pile up
            # across (record, processor) iterations.
            with Pool(cpu_count()) as pool:
                list_results = pool.starmap(
                    create_dict_results,
                    [
                        (ecg, [method], start_samples, first_used_sample, last_used_sample,
                         fs, discard_start_sec, discard_end_sec)
                        for method in methods_missing
                    ],
                )

            # Merge the per-method result dicts into a single dict.
            dict_results = {}
            for result in list_results:
                dict_results.update(result)

            # Now the operations are performed on the time window, we can discard the first and last n seconds
            ecg = ecg.loc[first_used_sample:last_used_sample]
            df_beats = df_beats[(df_beats.peak_index >= first_used_sample) & (df_beats.peak_index <= last_used_sample)].reset_index(drop = True)

            df_comp_methods = create_compare_df(df_beats, dict_results)

            dict_metrics = calculate_metrics(df_comp_methods, methods_missing)

            # Record which channel and window the metrics belong to; these are
            # the fields the "already processed" filter above matches on.
            for method in dict_metrics.keys():
                dict_metrics[method]['signal_track'] = signal_track
                dict_metrics[method]['start_samples'] = start_samples
                dict_metrics[method]['end_samples'] = end_samples

            dict_multi_analysis[record_num][processor_name] = dict_metrics
            count_processed += 1
        end_time = timer()
        print(f'Time elapsed = {end_time - start_time} seconds')
except KeyboardInterrupt:
    # Keep whatever was computed before the interruption.
    if count_processed > 0:
        with open(join(dataframes_directory, f'dict_multi_analysis_interrupted_{time_str}.pickle'), 'wb') as handle:
            pickle.dump(dict_multi_analysis, handle, protocol=pickle.HIGHEST_PROTOCOL)
        was_interrupted = True

# save results as pickle
if not was_interrupted and count_processed > 0:
    with open(join(dataframes_directory, f'dict_multi_analysis_{time_str}.pickle'), 'wb') as handle:
        pickle.dump(dict_multi_analysis, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Plot the detection results for the window left behind by the analysis loop:
# `dict_results`, `df_beats` and `ecg` are the values from the last
# (record, processor) combination that was actually computed there.
x_xis_factor = 1

# Symbols that are not in LIST_BEATS_1 get small black markers,
# the LIST_BEATS_1 symbols get larger red ones.
other_symbols = df_code_description.symbol[
    ~df_code_description.symbol.isin(LIST_BEATS_1)
]
small_black_markers = {"mode": "markers", "marker": {"size": 4, "color": "black"}}
large_red_markers = {"mode": "markers", "marker": {"size": 9, "color": "red"}}

fig = plot_results(
    dict_results,
    df_beats,
    ecg,
    [
        (other_symbols, small_black_markers),
        (LIST_BEATS_1, large_red_markers),
    ],
    x_xis_factor=x_xis_factor,
)

# Define x zoom
fig.update_xaxes(range=[271700, 272300])

fig.show()


In [None]:
# Mean, minimum and standard deviation of accuracy per method,
# ordered by minimum accuracy, highest first.
(
    df_multi_analysis
    .groupby(['method'])[['accuracy']]
    .agg(['mean', 'min', 'std'])
    .sort_values(by=('accuracy', 'min'), ascending=False)
)

In [None]:
# Accuracy statistics per method using only the rows whose processor is the
# string 'None' ...
accuracy_stats_no_processor = (
    df_multi_analysis[df_multi_analysis.processor == 'None']
    .groupby(['method'])[['accuracy']]
    .agg(['mean', 'min', 'std'])
    .sort_values(by=('accuracy', 'min'), ascending=False)
)

# ... and the same statistics over all rows.
accuracy_stats_all = (
    df_multi_analysis
    .groupby(['method'])[['accuracy']]
    .agg(['mean', 'min', 'std'])
    .sort_values(by=('accuracy', 'min'), ascending=False)
)

# Relative difference of the 'None' statistics with respect to the overall ones
# (the division aligns the two tables on the method index).
accuracy_stats_no_processor / accuracy_stats_all - 1