In [None]:
from utils import *
from glob import glob

Source: https://archive.physionet.org/physiobank/database/html/mitdbdir/

# Selection criteria
The source of the ECGs included in the MIT-BIH Arrhythmia Database is a set of over 4000 long-term Holter recordings that were obtained by the Beth Israel Hospital Arrhythmia Laboratory between 1975 and 1979. Approximately 60% of these recordings were obtained from inpatients. The database contains 23 records (numbered from 100 to 124 inclusive with some numbers missing) chosen at random from this set, and 25 records (numbered from 200 to 234 inclusive, again with some numbers missing) selected from the same set to include a variety of rare but clinically important phenomena that would not be well-represented by a small random sample of Holter recordings. Each of the 48 records is slightly over 30 minutes long.

The first group is intended to serve as a representative sample of the variety of waveforms and artifact that an arrhythmia detector might encounter in routine clinical use. A table of random numbers was used to select tapes, and then to select half-hour segments of them. Segments selected in this way were excluded only if neither of the two ECG signals was of adequate quality for analysis by human experts.

Records in the second group were chosen to include complex ventricular, junctional, and supraventricular arrhythmias and conduction abnormalities. Several of these records were selected because features of the rhythm, QRS morphology variation, or signal quality may be expected to present significant difficulty to arrhythmia detectors; these records have gained considerable notoriety among database users.

The subjects were 25 men aged 32 to 89 years, and 22 women aged 23 to 89 years. (Records 201 and 202 came from the same male subject.)


# Data Exploration

In [None]:
# Load data from MIT-BIH Arrhythmia Database
# https://physionet.org/content/mitdb/1.0.0/
#
# For every record found in `mb_artm_directory` this cell collects the signal
# names of the record and the number of annotations per symbol, then builds:
#   - df_record_lead:         one row per record with its two signal names
#   - df_record_lead_summery: number of records per group / lead configuration
#   - df_ann:                 one row per record with per-symbol counts

dict_signals = {}
list_annotations = []
list_symbols = []


# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# the row order of the dataframes (and the column order of df_ann, which
# follows first appearance of each symbol) reproducible across machines.
files = sorted(glob(f'{mb_artm_directory}*.dat'))
for file in files:
    # Strip only the trailing extension. str.replace(".dat", "") would also
    # remove any other ".dat" occurring earlier in the path.
    record_path = file[:-len('.dat')]
    record = wfdb.rdrecord(record_path)
    # NOTE(review): the dataframe below assumes every record has exactly two
    # signals (upper and lower) -- confirm for the whole database.
    dict_signals[record.record_name] = record.sig_name
    ann = wfdb.rdann(record_path, 'atr')
    dict_symbols_count = pd.Series(ann.symbol).value_counts().to_dict()
    list_symbols.extend(dict_symbols_count)
    list_annotations.append(dict_symbols_count)
columns = ['upper_signal', 'lower_signal']

# Dataframe with lead configurations for upper and lower signals for each record
df_record_lead = (
    pd.DataFrame(dict_signals, index=columns)
    .T
    .reset_index()
    .rename(columns={'index': 'record'})
)
df_record_lead['record'] = df_record_lead['record'].astype(np.int32)
# Records numbered below 200 are labelled 'random', the others 'selected'
df_record_lead['group'] = 'random'
df_record_lead.loc[df_record_lead['record'] >= 200, 'group'] = 'selected'


# Dataframe with the number of records for each group and lead configuration
df_record_lead_summery = (
    df_record_lead
    .groupby(['group', 'upper_signal', 'lower_signal'])
    .count()
    .reset_index()
    .sort_values('record', ascending=False, ignore_index=True)
)

# Dataframe with the number of annotations for each record
# (symbols that never occur in a record are counted as 0)
df_ann = pd.DataFrame(list_annotations).fillna(0)
df_ann = df_ann.astype(int)
# Attach the record ids by position: both frames were filled in the same loop
# order, and a length mismatch raises instead of silently producing NaN.
df_ann.insert(0, 'record', df_record_lead['record'].to_numpy())

# Display first values of dataframes
print("df_record_lead")
display(df_record_lead.head())

print("df_record_lead_summery")
display(df_record_lead_summery)

print("df_ann")
display(df_ann.head())

In [None]:
# Distinct annotation symbols collected in list_symbols (duplicates removed)
set(list_symbols)

In [None]:
# Dataframe with description of each annotation code.
# The first two HTML tables of the PhysioBank annotations page are stacked,
# reduced to the 'Code' and 'Description' columns, and rows with missing
# values are dropped before the index is renumbered.
# NOTE(review): presumably these two tables are the beat and non-beat
# annotation tables -- verify against the page.
df_code_description = (
    pd.concat(
        pd.read_html("https://archive.physionet.org/physiobank/annotations.shtml")[:2]
    )[['Code', 'Description']]
    .dropna()
    .reset_index(drop=True)
)

# Display first values of dataframe
print("df_code_description")
df_code_description.head()

In [None]:
# Keep the codes of the first 19 rows of df_code_description.
# NOTE(review): presumably these 19 rows are the beat annotation codes --
# confirm against the annotation table.
series_beat_codes = df_code_description['Code'].head(19)
print(series_beat_codes)

In [None]:
# Dataframe with the number of annotations for each code and the respective description.
#
# 'record' is the first column of df_ann and holds record ids, not counts, so
# it is excluded by name. Slicing the column list with [:-1] kept 'record' and
# silently left the last annotation symbol out of the totals.
# The merge is an inner join: symbols without a matching 'Code' in
# df_code_description do not appear in the result.
df_ann_summery = (
    df_ann.drop(columns='record')
    .sum(axis=0)
    .rename_axis('Code')
    .reset_index(name='Count')
    .merge(df_code_description, on='Code')
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)

# Display dataframe
df_ann_summery

In [None]:
# Create a more comprehensive summary with the number of annotations for each
# code for each lead configuration

# One row per record: lead configuration, group and per-symbol counts
df_record_lead_ann = df_record_lead.merge(df_ann, on='record')

# Totals per (group, upper lead, lower lead), ordered by the 'N' column.
# NOTE(review): Code[:-1] leaves out the last row of df_ann_summery --
# confirm that excluding this code is intended.
df_lead_ann_summery = (
    df_record_lead_ann
    .groupby(['group', 'upper_signal', 'lower_signal'])[df_ann_summery.Code[:-1]]
    .sum()
    .reset_index()
    .sort_values('N', ascending=False, ignore_index=True)
)

# Display first values of dataframe

print("df_record_lead_ann")
display(df_record_lead_ann.head())

print("df_lead_ann_summery")
display(df_lead_ann_summery)

In [None]:
# Persist both dataframes as parquet files inside dataframes_directory
df_record_lead_ann.to_parquet(
    path=join(dataframes_directory, 'df_record_lead_ann.parquet')
)
df_lead_ann_summery.to_parquet(
    path=join(dataframes_directory, 'df_lead_ann_summery.parquet')
)

# Conclusions

- The majority of the records use the MLII-V1 lead configuration (40/48)
- In the randomly selected records, 15 out of 23 use the MLII-V1 lead configuration
- In the specially selected records, all 25 records use the MLII-V1 lead configuration