In [2]:
# Step up one directory; presumably this makes the relative './data/...' paths
# used by the cells below resolve (the recorded output shows the resulting
# working directory). NOTE(review): not idempotent - re-running this cell moves
# up one more level each time.
cd ..

/home/dmitriishubin/Desktop/physionet-challenge-2020


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import os
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
from metrics.metrics import Metric
from kardioml.data.resample import Resampling

from kardioml.data.p_t_wave_detector import PTWaveDetection

# Shared helper instances. `resampling` is used by the downsampling check
# further down; `metric` is not referenced in the cells shown here.
metric = Metric()
resampling = Resampling()

In [4]:
# Select matplotlib's Qt backend for the figures created below.
%matplotlib qt

In [9]:
# Datasets scanned for records. The full set is ['A','B','C','D','E','F'];
# the subset below is the one currently used for training.
DATASETS = ['A','B','D','E']

# Extension-less record paths: '<path>.npy' holds the signal and
# '<path>.json' the matching metadata.
datalist = []

for dataset in DATASETS:
    folder = f'./data/{dataset}/formatted/'
    for entry in os.listdir(folder):
        # Test the actual suffix: a substring search would also accept names
        # such as 'x.npy.bak' and then strip the wrong four characters.
        if entry.endswith('.npy'):
            datalist.append(folder + entry[:-4])



# Check that the length stored in the JSON metadata matches the numpy signal

In [None]:
# Verify that the sample count stored in each record's JSON metadata
# (meta['shape'][0]) equals the first dimension of the saved signal array.
# Every inconsistent record is collected as (path, json_length, numpy_length)
# instead of silently stopping at the first one.
shape_mismatches = []

for data in tqdm(datalist):
    signal = np.load(data+'.npy')
    # Context manager so the metadata file handle is closed on every iteration.
    with open(data+'.json') as meta_file:
        meta = json.load(meta_file)

    if meta['shape'][0] != signal.shape[0]:
        shape_mismatches.append((data, meta['shape'][0], signal.shape[0]))

print(f'{len(shape_mismatches)} of {len(datalist)} records have inconsistent lengths')
# Show at most the first few offenders to keep the cell output short.
shape_mismatches[:10]
    

# Check the length distribution, all datasets

In [None]:
# Collect the length (meta['shape'][0]) of every labelled record and remember
# which records exceed MAX_LENGTH samples.
MAX_LENGTH = 38000

length_list = []        # lengths of all labelled records (long ones included)
exclusions = []         # record names longer than MAX_LENGTH
exclusions_labels = []  # their 'labels_full' entries
exclusions_digits = []  # their 'labels_training_merged' entries

for data in tqdm(datalist):
    # Context manager so the metadata file handle is closed on every iteration.
    with open(data+'.json') as meta_file:
        meta = json.load(meta_file)
    if meta['labels_training_merged'] is None:
        # Unlabelled records are left out of the statistics entirely.
        continue
    if meta['shape'][0] > MAX_LENGTH:
        exclusions.append(data.split('/')[-1])
        exclusions_labels.append(meta['labels_full'])
        exclusions_digits.append(meta['labels_training_merged'])
    length_list.append(meta['shape'][0])


# NOTE(review): sns.distplot is deprecated in recent seaborn releases; switch
# to sns.histplot/displot once the installed seaborn version is confirmed.
sns.distplot(length_list)

In [None]:
# Histogram of record lengths over 2000 equal-width bins; only the first 100
# bins are plotted to zoom in on the short-record end of the distribution.
N_BINS = 2000
N_SHOWN = 100

hist = np.histogram(length_list, N_BINS)
counts, bin_edges = hist
# x and y must have the same length: slice the counts as well as the left bin
# edges (previously 100 x-values were plotted against all 2000 counts, which
# makes plt.plot raise a shape-mismatch error).
plt.plot(bin_edges[:-1][:N_SHOWN], counts[:N_SHOWN])

In [None]:
# 99th percentile of the collected record lengths, displayed as the cell output.
length_p99 = np.percentile(length_list, 99)
length_p99

# Check distribution of classes, calculate weights

In [10]:
# Build the (n_records, N_CLASSES) label matrix from each record's
# 'labels_training_merged' vector.
N_CLASSES = 27
# Index pairs handled jointly: a positive in either member marks both positive.
LINKED_CLASS_PAIRS = [(4, 18), (23, 12), (26, 13)]

labels = []

for data in tqdm(datalist):
    # Context manager so the metadata file handle is closed on every iteration.
    with open(data+'.json') as meta_file:
        meta = json.load(meta_file)

    label = meta['labels_training_merged']
    if label is None:
        # Records without training labels contribute an all-zero row.
        label = [0]*N_CLASSES

    for first, second in LINKED_CLASS_PAIRS:
        if label[first] > 0 or label[second] > 0:
            label[first] = 1
            label[second] = 1
    labels.append(label)

labels = np.array(labels)

100%|██████████| 32683/32683 [02:37<00:00, 207.99it/s]


In [14]:
# Per-class multipliers: 1.0 everywhere except the six classes that are set
# jointly in pairs when the label matrix is built, which get 0.5 each.
weights_arr = np.ones(27)
halved_class_indices = [4, 18, 23, 12, 13, 26]
weights_arr[halved_class_indices] = 0.5
weights_arr

array([1. , 1. , 1. , 1. , 0.5, 1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.5,
       0.5, 1. , 1. , 1. , 1. , 0.5, 1. , 1. , 1. , 1. , 0.5, 1. , 1. ,
       0.5])

In [20]:
# Class weights relative to class 21: the frequency of class 21 divided by the
# frequency of each class, then scaled by the per-class multipliers.
class_counts = np.sum(labels, axis=0)
class_frequency = class_counts / labels.shape[0]
relative_to_reference = class_frequency[21] / class_frequency
# (A max-normalisation step, weights / max(weights), is intentionally not applied.)
weights = (relative_to_reference * weights_arr).tolist()
weights

[11.750153846153845,
 6.577333792628315,
 149.171875,
 70.45756457564576,
 3.7990449661758854,
 15.858803986710964,
 11.742927429274292,
 3.7104547221142634,
 23.57283950617284,
 104.91208791208791,
 24.078184110970998,
 63.85953177257525,
 7.372200772200772,
 49.21134020618557,
 56.15882352941176,
 156.50819672131146,
 34.77959927140255,
 55.50581395348837,
 3.7990449661758854,
 24.385696040868453,
 27.99706744868035,
 1.0,
 16.89734513274336,
 7.372200772200772,
 8.066751161808195,
 63.85953177257525,
 49.21134020618557]

In [None]:
# TODO: this cell previously held only the unfinished expression `sns.di`,
# which raises AttributeError when executed and stops a "Run All".
# Presumably a seaborn distribution plot was intended here - confirm what
# should be plotted and complete the call.

# Checking PT detector performance

In [None]:
# Visual check of the stored P/T wave positions on the first few records:
# each of the 12 channels is drawn as a vertically shifted trace with the
# 'p_waves' and 't_waves' sample indices from the metadata marked on top.
N_RECORDS = 11          # how many records from the start of datalist to inspect
MAX_LENGTH = 38000      # records longer than this are skipped
CHANNEL_OFFSET = 500    # vertical shift between stacked channels
STOP_RECORD = 'A2091'   # record name after which the inspection stops

# Plain slice instead of filtering every index against a list of positions.
datalist_sub = datalist[0:N_RECORDS]


for data in tqdm(datalist_sub):
    # Context manager so the metadata file handle is closed on every iteration.
    with open(data+'.json') as meta_file:
        meta = json.load(meta_file)
    # Skip over-long records and records without training labels.
    if meta['shape'][0] > MAX_LENGTH or meta['labels_training_merged'] is None:
        continue

    signal = np.load(data+'.npy')

    plt.figure(figsize=(20,10))
    for i in range(12):
        channel = signal[:,i]
        offset = i*CHANNEL_OFFSET
        plt.plot(channel+offset)
        for wave_key in ('p_waves', 't_waves'):
            positions = meta[wave_key][i]
            plt.plot(positions, channel[positions]+offset, '*')
    # Set the title before show(): once the figure has been handed to the
    # backend, a later plt.title() may no longer end up on it.
    plt.title(str(meta['labels_full'])+' | '+data)
    plt.show()
    print('===================')
    print(data)
    print(meta['labels_full'])
    print('===================')
    # Compare the record name only: the paths in datalist start with './data/',
    # so the former comparison against '../data/A/formatted/A2091' never matched.
    if data.split('/')[-1] == STOP_RECORD:
        break

# Checking the downsampling algorithm

In [207]:
# Visual check of the downsampling step on the first few records: each of the
# 12 channels is drawn as a vertically shifted trace, followed by the output of
# resampling.downsample(channel, order=2) drawn with the same shift.
N_RECORDS = 11          # how many records from the start of datalist to inspect
MAX_LENGTH = 38000      # records longer than this are skipped
CHANNEL_OFFSET = 500    # vertical shift between stacked channels
STOP_RECORD = 'A2091'   # record name after which the inspection stops

# Plain slice instead of filtering every index against a list of positions.
datalist_sub = datalist[0:N_RECORDS]


for data in tqdm(datalist_sub):
    # Context manager so the metadata file handle is closed on every iteration.
    with open(data+'.json') as meta_file:
        meta = json.load(meta_file)
    # Skip over-long records and records without training labels.
    if meta['shape'][0] > MAX_LENGTH or meta['labels_training_merged'] is None:
        continue

    signal = np.load(data+'.npy')

    plt.figure(figsize=(20,10))
    for i in range(12):
        offset = i*CHANNEL_OFFSET
        channel = signal[:,i]
        plt.plot(channel+offset)
        # NOTE(review): both traces are plotted against their own sample index,
        # so they only line up if downsample() keeps the length - confirm.
        channel = resampling.downsample(channel,order=2)
        plt.plot(channel+offset)
    # Set the title before show(): once the figure has been handed to the
    # backend, a later plt.title() may no longer end up on it.
    plt.title(str(meta['labels_full'])+' | '+data)
    plt.show()
    print('===================')
    print(data)
    print(meta['labels_full'])
    print('===================')
    # Compare the record name only: the paths in datalist start with './data/',
    # so the former comparison against '../data/A/formatted/A2091' never matched.
    if data.split('/')[-1] == STOP_RECORD:
        break

 36%|███▋      | 4/11 [00:00<00:00, 16.97it/s]

./data/A/formatted/A3306
['right bundle branch block']
./data/A/formatted/A6853
['sinus rhythm']
./data/A/formatted/A3233
['1st degree av block']
./data/A/formatted/A3855
['atrial fibrillation']


 82%|████████▏ | 9/11 [00:00<00:00, 12.37it/s]

./data/A/formatted/A5826
['1st degree av block']
./data/A/formatted/A6747
['right bundle branch block']
./data/A/formatted/A0178
['right bundle branch block']


100%|██████████| 11/11 [00:00<00:00, 12.10it/s]

./data/A/formatted/A5676
['atrial fibrillation']
./data/A/formatted/A3929
['1st degree av block']





# Get list of records without labels

In [12]:
# Datasets scanned for records. The full set is ['A','B','C','D','E','F'];
# the subset below is the one currently used for training.
DATASETS = ['A','B','D','E']

# 'filename' metadata entries of all records whose 'labels_training_merged' is None.
datalist_nones = []

for dataset in DATASETS:
    folder = f'./data/{dataset}/formatted/'
    # Test the actual suffix: a substring search would also accept names such
    # as 'x.npy.bak' and then strip the wrong four characters.
    files = [entry[:-4] for entry in os.listdir(folder) if entry.endswith('.npy')]
    for file in tqdm(files):
        # Context manager so the metadata file handle is closed on every iteration.
        with open(folder+file+'.json') as meta_file:
            meta = json.load(meta_file)
        if meta['labels_training_merged'] is None:
            datalist_nones.append(meta['filename'])

100%|██████████| 6877/6877 [00:35<00:00, 192.96it/s]
100%|██████████| 3453/3453 [00:19<00:00, 175.17it/s]
100%|██████████| 516/516 [00:20<00:00, 25.57it/s]
100%|██████████| 21837/21837 [01:19<00:00, 276.02it/s]


In [17]:
# Persist the names of the unlabelled records as {"data": [...]}.
dataset_json = {'data': datalist_nones}

# Context manager so the output file is flushed and closed as soon as the dump
# finishes, instead of relying on garbage collection of an anonymous handle.
with open("additional_data.json","w") as out_file:
    json.dump(dataset_json, out_file)