In [None]:
import os
import glob

PATH_DATASET = "/kaggle/input/physionet-ecg-image-digitization"

# Map each entry directly under train/ (keyed by its base name) to the number
# of PNG files found inside it.
dataset = {
    os.path.basename(sample_dir): len(glob.glob(os.path.join(sample_dir, "*.png")))
    for sample_dir in glob.glob(os.path.join(PATH_DATASET, "train", "*"))
}

# The set collapses the per-folder counts to the distinct values that occur.
distinct_counts = set(dataset.values())
print(f"samples: {distinct_counts}")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random

# Choose a random subfolder from the dataset
subfolder = random.choice(list(dataset.keys()))
subfolder_path = os.path.join(PATH_DATASET, "train", subfolder)

# Get all image files in the chosen subfolder; sorted so that the images shown
# for a given subfolder do not depend on the filesystem's listing order.
image_files = sorted(glob.glob(os.path.join(subfolder_path, "*.png")))

# Create a 3x3 grid to display the images
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
axes = axes.ravel()  # Flatten the 2D array of axes for easy iteration

# Hide the frame of every panel first, so panels that receive no image
# (fewer than 9 PNGs in the folder) stay blank instead of showing empty axes.
for ax in axes:
    ax.axis('off')

# zip() stops at the shorter sequence, so at most len(axes) images are drawn
# and a folder holding more than 9 PNGs can no longer index past the grid.
for ax, img_path in zip(axes, image_files):
    ax.imshow(mpimg.imread(img_path))

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

# The CSV is assumed to sit inside the chosen subfolder and to be named after
# it, i.e. <subfolder>/<subfolder>.csv.
csv_file_name = os.path.basename(subfolder_path) + ".csv"
csv_file_path = os.path.join(subfolder_path, csv_file_name)

# Load the table and show its first rows as a sanity check.
df = pd.read_csv(csv_file_path)
print(f"CSV file '{csv_file_name}' loaded successfully.")
display(df.head())

In [None]:
import numpy as np

# Function to find continuous non-NaN intervals in a pandas Series
def find_non_nan_intervals(series):
    """Return the contiguous runs of non-missing values in ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to scan; missing entries are whatever ``Series.notna`` reports
        as missing.

    Returns
    -------
    list[tuple[int, int]]
        One ``(start, end)`` pair per run, in order of appearance. Both bounds
        are inclusive, 0-based *positions* (suitable for ``.iloc``), not index
        labels, so the result does not depend on the Series' index.
    """
    intervals = []
    run_start = None
    # Walk a plain boolean mask by position: index labels are deliberately
    # ignored so that non-default indexes (offset, string, datetime, ...)
    # yield bounds consistent with positional slicing.
    for pos, present in enumerate(series.notna().to_numpy()):
        if present:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            # The run ended on the previous position.
            intervals.append((run_start, pos - 1))
            run_start = None
    if run_start is not None:  # Run extends to the last element
        intervals.append((run_start, len(series) - 1))
    return intervals

# Map every column name to the list of non-NaN runs found in that column
column_intervals = {col: find_non_nan_intervals(df[col]) for col in df.columns}

# Report the runs, one line per column
for col in column_intervals:
    intervals = column_intervals[col]
    print(f"Intervals for column '{col}': {intervals}")

In [None]:
# Plot all non-NaN intervals in a single chart, one color per column
fig, ax = plt.subplots(figsize=(12, 6))

# Qualitative palettes hold a fixed number of swatches. Sampling them with
# evenly spaced floats makes neighbouring samples land on the same swatch once
# there are more columns than swatches, so the palette is indexed by integer
# instead and the 20-swatch variant is used when 10 are not enough.
n_columns = len(df.columns)
palette_name = 'tab10' if n_columns <= 10 else 'tab20'
cmap = plt.colormaps.get_cmap(palette_name)
# An integer argument selects that entry of the palette; the modulo only
# matters beyond 20 columns, where colors necessarily repeat.
colors = [cmap(i % cmap.N) for i in range(n_columns)]

for i, (col, intervals) in enumerate(column_intervals.items()):
    for start, end in intervals:
        # start/end are inclusive positions, hence the +1 on the slice end
        ax.plot(df.index[start:end+1], df[col].iloc[start:end+1], color=colors[i], label=f'{col} ({start}-{end})')

ax.set_title(f"All Non-NaN Intervals from {os.path.basename(subfolder_path)}.csv")
ax.set_xlabel("Index")
ax.set_ylabel("Value")
# Place the legend outside the plot; skipped when nothing was drawn so that
# matplotlib does not warn about a legend without labelled artists.
if ax.get_legend_handles_labels()[0]:
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.grid(True)
plt.show()

In [None]:
# Per-column overview: non-NaN runs on the left, value histogram on the right
n_cols_grid = 2  # left panel = runs, right panel = histogram
n_rows_grid = len(df.columns)  # one grid row per DataFrame column

fig, axes = plt.subplots(nrows=n_rows_grid, ncols=n_cols_grid, figsize=(16, 2 * n_rows_grid))
axes = axes.ravel()  # row-major: [left0, right0, left1, right1, ...]

# All left-hand panels share the full index range so runs line up across rows
all_indices = df.index
min_x, max_x = all_indices.min(), all_indices.max()

# Strided views pick out the left and right panel of every grid row
left_panels = axes[0::n_cols_grid]
right_panels = axes[1::n_cols_grid]

for col, ax_intervals, ax_hist in zip(df.columns, left_panels, right_panels):
    # Left panel: one line per contiguous non-NaN run of this column
    for start, end in column_intervals.get(col, []):
        run = slice(start, end + 1)  # bounds are inclusive
        ax_intervals.plot(df.index[run], df[col].iloc[run], label=f'{col} ({start}-{end})')
    ax_intervals.set(title=f'Column: {col} (Intervals)', xlabel="Index", ylabel="Value")
    ax_intervals.grid(True)
    ax_intervals.legend(loc='upper right')
    ax_intervals.set_xlim(min_x, max_x)

    # Right panel: distribution of the column's values
    df[col].hist(bins=50, ax=ax_hist)
    ax_hist.set(title=f"Histogram of {col}", xlabel="Value", ylabel="Frequency")
    ax_hist.grid(True)

plt.tight_layout()
plt.show()