In [None]:
import sys
import os
data_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install tensorflow -q
    !pip install keras -q
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install matplotlib -q
    !pip install umap-learn
    !pip install seaborn

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'


In [None]:
# Verify installation and import libraries
import tensorflow as tf
import keras
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
from matplotlib.colors import ListedColormap
from sklearn.manifold import TSNE

# Lookup table: dataset name -> location of its CSV file under `data_path`.
# Every entry carries the '.csv' extension (the 'ptbdb_abnormal' entry used to miss it).
# os.path.join keeps the result valid whether or not `data_path` ends with a separator.
Path = {
    'mitbih_test': os.path.join(data_path, 'mitbih_test.csv'),
    'mitbih_train': os.path.join(data_path, 'mitbih_train.csv'),
    'ptbdb_normal': os.path.join(data_path, 'ptbdb_normal.csv'),
    'ptbdb_abnormal': os.path.join(data_path, 'ptbdb_abnormal.csv'),
}

In [None]:
def addColumnsToDataframe(df):
    """
    Rename the columns of ``df`` and return the same DataFrame object.

    The last column is named 'target' (it holds the category values); every
    other column is named 'c_<position>', i.e. 'c_0', 'c_1', ... in order.
    The column labels are assigned on ``df`` itself, so the caller's frame
    is modified.
    """
    feature_count = df.shape[1] - 1
    df.columns = [f'c_{position}' for position in range(feature_count)] + ['target']
    return df

def convertColumnAsInt(df, column):
    """
    Cast one column of ``df`` to an integer dtype and return the same frame.

    The category value is stored as a float; casting it with ``astype(int)``
    gives integer category identifiers. The converted column is written back
    into ``df``, so the caller's frame is modified.
    """
    as_integers = df[column].astype(int)
    df[column] = as_integers
    return df

def getBarChartFromCategoryValueCounts(category_value_counts):
    """
    Draw and show a bar chart of per-category counts.

    ``category_value_counts`` is a pandas Series; its own ``plot`` method is
    used to draw the bars. Each bar is annotated with its value.
    """
    plt.figure(figsize=(10, 6))
    axes = category_value_counts.plot(kind='bar')

    # Axis titles, legend and grid are set through pyplot's current axes.
    plt.ylabel('Count')
    plt.xlabel('Categories')
    plt.legend()
    plt.grid(False)
    plt.xticks(rotation=360)

    # Write the bar height at the edge of every bar.
    for bar_group in axes.containers:
        axes.bar_label(bar_group, label_type='edge')

    plt.show()


def showTop10DataInChart(df, n_rows=10):
    """
    Plot the first rows of ``df`` as line charts in a single figure.

    Parameters:
    df (DataFrame): Frame whose rows are plotted; every column is used as one
        point on the x-axis (a label column, if present, is plotted as well).
    n_rows (int): Number of leading rows to draw. Defaults to 10.

    The rows are taken from the top of the frame, starting at the first row
    (the earlier slice ``[1: 10]`` skipped row 0 and drew only nine rows).
    """
    plt.figure(figsize=(10, 6))
    x_positions = list(range(df.shape[1]))
    # Slicing past the end is safe: a shorter frame simply yields fewer lines.
    for row_values in df.values[:n_rows]:
        plt.plot(x_positions, row_values)
    plt.show()

In [None]:
# Read the MIT-BIH training CSV; header=None because the file is read without a header row.
mitbih_train = pd.read_csv(Path.get('mitbih_train'), header=None)
# Name the columns, then cast 'target' to int. Both helpers modify the frame
# they receive and return it, so `mitbih_train_with_columns` is the same
# object as `mitbih_train`.
mitbih_train_with_columns = convertColumnAsInt(
    addColumnsToDataframe(mitbih_train),
    'target',
)

In [None]:
# Read the MIT-BIH test CSV; header=None because the file is read without a header row.
mitbih_test = pd.read_csv(Path.get('mitbih_test'), header=None)
# Name the columns, then cast 'target' to int. Both helpers modify the frame
# they receive and return it, so `mitbih_test_with_columns` is the same
# object as `mitbih_test`.
mitbih_test_with_columns = convertColumnAsInt(
    addColumnsToDataframe(mitbih_test),
    'target',
)

In [None]:
# Report the size of each split. The sentence is built from the actual shape
# so it cannot drift from the data that was loaded.
for split_name, split_frame in (("train", mitbih_train), ("test", mitbih_test)):
    n_rows, n_cols = split_frame.shape
    print(split_frame.shape)
    print(f"The {split_name} data has {n_rows} rows and {n_cols} columns.")
    print()

In [None]:
# Show the 'target' column of the training split, then of the test split.
for split_frame in (mitbih_train, mitbih_test):
    print(split_frame["target"])

In [None]:
print(mitbih_train.dtypes)
print(mitbih_test.dtypes)

# Summarise the dtypes from the data instead of hardcoding them: the integer
# width produced by `astype(int)` depends on the platform / numpy version.
feature_dtypes = sorted(set(mitbih_train.dtypes.drop('target').astype(str)))
target_dtype = mitbih_train['target'].dtype
print(
    f"Feature columns have dtype(s) {', '.join(feature_dtypes)}, "
    f"while the target column is {target_dtype}."
)

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
for split_frame in (mitbih_test, mitbih_train):
    split_frame.info(show_counts=True)
    print()
    print("Size of the DataFrame", split_frame.shape, end='\n\n')


In [None]:
# Count fully duplicated rows, first in the training split, then in the test split.
# `nb_rows_duplicated` ends up holding the count of the test split.
for split_frame in (mitbih_train, mitbih_test):
    nb_rows_duplicated = split_frame.duplicated().sum()
    print("Number of rows duplicated :", nb_rows_duplicated)

In [None]:
# Total number of missing cells per split (summed over all columns) instead of
# printing one count per column.
missing_train = int(mitbih_train.isnull().sum().sum())
missing_test = int(mitbih_test.isnull().sum().sum())
print("Missing values in train:", missing_train)
print("Missing values in test:", missing_test)

# Only state the conclusion the numbers actually support.
if missing_train == 0 and missing_test == 0:
    print("The data has no missing values.")
else:
    print("The data contains missing values.")

In [None]:
# Preview the first five rows of the training split.
mitbih_train.head(n=5)

In [None]:
# Preview the first five rows of the test split.
mitbih_test.head(n=5)

Visualize Target 

In [None]:
# One row per category: (integer code, display name, plot colour).
_class_table = [
    (0, 'Normal', 'green'),             # Normal beat
    (1, 'Supraventricular', 'yellow'),  # Supraventricular premature beat
    (2, 'Premature', 'red'),            # Premature ventricular contraction
    (3, 'Fusion', 'orange'),            # Fusion of ventricular and normal beat
    (4, 'Unclassifiable', 'gray'),      # Unclassifiable beat
]

# Integer code -> display name
class_mapping = {code: name for code, name, _ in _class_table}

# Integer code -> custom colour
color_mapping = {code: colour for code, _, colour in _class_table}

# All category codes, in ascending order
classes_to_plot = list(class_mapping)

# Translate the integer targets to display names, then count rows per name.
train_class_names = mitbih_train['target'].map(class_mapping)
value_counts_series_train = train_class_names.value_counts()

test_class_names = mitbih_test['target'].map(class_mapping)
value_counts_series_test = test_class_names.value_counts()

In [None]:
# Print the per-class row counts of each split under its own heading.
print("Training Dataset Class Distribution:", value_counts_series_train, sep="\n")
print("\nTesting Dataset Class Distribution:", value_counts_series_test, sep="\n")


Barplots

In [None]:
def _plot_class_distribution(value_counts, title):
    """
    Draw a labelled bar chart of per-class counts and return its Axes.

    Parameters:
    value_counts (Series): Counts indexed by class display name
        (the values of ``class_mapping``).
    title (str): Title of the figure.

    Each bar is coloured by the class it represents. ``value_counts()`` orders
    its result by frequency, so the colours are looked up by class name rather
    than taken in the positional order of ``color_mapping``.
    """
    name_to_color = {class_mapping[code]: color_mapping[code] for code in class_mapping}
    bar_colors = [name_to_color[class_name] for class_name in value_counts.index]

    fig, ax = plt.subplots(figsize=(10, 6))
    value_counts.plot(kind='bar', color=bar_colors, ax=ax)
    ax.set_xlabel('Categories')
    ax.set_ylabel('Count')
    ax.set_title(title)
    # Rotate the x-axis labels so the class names stay readable.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Write the count at the edge of every bar.
    for container in ax.containers:
        ax.bar_label(container, label_type='edge')

    fig.tight_layout()
    plt.show()
    return ax


# train data
bar_chart = _plot_class_distribution(
    value_counts_series_train,
    'Distribution of Target Categories (Training Dataset)',
)

# test data
bar_chart = _plot_class_distribution(
    value_counts_series_test,
    'Distribution of Target Categories (Testing Dataset)',
)

Plot one sample ECG signal per class

In [None]:
# Draws one example ECG signal of a single class from one dataset onto the current figure
def plot_overlay_ecg_signals(df, label, color, dataset_label):
    """
    Add one example ECG signal of a single class to the current pyplot figure.

    Calling this repeatedly on the same figure overlays the classes.

    Parameters:
    df (DataFrame): DataFrame containing ECG signals and a 'target' column
    label (str or int): Class to plot; a string is looked up among the values
        of ``class_mapping``, anything else is used as the class code directly
    color (str): Color for the plotted line
    dataset_label (str): Label for the dataset (e.g., 'Training', 'Testing'),
        used in the legend entry and the title

    If ``df`` has no row of the requested class, a message is printed and
    nothing is drawn.
    """
    # Resolve a class name to its integer code; other labels pass through unchanged.
    class_label = label
    if isinstance(label, str):
        matching_codes = [code for code, name in class_mapping.items() if name == label]
        class_label = matching_codes[0]

    # Keep only the rows that belong to the requested class.
    rows_for_class = df[df['target'] == class_label]
    if rows_for_class.empty:
        print(f"No data found for class {class_mapping[class_label]}")
        return

    # The first matching row is the example; its last entry (the 'target'
    # column when that column is last) is dropped before plotting.
    first_row = rows_for_class.iloc[0]
    signal = first_row[:-1]
    plt.plot(signal, label=f'{dataset_label}: {class_mapping[class_label]}', color=color)

    # Figure decoration; re-applied on every call.
    plt.title(f"Overlay of ECG Signals - {dataset_label}")
    plt.ylabel('Amplitude')
    plt.grid(True)
    plt.xticks([])  # hide x-axis ticks and labels
    plt.legend()

# List of classes to plot (by display name), in drawing order
classes_to_plot = ['Normal', 'Premature', 'Supraventricular', 'Fusion', 'Unclassifiable']

# Colors for each class, looked up in `color_mapping` so that every class keeps
# the same colour as in the other plots of this notebook.
_name_to_code = {name: code for code, name in class_mapping.items()}
colors = [color_mapping[_name_to_code[name]] for name in classes_to_plot]

# One figure per dataset, each overlaying one example signal per class
for dataset_label, dataset_frame in (('Training', mitbih_train), ('Testing', mitbih_test)):
    plt.figure(figsize=(12, 8))

    for label, color in zip(classes_to_plot, colors):
        plot_overlay_ecg_signals(dataset_frame, label, color, dataset_label)

    plt.tight_layout()
    plt.show()