In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Relative locations of the raw CSV files, keyed by a short dataset name.
# Later cells look entries up with Path.get(<name>).
Path = {
    # PTB Diagnostic ECG database files
    'ptbdb_normal': '../data/raw/ptbdb_normal.csv',
    'ptbdb_abnormal': '../data/raw/ptbdb_abnormal.csv',
    # MIT-BIH files (test and train splits)
    'mitbih_test': '../data/raw/mitbih_test.csv',
    'mitbih_train': '../data/raw/mitbih_train.csv',
}



In [None]:

def addColumnsToDataframe(df):
    """
    Rename the columns of ``df`` in place and return the same frame.

    Every column except the last is named ``c_<position>``, counting from 0;
    the last column is named ``target``.
    """
    feature_count = df.shape[1] - 1
    names = [f'c_{position}' for position in range(feature_count)]
    names.append('target')
    # Assigning to .columns relabels the caller's frame; no copy is made.
    df.columns = names
    return df

def convertColumnAsInt(df, column):
    """
    Cast ``df[column]`` to ``int`` in place and return the same frame.

    Used on the category column, which is read as floats, so that the
    categories can be identified by integer values.
    """
    as_integers = df[column].astype(int)
    # Write the converted values back onto the caller's frame.
    df[column] = as_integers
    return df

def getBarChartFromCategoryValueCounts(category_value_counts):
    """
    Draw ``category_value_counts`` as a bar chart and show it.

    The drawing is delegated to the object's own ``plot(kind='bar')`` method
    (a pandas Series in this notebook); each bar container is then annotated
    with its value at the bar edge.
    """
    plt.figure(figsize=(10, 6))
    axes = category_value_counts.plot(kind='bar')
    axes.set_xlabel('Categories')
    axes.set_ylabel('Count')
    axes.legend()
    axes.grid(False)
    plt.xticks(rotation=360)
    for container in axes.containers:
        axes.bar_label(container, label_type='edge')
    plt.show()


def showTop10DataInChart(df):
    """
    Plot the first ten rows of ``df`` as overlaid line charts and show them.

    Each row becomes one line, with the column position on the x-axis.
    A ``target`` column, if present, is left out so that the class label is
    not drawn as if it were the last sample of the signal.
    """
    # iloc[:10] starts at row 0; the earlier slice [1:10] skipped the first
    # row and therefore showed only nine rows.
    signals = df.drop(columns=['target'], errors='ignore').iloc[:10]
    positions = list(range(signals.shape[1]))

    plt.figure(figsize=(10, 6))
    for row in signals.to_numpy():
        plt.plot(positions, row)
    plt.show()

In [None]:

# Read the normal PTBDB recordings; header=None means no row is used as column names.
ptbdb_normal = pd.read_csv(Path.get('ptbdb_normal'), header=None)

# Name the columns, then convert the target variable from float to int.
ptbdb_normal_with_columns = convertColumnAsInt(
    addColumnsToDataframe(ptbdb_normal), 'target'
)

# Number of rows per target value.
ptbdb_normal_category_counts = ptbdb_normal_with_columns['target'].value_counts()

#getBarChartFromCategoryValueCounts(ptbdb_normal_category_counts)

Inspect what the data rows look like (the first 10 rows are plotted as an example):

In [None]:
# Plot a handful of the leading rows of the normal PTBDB frame as overlaid lines.
showTop10DataInChart(ptbdb_normal_with_columns)

In [None]:
# Read the abnormal PTBDB recordings; header=None means no row is used as column names.
ptbdb_abnormal = pd.read_csv(Path.get('ptbdb_abnormal'), header=None)
ptbdb_abnormal_with_columns = addColumnsToDataframe(ptbdb_abnormal)

# Convert the target on the frame that carries the column names. Passing the
# raw frame here only worked because addColumnsToDataframe renames in place.
ptbdb_abnormal_with_columns = convertColumnAsInt(ptbdb_abnormal_with_columns, 'target')

# Number of rows per target value.
ptbdb_abnormal_category_count = ptbdb_abnormal_with_columns['target'].value_counts()
#getBarChartFromCategoryValueCounts(ptbdb_abnormal_category_count)

Check a sample of abnormal heartbeat signals (the first 10 samples as an example):

In [None]:
# Plot a handful of the leading rows of the abnormal PTBDB frame as overlaid lines.
showTop10DataInChart(ptbdb_abnormal_with_columns)

In [None]:
# Read the MIT-BIH training split. The key must be 'mitbih_train'; reading
# 'mitbih_test' here would make the training frame a copy of the test data.
mitbih_train = pd.read_csv(Path.get('mitbih_train'), header=None)

# Name the columns and convert the target column from float to int.
mitbih_train_with_columns = addColumnsToDataframe(mitbih_train)
mitbih_train_with_columns = convertColumnAsInt(mitbih_train_with_columns, 'target')

# Bar chart of the number of rows per target value.
getBarChartFromCategoryValueCounts(mitbih_train_with_columns['target'].value_counts())

In [None]:
# Read the MIT-BIH test split; header=None means no row is used as column names.
mitbih_test = pd.read_csv(Path.get('mitbih_test'), header=None)

# Name the columns and convert the target column from float to int in one step.
mitbih_test_with_columns = convertColumnAsInt(
    addColumnsToDataframe(mitbih_test), 'target'
)

# Bar chart of the number of rows per target value.
getBarChartFromCategoryValueCounts(mitbih_test_with_columns['target'].value_counts())

In [None]:
# The sentences are built from the frames themselves so that the reported
# numbers always match the data that was actually loaded.

#train:
print(mitbih_train.shape)
print(f"The train data has {mitbih_train.shape[0]} rows and {mitbih_train.shape[1]} columns.")

#test:
print(mitbih_test.shape)
print(f"The test data has {mitbih_test.shape[0]} rows and {mitbih_test.shape[1]} columns.")

In [None]:
# Total number of missing cells per frame: the first sum() counts per column,
# the second adds the per-column counts together.
missing_in_train = int(mitbih_train.isnull().sum().sum())
missing_in_test = int(mitbih_test.isnull().sum().sum())
print("Missing values in train:", missing_in_train)
print("Missing values in test:", missing_in_test)

# State the conclusion only when the counts support it.
if missing_in_train == 0 and missing_in_test == 0:
    print("The data has no missing values.")
else:
    print("The data contains missing values.")

In [None]:
print(mitbih_train.dtypes)
print(mitbih_test.dtypes)

# Summarise the dtypes that are actually present instead of hard-coding them:
# the integer width produced by astype(int) depends on the platform and the
# NumPy version, so a fixed "int32" statement is not reliable.
for split_name, split_frame in (("train", mitbih_train), ("test", mitbih_test)):
    feature_dtypes = sorted({str(dtype) for dtype in split_frame.drop(columns=['target']).dtypes})
    target_dtype = split_frame['target'].dtype
    print(
        f"In the {split_name} data the feature columns have the type(s) "
        f"{', '.join(feature_dtypes)}, while the target column is {target_dtype}."
    )

In [None]:
# Mapping from the integer class code in the target column to a readable name
class_mapping = {
    0: 'Normal beat',
    1: 'Supraventricular premature beat',
    2: 'Premature ventricular contraction',
    3: 'Fusion of ventricular and normal beat',
    4: 'Unclassifiable beat'
}

# Replace the integer codes by their names in both training and test datasets.
# Values that are not keys of class_mapping are kept unchanged, so re-running
# this cell on already-converted names no longer turns every label into NaN.
mitbih_train['target'] = mitbih_train['target'].map(lambda code: class_mapping.get(code, code))
mitbih_test['target'] = mitbih_test['target'].map(lambda code: class_mapping.get(code, code))

In [None]:
# Keep only the rows whose 'target' is not 'Unclassifiable beat'
mitbih_train = mitbih_train.loc[mitbih_train['target'].ne('Unclassifiable beat')]
mitbih_test = mitbih_test.loc[mitbih_test['target'].ne('Unclassifiable beat')]

In [None]:
# Print the target column of the training frame, then of the test frame.
print(mitbih_train["target"], mitbih_test["target"], sep="\n")

In [None]:
# Show the first rows of the training frame as the cell output.
mitbih_train.head()

In [None]:
# Show the first rows of the test frame as the cell output.
mitbih_test.head()

In [None]:
# Training data: number of rows per target category
value_counts_series = mitbih_train['target'].value_counts()

# Custom colours handed to the bar plot
colors = ['green', 'orange', 'yellow', 'red']

plt.figure(figsize=(10, 6))
bar_chart = value_counts_series.plot(kind='bar', color=colors)
bar_chart.set_xlabel('Categories')
bar_chart.set_ylabel('Count')
bar_chart.set_title('Distribution of Target Categories (Training Dataset)')
bar_chart.grid(False)
plt.xticks(rotation=90)
# Write each bar's value at the edge of the bar.
for i in bar_chart.containers:
    bar_chart.bar_label(i, label_type='edge')
plt.show()

In [None]:
# Testing data: number of rows per target category
value_counts_series = mitbih_test['target'].value_counts()

# Custom colours handed to the bar plot
colors = ['green', 'orange', 'yellow', 'red']

plt.figure(figsize=(10, 6))
bar_chart = value_counts_series.plot(kind='bar', color=colors)
bar_chart.set_xlabel('Categories')
bar_chart.set_ylabel('Count')
bar_chart.set_title('Distribution of Target Categories (Testing Dataset)')
bar_chart.grid(False)
plt.xticks(rotation=90)
# Write each bar's value at the edge of the bar.
for i in bar_chart.containers:
    bar_chart.bar_label(i, label_type='edge')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Add one example ECG signal of a single class to the current figure
def plot_overlay_ecg_signals(df, label, color, dataset_label):
    """
    Plot the first row of ``df`` whose ``target`` equals ``label``.

    The line is added with pyplot calls, so it lands on the current figure;
    calling the function repeatedly overlays several classes on one plot.

    Parameters
    ----------
    df : DataFrame with a ``target`` column.
    label : target value selecting the class to draw.
    color : line colour passed to ``plt.plot``.
    dataset_label : text used in the legend entry and in the title.

    Returns
    -------
    None. If no row matches ``label`` a message is printed and nothing is drawn.
    """
    class_rows = df[df['target'] == label]
    if class_rows.empty:
        # iloc[0] on an empty selection would raise IndexError.
        print(f'No rows with target "{label}" found for {dataset_label}; nothing plotted.')
        return

    sample_data = class_rows.iloc[0]
    # Every value of the row except the last one is plotted; in this notebook
    # the last column is the target.
    plt.plot(sample_data.iloc[:-1], label=f'{dataset_label}: {label}', color=color)

    plt.title(f"Overlay of ECG Signals - {dataset_label}")
    plt.ylabel('Amplitude')
    plt.grid(True)
    plt.xticks([])  # Remove x-axis ticks and labels
    plt.legend()

# Classes to draw, in plotting order
classes_to_plot = ['Normal beat', 'Premature ventricular contraction', 
                   'Supraventricular premature beat', 'Fusion of ventricular and normal beat']

# Line colour for each class, paired by position with classes_to_plot
colors = ['green', 'orange', 'yellow', 'red']

# One figure per dataset (training first, then testing), each overlaying
# one example signal per class
for dataset_frame, dataset_name in ((mitbih_train, 'Training'), (mitbih_test, 'Testing')):
    plt.figure(figsize=(12, 8))

    for label, color in zip(classes_to_plot, colors):
        plot_overlay_ecg_signals(dataset_frame, label, color, dataset_name)

    plt.tight_layout()
    plt.show()

In [None]:
def plotCorrelationMatrixByClass(mitbih_train, target, graphWidth=12, tick_frequency=20):
    """
    Show one feature-correlation matrix plot per distinct value of ``target``.

    Parameters
    ----------
    mitbih_train : DataFrame holding the feature columns and the target column.
    target : name of the column whose distinct values define the classes.
    graphWidth : width and height of each (square) figure.
    tick_frequency : only every ``tick_frequency``-th column is labelled on the axes.

    Returns
    -------
    None. Invalid input is reported with a printed message instead of an
    exception, and classes with fewer than two usable columns are skipped
    with a printed message.
    """
    # Ensure the input is a DataFrame and has at least 2 columns
    if not isinstance(mitbih_train, pd.DataFrame) or mitbih_train.shape[1] < 2:
        print('Error: Invalid DataFrame provided. Expected at least 2 columns.')
        return

    # Check that the target column exists in the DataFrame
    if target not in mitbih_train.columns:
        print(f'Error: Target variable "{target}" not found in DataFrame columns.')
        return

    # Correlations are computed on the feature columns only
    df_without_target = mitbih_train.drop(columns=[target])

    for cls in mitbih_train[target].unique():
        # Rows belonging to the current class
        df_class = df_without_target[mitbih_train[target] == cls]

        # Drop columns containing NaN, then keep columns with more than 1 unique value
        df_class = df_class.dropna(axis='columns', how='any')
        df_class = df_class.loc[:, df_class.nunique() > 1]

        # A correlation matrix needs at least two columns
        if df_class.shape[1] < 2:
            print(f'No correlation plots shown for class "{cls}": The number of non-NaN or constant columns ({df_class.shape[1]}) is less than 2')
            continue

        corr = df_class.corr()

        # Draw into the figure created here. A fixed fignum=1 would target
        # whichever figure carries number 1, which is not necessarily this one.
        fig = plt.figure(figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
        corrMat = plt.matshow(corr, fignum=fig.number)

        # Label only every tick_frequency-th column on both axes
        tick_positions = range(0, len(corr.columns), tick_frequency)
        tick_labels = corr.columns[::tick_frequency]
        plt.xticks(tick_positions, tick_labels, rotation=90)
        plt.yticks(tick_positions, tick_labels)

        plt.gca().xaxis.tick_bottom()
        plt.colorbar(corrMat)
        plt.title(f'Correlation Matrix for Target Class: {cls}', fontsize=15)
        plt.show()

# Example usage with the DataFrame mitbih_train and the target variable 'target'
plotCorrelationMatrixByClass(mitbih_train, target='target', graphWidth=15, tick_frequency=20)