
# Default Preprocess For PTB data 
The default preprocessing step is what we conclude in our report. 
You can check below what should be input for this notebook and what would be the output if necessary.

Processing **steps** for PTB dataset:   
    resample: Oversampling  
    rescaling: StandardScaler  


**Input** : raw data:
ptbdb_normal.csv
ptbdb_abnormal.csv

**Output** : Sampled and Scaled data:   
ptb_train_clean_standard_oversampling.csv  
ptb_test_clean_standard_oversampling.csv


In [12]:
import sys
import os
data_path = ''
data_output_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install imbalanced-learn -q


    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    data_output_path = data_path
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'
    data_output_path = '../data/processed/'


Running on local environment
Current working directory: g:\Meine Ablage\heartbeat-analysis-ai\notebooks


In [13]:
# Verify installation and import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler


RawFiles = dict({
    'normal': data_path +  'ptbdb_normal.csv',
    'abnormal': data_path +  'ptbdb_abnormal.csv',
})

OutputFiles = dict({
    'test': data_output_path +  'ptb_test_clean_standard_oversamling.csv',
    'train': data_output_path +  'ptb_train_clean_standard_oversampling.csv',
})

In [3]:
def addColumnsToDataframe(df):
    """
    As the dataset is composed with 188 columns with the 188th columns as the category values,
    so we give the last column the name 'target', others named with 'c_182'
    """
    num_columns= df.shape[1]
    feature_col_name = ['c_' + str(i) for i in range(0, num_columns - 1)]
    df_columns = feature_col_name + ['target']
    df.columns = df_columns
    return df

def convertColumnAsInt(df, column):
    """
    As the category value is in float type. We want to get the int to identify the category.
    """
    df[column] = df[column].astype(int)
    return df


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Lesen der Dateien ein
ptb_normal = pd.read_csv(RawFiles.get('normal'), header=None)
ptb_abnormal = pd.read_csv(RawFiles.get('abnormal'), header=None)

# Kombinieren der beiden DataFrames
ptb_data = pd.concat([ptb_normal, ptb_abnormal], axis=0, ignore_index=True)

# Annahme: Die Funktionen `addColumnsToDataframe` und `convertColumnAsInt` sind vorhanden
ptb_data = addColumnsToDataframe(ptb_data)
ptb_data = convertColumnAsInt(ptb_data, 'target')

# Entfernen von fehlenden Werten
ptb_data = ptb_data.dropna(axis=0)

# Oversampling anwenden, um die Klassen auszugleichen
X = ptb_data.drop(columns=['target'])
y = ptb_data['target']

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Zusammenführen der resampled Daten in einen DataFrame
ptb_data = pd.concat([X_resampled, y_resampled], axis=1)

# Skalieren der Daten mit StandardScaler
scaler = StandardScaler()

X = ptb_data.drop(columns=['target'])
X_scaled = scaler.fit_transform(X)
y = ptb_data['target']

# Splitten der Daten in Trainings- und Test-Sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Spaltenüberschriften definieren
columns = [f'c_{i}' for i in range(X.shape[1])] + ['target']

# Umwandlung der skalierten Daten in Pandas DataFrames mit Spaltenüberschriften
X_train_scaled_df = pd.DataFrame(X_train, columns=columns[:-1])
X_test_scaled_df = pd.DataFrame(X_test, columns=columns[:-1])

# Zielspalten in DataFrames umwandeln und den Namen "target" zuweisen
y_train = pd.DataFrame(y_train, columns=['target']).reset_index(drop=True)
y_test = pd.DataFrame(y_test, columns=['target']).reset_index(drop=True)

# Zusammenführen von X_train_scaled und y_train sowie X_test_scaled und y_test
ptb_train_clean_default = pd.concat([X_train_scaled_df, y_train], axis=1)
ptb_test_clean_default = pd.concat([X_test_scaled_df, y_test], axis=1)

# Sicherstellen, dass die Spaltenüberschriften korrekt sind
assert list(ptb_train_clean_default.columns) == columns, "Trainingsdaten haben nicht die erwarteten Überschriften!"
assert list(ptb_test_clean_default.columns) == columns, "Testdaten haben nicht die erwarteten Überschriften!"

# Speichern der Trainings- und Testdaten mit Header
# Trainingsdaten mit Header speichern
ptb_train_clean_default.to_csv(OutputFiles.get('train'), index=False, header=True)

# Testdaten mit Header speichern
ptb_test_clean_default.to_csv(OutputFiles.get('test'), index=False, header=True)

# Statusmeldung
print("Data Preprocessing is done and saved to the output folder")
print("Train data shape: ", ptb_train_clean_default.shape)
print("Test data shape: ", ptb_test_clean_default.shape)
print("Data saved to: ", OutputFiles.get('train'))
print("Data saved to: ", OutputFiles.get('test'))


Data Preprocessing is done and saved to the output folder
Train data shape:  (16809, 188)
Test data shape:  (4203, 188)
Data saved to:  ../data/processed/ptb_train_clean_standard_oversampling.csv
Data saved to:  ../data/processed/ptb_test_clean_standard_oversamling.csv


In [6]:
# as after scaler, the X_train_scaled X_test_scaled is numpy.ndarray
# convert numpy.ndarray to pandas again with header before concat X_train y_train

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=[f'c_{i}' for i in range(X.shape[1])])
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=[f'c_{i}' for i in range(X.shape[1])])


# concat X_train, y_train(reset index to avoid join)
ptb_train_clean_default = pd.concat(
    [
        X_train_scaled_df, y_train.reset_index(drop=True) 
    ], axis=1)

ptb_test_clean_default = pd.concat(
    [  
        X_test_scaled_df, y_test.reset_index(drop=True)
    ], axis=1)


NameError: name 'X_train_scaled' is not defined

In [6]:
ptb_train_clean_default.shape[0]

11641

In [7]:
#save clean data to 
# ptb_train_clean_default.csv
# ptb_test_clean_default.csv
ptb_train_clean_default.to_csv(OutputFiles.get('train'), index=False)
ptb_test_clean_default.to_csv(OutputFiles.get('test'), index=False)
