
# Default Preprocessing for PTB Data
This notebook applies the default preprocessing pipeline that we settled on in our report.  
The expected input files and the files produced by this notebook are listed below.

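Processing **steps** for the PTB dataset:   
    resampling: random oversampling (`RandomOverSampler`), applied to the training set only, after the train/test split  
    rescaling: `StandardScaler`, fitted on the resampled training set and then applied to the test set  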


**Input** : raw data:  
ptbdb_normal.csv  
ptbdb_abnormal.csv  

**Output** : Sampled and Scaled data:   
ptb_train_clean_standard_oversampling.csv  
ptb_test_clean_standard_oversampling.csv


In [1]:
import sys
import os
data_path = ''
data_output_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install imbalanced-learn -q


    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    data_output_path = data_path
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'
    data_output_path = '../data/processed/'


Running on local environment
Current working directory: g:\Meine Ablage\heartbeat-analysis-ai\notebooks


In [2]:
# Verify installation and import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler


# Raw input CSV files, one per class.
RawFiles = dict(
    normal=data_path + 'ptbdb_normal.csv',
    abnormal=data_path + 'ptbdb_abnormal.csv',
)

# Destination CSV files for the processed test and train sets.
OutputFiles = dict(
    test=data_output_path + 'ptb_test_clean_standard_oversampling.csv',
    train=data_output_path + 'ptb_train_clean_standard_oversampling.csv',
)

In [3]:
def addColumnsToDataframe(df):
    """
    Assign column names to a headerless dataframe, in place.

    The last column holds the category value and is named 'target'; every
    preceding column is a feature and is named 'c_<position>', starting at
    'c_0'.

    Parameters:
        df: dataframe whose columns are renamed (the object itself is modified).

    Returns:
        The same dataframe object, with the new column names.
    """
    feature_count = len(df.columns) - 1
    new_names = [f'c_{position}' for position in range(feature_count)]
    new_names.append('target')
    df.columns = new_names
    return df

def convertColumnAsInt(df, column):
    """
    Cast one column of a dataframe to int, in place.

    The category values are stored as floats; casting them to int gives
    integer category identifiers.

    Parameters:
        df: dataframe to modify (the object itself is modified).
        column: name of the column to cast.

    Returns:
        The same dataframe object, with `column` cast to int.
    """
    as_int = df[column].astype(int)
    df[column] = as_int
    return df


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# One seed shared by every stochastic step below, so that a fresh
# "Restart & Run All" reproduces exactly the same output files.
RANDOM_STATE = 42

# Load the two raw files (they have no header row) and stack them into one frame.
ptb_normal = pd.read_csv(RawFiles.get('normal'), header=None)
ptb_abnormal = pd.read_csv(RawFiles.get('abnormal'), header=None)
ptb_data = pd.concat([ptb_normal, ptb_abnormal], axis=0, ignore_index=True)

# Name the columns: features 'c_0', 'c_1', ... and the last column 'target'.
ptb_data = addColumnsToDataframe(ptb_data)

# Drop incomplete rows BEFORE casting the label to int: astype(int) raises on
# NaN, so casting first would fail instead of letting dropna remove the row.
ptb_data = ptb_data.dropna(axis=0).reset_index(drop=True)
ptb_data = convertColumnAsInt(ptb_data, 'target')

# Shuffle the rows with a fixed seed (the unseeded shuffle made every run differ).
ptb_data = ptb_data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

y = ptb_data['target']
X = ptb_data.drop(columns=['target'])

# Split train/test BEFORE resampling so that no duplicated sample can end up
# in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Oversample the training set only; the test set keeps its original distribution.
randomOverSampler = RandomOverSampler(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = randomOverSampler.fit_resample(X_train, y_train)

# Rescale with StandardScaler: fit on the (resampled) training data only, then
# apply the same transformation to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled arrays back to dataframes. All four frames are built from
# plain arrays, so they share a fresh 0..n-1 index and the column-wise concat
# below pairs rows by position.
feature_columns = list(X.columns)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_columns)
y_train_resampled_df = pd.DataFrame({'target': np.asarray(y_train_resampled)})
y_test_df = pd.DataFrame({'target': np.asarray(y_test)})

assert len(X_train_scaled_df) == len(y_train_resampled_df), "train features/labels length mismatch"
assert len(X_test_scaled_df) == len(y_test_df), "test features/labels length mismatch"

# Join features and label column-wise.
ptb_train_clean_standard_oversampling = pd.concat(
    [X_train_scaled_df, y_train_resampled_df], axis=1
)
ptb_test_clean_standard_oversampling = pd.concat(
    [X_test_scaled_df, y_test_df], axis=1
)

# A misaligned concat would show up as NaN labels.
assert not ptb_train_clean_standard_oversampling['target'].isna().any()
assert not ptb_test_clean_standard_oversampling['target'].isna().any()



In [5]:
# Save the cleaned training and test data as CSV files with a header row.
# Create the output folder(s) first: to_csv does not create missing
# directories and would fail if e.g. '../data/processed/' does not exist yet.
for output_file in OutputFiles.values():
    output_dir = os.path.dirname(output_file)
    if output_dir:  # empty when the file is written to the working directory
        os.makedirs(output_dir, exist_ok=True)

ptb_train_clean_standard_oversampling.to_csv(OutputFiles.get('train'), index=False, header=True)
ptb_test_clean_standard_oversampling.to_csv(OutputFiles.get('test'), index=False, header=True)

# show some info
print("Data Preprocessing is done and saved to the output folder")
print("Train data shape: ", ptb_train_clean_standard_oversampling.shape)
print("Test data shape: ", ptb_test_clean_standard_oversampling.shape)
print("Data saved to: ", OutputFiles.get('train'))
print("Data saved to: ", OutputFiles.get('test'))

Data Preprocessing is done and saved to the output folder
Train data shape:  (16812, 188)
Test data shape:  (2911, 188)
Data saved to:  ../data/processed/ptb_train_clean_standard_oversampling.csv
Data saved to:  ../data/processed/ptb_test_clean_standard_oversampling.csv


In [6]:
from datetime import datetime
# Print the current timestamp.
current_time = datetime.now()
print("Current time:", current_time)

Current time: 2024-11-13 10:05:21.141005
