
# Default Preprocessing for PTB Data
This notebook applies the default preprocessing steps that we settled on in our report.
The expected input files and the generated output files are listed below.

Processing **steps** for PTB dataset:   
    resample: None   
    rescaling: StandardScaler  


**Input** : the original data.   
ptbdb_normal.csv  
ptbdb_abnormal.csv  

**Output** : The cleaned data.   
ptb_train_clean_default.csv  
ptb_test_clean_default.csv  



In [54]:
import sys
import os
data_path = ''
data_output_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install imbalanced-learn -q


    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    data_output_path = data_path
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'
    data_output_path = '../data/processed/'


Running on local environment
Current working directory: /Users/pingyuan/Documents/codeself/heartbeat-analysis-ai/notebooks


In [55]:
# Import libraries and define the input/output file paths


import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Raw PTB input files, keyed by heartbeat class.
RawFiles = {
    'normal': data_path + 'ptbdb_normal.csv',
    'abnormal': data_path + 'ptbdb_abnormal.csv',
}

# Cleaned output files, keyed by split.
OutputFiles = {
    'test': data_output_path + 'ptb_test_clean_default.csv',
    'train': data_output_path + 'ptb_train_clean_default.csv',
}

In [56]:
def addColumnsToDataframe(df):
    """
    Name the columns of a header-less dataframe, in place.

    Every column except the last one is named 'c_<position>', counting from
    'c_0'. The last column holds the category value and is named 'target'.

    The dataframe passed in is modified and the same object is returned.
    """
    feature_count = df.shape[1] - 1
    df.columns = [f'c_{position}' for position in range(feature_count)] + ['target']
    return df

def convertColumnAsInt(df, column):
    """
    Cast one column of the dataframe to int, in place.

    The category value is stored as a float; converting it to int gives a
    clean integer identifier for each category.

    The dataframe passed in is modified and the same object is returned.
    """
    int_values = df[column].astype(int)
    df[column] = int_values
    return df


In [57]:

# Load both PTB classes; the raw CSV files have no header row.
ptb_normal = pd.read_csv(RawFiles.get('normal'), header=None)
ptb_abnormal = pd.read_csv(RawFiles.get('abnormal'), header=None)

ptb_data = pd.concat([ptb_normal, ptb_abnormal], axis=0, ignore_index=True)

ptb_data = addColumnsToDataframe(ptb_data)

# Drop rows with null values BEFORE casting the target to int: astype(int)
# raises on NaN, so the cast must only ever see complete rows.
# reset_index gives the cleaned frame a fresh, contiguous index.
ptb_data = ptb_data.dropna(how='any').reset_index(drop=True)
ptb_data = convertColumnAsInt(ptb_data, 'target')

# no resampling for ptb data

# rescale with StandardScaler
y = ptb_data['target']
X = ptb_data.drop(columns=['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training split only and then applied to the
# test split, so no statistics of the test data are used for fitting.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# save clean data to
# ptb_train_clean_default.csv
# ptb_test_clean_default.csv

# The scaler returns plain arrays. Rebuild the dataframes with the index of
# the split they came from: pd.concat(axis=1) aligns rows on the index, and
# y_train / y_test still carry the shuffled index produced by train_test_split.
# Using a fresh 0..n-1 index here would pair features with the wrong targets
# and introduce NaN rows.
X_train_scaled_df = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)
y_train_df = y_train.to_frame(name='target')
y_test_df = y_test.to_frame(name='target')

ptb_train_clean_default = pd.concat([X_train_scaled_df, y_train_df], axis=1)
ptb_test_clean_default = pd.concat([X_test_scaled_df, y_test_df], axis=1)

# Sanity checks: every row keeps its own target and nothing was lost or added.
assert len(ptb_train_clean_default) == len(X_train), "train rows were misaligned"
assert len(ptb_test_clean_default) == len(X_test), "test rows were misaligned"
assert ptb_train_clean_default['target'].notna().all(), "train targets contain NaN"
assert ptb_test_clean_default['target'].notna().all(), "test targets contain NaN"

ptb_train_clean_default.to_csv(OutputFiles.get('train'), index=False)
ptb_test_clean_default.to_csv(OutputFiles.get('test'), index=False)
