# Preprocess For PTB data clean

Processing steps for PTB dataset: no resampling no scaler


**Input** : raw data:  
ptbdb_normal.csv  
ptbdb_abnormal.csv  

**Output** : clearn data:   
ptb_train_clean.csv  
ptb_test_clean.csv

In [1]:
import sys
import os
data_path = ''
data_output_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install imbalanced-learn -q


    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    data_output_path = data_path
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'
    data_output_path = '../data/processed/'


Running on local environment
Current working directory: /Users/pingyuan/Documents/codeself/heartbeat-analysis-ai/notebooks


In [2]:


RawFiles = dict({
    'normal': data_path +  'ptbdb_normal.csv',
    'abnormal': data_path +  'ptbdb_abnormal.csv',
})

OutputFiles = dict({
    'test': data_output_path +  'ptb_test_clean.csv',
    'train': data_output_path +  'ptb_train_clean.csv',
})

In [3]:
def addColumnsToDataframe(df):
    """
    As the dataset is composed with 188 columns with the 188th columns as the category values,
    so we give the last column the name 'target', others named with 'c_182'
    """
    num_columns= df.shape[1]
    feature_col_name = ['c_' + str(i) for i in range(0, num_columns - 1)]
    df_columns = feature_col_name + ['target']
    df.columns = df_columns
    return df
def convertColumnAsInt(df, column):
    df[column] = pd.to_numeric(df[column], errors='coerce') # convert to numeric to handle NaN values
    df.dropna(subset=[column], inplace=True)  # drop the rows with NaN values
    df[column] = df[column].astype(int)  # convert to int
    return df

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

ptb_normal = pd.read_csv(RawFiles.get('normal'), header=None)
ptb_abnormal = pd.read_csv(RawFiles.get('abnormal'), header=None)

ptb_data = pd.concat([ptb_normal, ptb_abnormal], axis=0, ignore_index=True)
ptb_data = ptb_data.sample(frac=1).reset_index(drop=True)

ptb_data = addColumnsToDataframe(ptb_data)  
ptb_data = convertColumnAsInt(ptb_data, 'target') 

ptb_data = ptb_data.dropna(axis=0)
y = ptb_data['target']
X = ptb_data.drop(columns=['target'], inplace=False)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# convert resampling rescaling data back to dataframe to concat
X_train_df = pd.DataFrame(X_train, columns=[f'c_{i}' for i in range(X_train.shape[1])])
X_test_df = pd.DataFrame(X_test, columns=[f'c_{i}' for i in range(X_test.shape[1])])
y_train_df = pd.DataFrame(y_train, columns=['target'])
y_test_df = pd.DataFrame(y_test, columns=['target'])

# concat X_train, y_train(reset index to avoid join)
ptb_train_clean = pd.concat(
    [
        X_train_df, y_train_df
    ], axis=1)

ptb_test_clean = pd.concat(
    [  
        X_test_df, y_test_df.reset_index(drop=True)
    ], axis=1)

print('train shape', ptb_train_clean.shape)
print('test shape', ptb_test_clean.shape)

train shape (11641, 188)
test shape (5230, 188)


In [7]:
#save clean data to 
# ptb_train_clean.csv  
# ptb_test_clean.csv

ptb_train_clean.to_csv(OutputFiles.get('train'), index=False)
ptb_test_clean.to_csv(OutputFiles.get('test'), index=False)


In [8]:
from datetime import datetime
# Display the running time
print("Current time:", datetime.now())

Current time: 2024-11-07 11:59:24.850238
