
# Preprocessing for MIT data(Binary classification) clean without resampling or rescaler


Processing **steps** for MIT dataset:  no resampling no scaler


**Input** : Cleaned data with dummy target variable.   
mitbih_test.csv   
mitbih_train.csv

**Output** : clean data:   
mitbih_train_clean.csv  
mitbih_test_clean.csv


In [1]:
import sys
import os
data_path = ''
data_output_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install imbalanced-learn -q


    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    data_output_path = data_path
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'
    data_output_path = '../data/processed/'


Running on local environment
Current working directory: /Users/pingyuan/Documents/codeself/heartbeat-analysis-ai/notebooks


In [2]:
# Verify installation and import libraries
import pandas as pd


RawFiles = dict({
    'test': data_path +  'mitbih_test.csv', # MIT data set in which we have dummy-coded the target variable
    'train': data_path +  'mitbih_train.csv', # MIT data set in which we have dummy-coded the target variable
})

OutputFiles = dict({
    'test': data_output_path + 'mitbih_test_clean.csv',
    'train': data_output_path + 'mitbih_train_clean.csv',
})

In [3]:
def addColumnsToDataframe(df):
    """
    As the dataset is composed with 188 columns with the 188th columns as the category values,
    so we give the last column the name 'target', others named with 'c_182'
    """
    num_columns= df.shape[1]
    feature_col_name = ['c_' + str(i) for i in range(0, num_columns - 1)]
    df_columns = feature_col_name + ['target']
    df.columns = df_columns
    return df
def convertColumnAsInt(df, column):
    df[column] = pd.to_numeric(df[column], errors='coerce') # convert to numeric to handle NaN values
    df.dropna(subset=[column], inplace=True)  # drop the rows with NaN values
    df[column] = df[column].astype(int)  # convert to int
    return df

In [4]:
mitbih_train = pd.read_csv(RawFiles.get('train'), header=None ) 
mitbih_test = pd.read_csv(RawFiles.get('test'), header=None )

mitbih_train = addColumnsToDataframe(mitbih_train)
mitbih_train = convertColumnAsInt(mitbih_train, 'target')

mitbih_test = addColumnsToDataframe(mitbih_test)
mitbih_test = convertColumnAsInt(mitbih_test, 'target')

# target value and meanings
all_class_mapping = {
    0: 'Normal',
    1: 'Supraventricular',
    2: 'Ventricular',
    3: 'Fusion',
    4: 'Unclassifiable'
}
mitbih_train['target'] = mitbih_train['target'].map(all_class_mapping)
mitbih_test['target'] = mitbih_test['target'].map(all_class_mapping)
# Drop rows where 'target' is 'Unclassifiable beat'
mitbih_train = mitbih_train[mitbih_train['target'] != 'Unclassifiable']
mitbih_test = mitbih_test[mitbih_test['target'] != 'Unclassifiable']


# convert to binary classification Combine abnormal categories
mitbih_train['target'] = mitbih_train['target'].replace(['Supraventricular', 'Ventricular', 'Fusion'], 'abnormal')
mitbih_test['target'] = mitbih_test['target'].replace(['Supraventricular', 'Ventricular', 'Fusion'], 'abnormal')

# Encode the labels: normal as 0, abnormal as 1
mitbih_train['target'] = mitbih_train['target'].replace({'Normal': 0, 'abnormal': 1})
mitbih_test['target'] = mitbih_test['target'].replace({'Normal': 0, 'abnormal': 1})

#drop null value  
mitbih_train = mitbih_train.dropna(how='any')
mitbih_test = mitbih_test.dropna(how='any')




  mitbih_train['target'] = mitbih_train['target'].replace({'Normal': 0, 'abnormal': 1})
  mitbih_test['target'] = mitbih_test['target'].replace({'Normal': 0, 'abnormal': 1})


In [5]:
#save clean data to 
# mitbih_train_clean.csv  
# mitbih_test_clean.csv

mitbih_train.to_csv(OutputFiles.get('train'), index=False)
mitbih_test.to_csv(OutputFiles.get('test'), index=False)


In [6]:
from datetime import datetime
# Display the running time
print("Current time:", datetime.now())

Current time: 2024-11-05 10:57:23.113150
