
# Default Preprocess For MIT data 
The default preprocessing steps are the ones we settled on in our report. 
The inputs this notebook expects and the outputs it produces are listed below.

Processing **steps** for MIT dataset:   
    resample: SMOTE   
    rescaling: StandardScaler  


**Input** : the original data.   
mitbih_test.csv
mitbih_train.csv

**Output** : The cleaned data.   
mitbih_train_clean_default.csv
mitbih_test_clean_default.csv


In [1]:
import sys
import os
data_path = ''
data_output_path = ''
# Check if the environment is Google Colab
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install imbalanced-learn -q


    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    data_output_path = data_path
    
else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/raw/'
    data_output_path = '../data/processed/'


Running on local environment
Current working directory: /Users/pingyuan/Documents/codeself/heartbeat-analysis-ai/notebooks


In [2]:
# Verify installation and import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


# Locations of the raw input CSV files, keyed by split name.
# os.path.join only inserts a separator when the directory part lacks one,
# so a data_path configured without a trailing '/' still gives a valid path
# (plain string concatenation would glue the directory and file name together).
RawFiles = {
    'test': os.path.join(data_path, 'mitbih_test.csv'),
    'train': os.path.join(data_path, 'mitbih_train.csv'),
}

# Destinations of the cleaned (resampled and rescaled) CSV files,
# keyed by split name.
OutputFiles = {
    'test': os.path.join(data_output_path, 'mitbih_test_clean_default.csv'),
    'train': os.path.join(data_output_path, 'mitbih_train_clean_default.csv'),
}

In [3]:
def addColumnsToDataframe(df):
    """
    Label the columns of a header-less frame, in place.

    Every column except the last is named 'c_<position>' (counting from
    'c_0'); the last column is named 'target'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, carrying the new labels.
    """
    feature_count = df.shape[1] - 1
    df.columns = [f'c_{idx}' for idx in range(feature_count)] + ['target']
    return df

def convertColumnAsInt(df, column):
    """
    Cast a single column of ``df`` to integer dtype, in place.

    The category labels are read as floats; they are stored as integers so
    each class can be identified by an int.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; it is modified in place.
    column : str
        Label of the column to cast.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``column`` cast to int.
    """
    as_int = df[column].astype(int)
    df[column] = as_int
    return df


In [4]:

# Load the raw CSVs; header=None because the files are read without a header row.
mitbih_train = pd.read_csv(RawFiles.get('train'), header=None)
mitbih_test = pd.read_csv(RawFiles.get('test'), header=None)

mitbih_train = addColumnsToDataframe(mitbih_train)
mitbih_test = addColumnsToDataframe(mitbih_test)

# Drop rows with null values BEFORE the integer cast: astype(int) raises on
# NaN, so casting first would fail on exactly the rows this step is meant to
# remove. .copy() gives an independent frame for the in-place cast below.
mitbih_train = mitbih_train.dropna(how='any').copy()
mitbih_test = mitbih_test.dropna(how='any').copy()

mitbih_train = convertColumnAsInt(mitbih_train, 'target')
mitbih_test = convertColumnAsInt(mitbih_test, 'target')

# target value and meanings
class_mapping = {
    0: 'Normal',
    1: 'Supraventricular',
    2: 'Ventricular',
    3: 'Fusion',
    4: 'Unclassifiable'
}

# Separate features and labels of each split before resampling
y_train = mitbih_train['target']
X_train = mitbih_train.drop(columns=['target'])
y_test = mitbih_test['target']
X_test = mitbih_test.drop(columns=['target'])

# Resample the training split with SMOTE before rescaling; the test split
# is not resampled. random_state is fixed so the result is reproducible.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Rescale with StandardScaler: fit on the resampled training features only,
# then apply the same fitted transform to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)




In [19]:
# Convert the resampled/rescaled arrays back to DataFrames so they can be
# concatenated with their labels. Both frames get a default 0..n-1 index.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled,
    columns=[f'c_{i}' for i in range(X_train_scaled.shape[1])],
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled,
    columns=[f'c_{i}' for i in range(X_test_scaled.shape[1])],
)

# pd.concat(axis=1) aligns rows on index labels, not on position. The train
# labels therefore get a fresh 0..n-1 index, exactly like the test labels
# below, so the join does not depend on whatever index the resampler returned.
y_train_resampled_df = (
    pd.Series(y_train_resampled, name='target')
    .reset_index(drop=True)
    .to_frame()
)

# Features and labels are paired row by row, so their lengths must agree.
assert len(X_train_scaled_df) == len(y_train_resampled_df), \
    "train features and labels differ in length"
assert len(X_test_scaled_df) == len(y_test), \
    "test features and labels differ in length"

# concat X_train, y_train / X_test, y_test
mitbih_train_clean_default = pd.concat(
    [
        X_train_scaled_df,
        y_train_resampled_df
    ], axis=1)

mitbih_test_clean_default = pd.concat(
    [
        X_test_scaled_df,
        y_test.reset_index(drop=True)
    ], axis=1)


In [20]:
# Save the clean data to
# mitbih_train_clean_default.csv
# mitbih_test_clean_default.csv
for split_name, clean_frame in (
    ('train', mitbih_train_clean_default),
    ('test', mitbih_test_clean_default),
):
    output_file = OutputFiles.get(split_name)
    # to_csv does not create missing directories, so make sure the target
    # directory exists. An empty dirname means the current directory, which
    # needs no creation (and os.makedirs('') would raise).
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # index=False: the row index carries no information, keep it out of the file.
    clean_frame.to_csv(output_file, index=False)