In [126]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [127]:
def save_processed_data(df, filename):
    """Save cleaned dataset to processed folder"""
    # Ensure the processed directory exists
    processed_dir = '/workspace/COMP-3608---PROJECT/data/processed'
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    # Save the DataFrame to a CSV file in the specified directory
    filepath = os.path.join(processed_dir, filename)
    df.to_csv(filepath, index=False)
    print(f"File saved to: {filepath}")


In [128]:
def clean_diabetes_data(df):
    """Custom cleaning for diabetes dataset based on EDA findings"""
    # Convert age from float to int (ages should be whole numbers)
    df['age'] = df['age'].astype(int)
    
    # Clean smoking_history categories
    df['smoking_history'] = df['smoking_history'].replace({
        'No Info': 'Unknown',
        'never': 'Never',
        'current': 'Current',
        'former': 'Former',
        'not current': 'Not current',
        'ever': 'Ever'
    })
    
    # Handle potential outliers in BMI (clip extreme values)
    df['bmi'] = df['bmi'].clip(lower=15, upper=50)
    
    # Create age groups feature
    bins = [0, 18, 30, 45, 60, 80]
    labels = ['0-18', '19-30', '31-45', '46-60', '60+']
    df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)
    
    # Standardize column names
    df.columns = [col.lower() for col in df.columns]
    
    return df

In [129]:
def clean_stroke_data(df):
    """Custom cleaning for stroke dataset based on EDA findings"""
    
    # Drop ID column as it's not useful for modeling
    df = df.drop('id', axis=1)
    
    # Handle missing BMI values (201 missing)
    # Impute with median by age group
    df['age_group'] = pd.cut(df['age'], 
                             bins=[0, 18, 30, 45, 60, 100],
                             labels=['0-18', '19-30', '31-45', '46-60', '60+'])
    
    # Impute missing BMI values by grouping with median BMI per age group
    # Use `transform` instead of `apply` to fill missing BMI more efficiently
    df['bmi'] = df.groupby('age_group')['bmi'].transform(lambda x: x.fillna(x.median()))
    
    # Clean smoking_status categories
    df['smoking_status'] = df['smoking_status'].replace({
        'formerly smoked': 'Former',
        'never smoked': 'Never',
        'smokes': 'Current',
        'Unknown': 'Unknown'
    })
    
    # Convert categorical variables to lowercase
    categorical_cols = ['gender', 'ever_married', 'work_type', 
                        'Residence_type', 'smoking_status']
    for col in categorical_cols:
        df[col] = df[col].str.lower()
    
    # Ensure all columns are correctly typed after transformations
    df['age_group'] = df['age_group'].astype('category')  # Convert 'age_group' to categorical
    df['smoking_status'] = df['smoking_status'].astype('category')  # Convert 'smoking_status' to categorical
    
    return df


In [130]:
def clean_heart_data(df):
    """Custom cleaning for heart disease dataset based on EDA findings"""
    # Drop columns with excessive missing values
    df = df.drop(['slope', 'ca', 'thal'], axis=1)
    
    # Handle missing values in other columns
    # For numerical columns, impute with median
    num_cols = ['trestbps', 'chol', 'thalch', 'oldpeak']
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    
    # For categorical columns, impute with mode
    cat_cols = ['fbs', 'restecg', 'exang']
    for col in cat_cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    
    # Clean categorical variables
    df['cp'] = df['cp'].str.replace('-', ' ').str.lower()
    df['restecg'] = df['restecg'].str.lower()
    df['exang'] = df['exang'].astype(str).str.lower()
    
    # Handle biological impossibilities (0 values for cholesterol and blood pressure)
    df['chol'] = df['chol'].replace(0, df['chol'].median())
    df['trestbps'] = df['trestbps'].replace(0, df['trestbps'].median())
    
    # Create age groups
    bins = [0, 30, 40, 50, 60, 70, 100]
    labels = ['<30', '30-39', '40-49', '50-59', '60-69', '70+']
    df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)
    
    return df

In [131]:
# Process Diabetes Dataset
diabetes_df = pd.read_csv('/workspace/COMP-3608---PROJECT/data/raw/diabetes_prediction_dataset.csv')
diabetes_clean = clean_diabetes_data(diabetes_df)
save_processed_data(diabetes_clean, 'processed_diabetes.csv')


File saved to: /workspace/COMP-3608---PROJECT/data/processed/processed_diabetes.csv


In [132]:
# Process Heart Disease Dataset
heart_df = pd.read_csv('/workspace/COMP-3608---PROJECT/data/raw/heart_disease_uci.csv')
heart_clean = clean_heart_data(heart_df)
save_processed_data(heart_clean, 'processed_heart_disease.csv')

File saved to: /workspace/COMP-3608---PROJECT/data/processed/processed_heart_disease.csv


  df[col] = df[col].fillna(df[col].mode()[0])


In [133]:
# Process Stroke Dataset
stroke_df = load_dataset('/workspace/COMP-3608---PROJECT/data/raw/healthcare-dataset-stroke-data.csv')
stroke_clean = clean_stroke_data(stroke_df)
save_processed_data(stroke_clean, 'processed_stroke.csv')

File saved to: /workspace/COMP-3608---PROJECT/data/processed/processed_stroke.csv


  df['bmi'] = df.groupby('age_group')['bmi'].transform(lambda x: x.fillna(x.median()))
