In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Set up paths
# Both locations are relative paths, so they resolve against the current
# working directory of the kernel running this notebook.
RAW_DATA_PATH = 'data/raw'  # folder load_dataset() reads input files from
PROCESSED_DATA_PATH = 'data/processed'  # folder save_processed_data() writes CSVs to
# Create the output folder up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

In [4]:
def load_dataset(filename):
    """Load a dataset from the raw data folder into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of a file located inside ``RAW_DATA_PATH``. The reader is
        selected from the file extension (``.csv`` or ``.xlsx``).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    filepath = os.path.join(RAW_DATA_PATH, filename)
    if filename.endswith('.csv'):
        return pd.read_csv(filepath)
    elif filename.endswith('.xlsx'):
        # Excel workbooks are not text files, so read_csv cannot parse them.
        # read_excel needs an Excel engine (e.g. openpyxl) to be installed.
        return pd.read_excel(filepath)
    else:
        raise ValueError("Unsupported file format")

In [None]:
def save_processed_data(df, filename):
    """Write ``df`` as a CSV named ``filename`` inside PROCESSED_DATA_PATH.

    The row index is left out of the file.
    """
    df.to_csv(os.path.join(PROCESSED_DATA_PATH, filename), index=False)


In [None]:
def clean_diabetes_data(df):
    """Custom cleaning for diabetes dataset based on EDA findings.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with (case-insensitive) columns ``age``,
        ``smoking_history`` and ``bmi``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy with lowercase column names, integer ``age``,
        relabelled ``smoking_history``, ``bmi`` clipped to [15, 50] and a
        new categorical ``age_group`` column.
    """
    # Work on a copy so re-running the cell never alters the caller's frame.
    df = df.copy()

    # Standardize column names first, so the lookups below work regardless
    # of the casing used in the source file.
    df.columns = [col.lower() for col in df.columns]

    # Convert age from float to int (ages should be whole numbers).
    # Fractional ages are truncated, so infants under one year become 0.
    df['age'] = df['age'].astype(int)

    # Clean smoking_history categories
    df['smoking_history'] = df['smoking_history'].replace({
        'No Info': 'Unknown',
        'never': 'Never',
        'current': 'Current',
        'former': 'Former',
        'not current': 'Not current',
        'ever': 'Ever'
    })

    # Handle potential outliers in BMI (clip extreme values)
    df['bmi'] = df['bmi'].clip(lower=15, upper=50)

    # Create age groups feature.
    # include_lowest keeps age 0 in the first bin (intervals are otherwise
    # open on the left), and the open-ended last edge makes '60+' cover
    # every age above 60 instead of stopping at 80.
    bins = [0, 18, 30, 45, 60, float('inf')]
    labels = ['0-18', '19-30', '31-45', '46-60', '60+']
    df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels,
                             include_lowest=True)

    return df

In [None]:
def clean_stroke_data(df):
    """Custom cleaning for stroke dataset based on EDA findings.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with (case-insensitive) columns ``id``, ``age``, ``bmi``,
        ``gender``, ``ever_married``, ``work_type``, ``residence_type`` and
        ``smoking_status``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy with lowercase column names, no ``id`` column, a new
        categorical ``age_group`` column, missing ``bmi`` values imputed and
        the categorical text columns lowercased.
    """
    # Work on a copy and normalise column names up front, so the lowercase
    # lookups below succeed even when the file uses mixed-case headers.
    df = df.copy()
    df.columns = [col.lower() for col in df.columns]

    # Drop ID column as it's not useful for modeling
    df = df.drop(columns='id')

    # include_lowest keeps age 0 in the first bin; the open-ended last edge
    # makes '60+' cover every age above 60.
    df['age_group'] = pd.cut(df['age'],
                             bins=[0, 18, 30, 45, 60, float('inf')],
                             labels=['0-18', '19-30', '31-45', '46-60', '60+'],
                             include_lowest=True)

    # Impute missing BMI with the median of the row's age group.
    # transform() returns a Series aligned to df's own index, so existing
    # BMI values are never overwritten and rows without an age group are
    # kept. observed=True restricts the grouping to categories present.
    overall_median = df['bmi'].median()
    group_median = df.groupby('age_group', observed=True)['bmi'].transform('median')
    df['bmi'] = df['bmi'].fillna(group_median)
    # Fall back to the overall median where no group median is available
    # (row has no age group, or its whole group is missing BMI).
    df['bmi'] = df['bmi'].fillna(overall_median)

    # Clean smoking_status categories
    df['smoking_status'] = df['smoking_status'].replace({
        'formerly smoked': 'Former',
        'never smoked': 'Never',
        'smokes': 'Current',
        'Unknown': 'Unknown'
    })

    # Convert categorical variables to lowercase
    categorical_cols = ['gender', 'ever_married', 'work_type',
                        'residence_type', 'smoking_status']
    for col in categorical_cols:
        df[col] = df[col].str.lower()

    return df

In [None]:
def clean_heart_data(df):
    """Custom cleaning for heart disease dataset based on EDA findings.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing ``age``, ``cp``, ``trestbps``, ``chol``,
        ``fbs``, ``restecg``, ``thalch``, ``exang``, ``oldpeak``, ``slope``,
        ``ca`` and ``thal``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``slope``/``ca``/``thal``, with missing values
        imputed (median for numeric, mode for categorical columns), text
        categories lowercased and a categorical ``age_group`` column added.
    """
    # Drop columns with excessive missing values (drop returns a new frame,
    # so the assignments below do not touch the caller's data).
    df = df.drop(['slope', 'ca', 'thal'], axis=1)

    # Handle biological impossibilities: a cholesterol or resting blood
    # pressure of 0 is a missing reading. Mark it as missing BEFORE
    # computing medians so the zeros do not drag the imputed value down.
    for col in ['chol', 'trestbps']:
        df[col] = df[col].mask(df[col] == 0)

    # For numerical columns, impute with median
    num_cols = ['trestbps', 'chol', 'thalch', 'oldpeak']
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

    # For categorical columns, impute with mode
    cat_cols = ['fbs', 'restecg', 'exang']
    for col in cat_cols:
        modes = df[col].mode()
        # mode() is empty when the whole column is missing; nothing to impute.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Clean categorical variables
    df['cp'] = df['cp'].str.replace('-', ' ', regex=False).str.lower()
    df['restecg'] = df['restecg'].str.lower()
    df['exang'] = df['exang'].astype(str).str.lower()

    # Create age groups. The labels describe left-closed intervals
    # ('30-39' starts at 30), so bin with right=False; the open-ended last
    # edge makes '70+' cover every age from 70 upwards.
    bins = [0, 30, 40, 50, 60, 70, float('inf')]
    labels = ['<30', '30-39', '40-49', '50-59', '60-69', '70+']
    df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

    return df