In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Project directory layout (relative to the notebook's working directory):
# raw inputs live under RAW_DATA_PATH, cleaned outputs under PROCESSED_DATA_PATH.
RAW_DATA_PATH, PROCESSED_DATA_PATH = 'data/raw', 'data/processed'

# Create the output folder up front; exist_ok keeps re-running this cell harmless.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

In [4]:
def load_dataset(filename, raw_dir=None):
    """Load a dataset from the raw data folder into a DataFrame.

    The reader is chosen from the file extension (case-insensitive):
    ``.csv`` files go through ``pd.read_csv`` and ``.xlsx`` / ``.xls``
    workbooks go through ``pd.read_excel``.

    Parameters
    ----------
    filename : str
        File name relative to the raw data folder.
    raw_dir : str, optional
        Folder to read from. When omitted, the module-level
        ``RAW_DATA_PATH`` is used.

    Returns
    -------
    Whatever the selected pandas reader returns (a DataFrame with the
    default arguments used here).

    Raises
    ------
    ValueError
        If the extension is neither CSV nor Excel.
    """
    lowered = filename.lower()
    if lowered.endswith('.csv'):
        reader = pd.read_csv
    elif lowered.endswith(('.xlsx', '.xls')):
        # Excel workbooks are not delimited text, so read_csv cannot parse
        # them; read_excel is the matching reader.
        reader = pd.read_excel
    else:
        raise ValueError("Unsupported file format")

    # Resolve the folder only after the format check so an unsupported
    # extension is reported without touching the filesystem.
    base_dir = RAW_DATA_PATH if raw_dir is None else raw_dir
    return reader(os.path.join(base_dir, filename))

In [None]:
def save_processed_data(df, filename):
    """Write ``df`` as a CSV file inside the processed-data folder.

    The target is ``PROCESSED_DATA_PATH`` joined with ``filename``; the
    index is not written to the file.
    """
    destination = os.path.join(PROCESSED_DATA_PATH, filename)
    df.to_csv(destination, index=False)


In [None]:
def clean_diabetes_data(df):
    """Clean the diabetes dataset and return the result as a new DataFrame.

    Steps, in order:

    1. Lower-case all column names (done first so the lookups below also
       work for headers such as ``Age`` or ``BMI``).
    2. Cast ``age`` to ``int`` (fractional ages are truncated toward zero).
    3. Normalise the ``smoking_history`` category labels; values without a
       mapping are left as they are.
    4. Clip ``bmi`` to the range [15, 50].
    5. Add an ``age_group`` categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``age``, ``smoking_history`` and ``bmi`` columns
        (matched case-insensitively). Column labels must be strings.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; the input frame is not modified.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        Raised by ``astype(int)`` if ``age`` contains missing values.
    """
    # Work on a copy: mutating the caller's frame makes notebook cells
    # depend on how many times they were run.
    cleaned = df.copy()

    # Standardize column names before any column is accessed by name.
    cleaned.columns = [col.lower() for col in cleaned.columns]

    # Ages should be whole numbers.
    cleaned['age'] = cleaned['age'].astype(int)

    # Clean smoking_history categories.
    cleaned['smoking_history'] = cleaned['smoking_history'].replace({
        'No Info': 'Unknown',
        'never': 'Never',
        'current': 'Current',
        'former': 'Former',
        'not current': 'Not current',
        'ever': 'Ever'
    })

    # Handle potential outliers in BMI (clip extreme values).
    cleaned['bmi'] = cleaned['bmi'].clip(lower=15, upper=50)

    # Age groups. Intervals are right-closed, i.e. [0, 18], (18, 30], ...
    # include_lowest keeps age 0 (infants truncated from e.g. 0.08) in the
    # first group, and the open-ended last edge keeps ages above 80 in '60+'
    # instead of turning them into NaN.
    bins = [0, 18, 30, 45, 60, np.inf]
    labels = ['0-18', '19-30', '31-45', '46-60', '60+']
    cleaned['age_group'] = pd.cut(
        cleaned['age'], bins=bins, labels=labels, include_lowest=True
    )

    return cleaned