In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
import warnings
warnings.filterwarnings('ignore')
import joblib
import h5py
import json
import os
import time

# Step 0, chek dataset availability

def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def check_data_directory():
    data_dir = os.path.join('dataset', 'health_data1')
    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        print("Available directories:", os.listdir('.'))
        return False
    
    print("\nAvailable files in directory:")
    for file in os.listdir(data_dir):
        print(f"- {file}")
    
    return True

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.XPT'):
            return pd.read_sas(file_path)
        else:
            print(f"Unsupported file format: {file_path}")
            return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

set_project_directory()

if check_data_directory():
    data_directory = os.path.join('dataset', 'health_data1')
    
    for filename in os.listdir(data_directory):
        file_path = os.path.join(data_directory, filename)
        print(f"\nProcessing: {filename}")
        data = load_data(file_path)
        if data is not None:
            print(f"Successfully loaded {filename}")
            print("First few rows:")
            print(data.head())
else:
    print("Please check your directory structure and file locations")

folder_path = 'dataset/health_data1/'

try:
    print("Daftar file di dalam folder:", os.listdir(folder_path))
    required_files = [
        'anemia-dataset.csv',
        'cholesterol-dataset.csv',
        'chronic-kidney-disease-dataset.csv',
        'diabetes-dataset.csv',
        'heart-disease-dataset.csv',
        'hypertension-dataset.csv',
        'metabolic-syndrome-dataset.csv',
        'nafld1-dataset.csv',
        'obesity-dataset.csv',
        'stroke-dataset.csv'
    ]
    
    missing_files = [f for f in required_files if not os.path.isfile(os.path.join(folder_path, f))]
    
    if missing_files:
        print(f"File yang hilang: {', '.join(missing_files)}")
    else:
        print("Semua file dataset ditemukan.")

except FileNotFoundError:
    print(f"Folder tidak ditemukan: {folder_path}")

Working directory set to: c:\Users\Dana\Documents\Kuliah\Bangkit\Capstone-C242-PS384_Project01

Available files in directory:
- anemia-dataset.csv
- cholesterol-dataset.csv
- chronic-kidney-disease-dataset.csv
- combined_dataset.csv
- diabetes-dataset.csv
- health_data1_combined.csv
- heart-disease-dataset.csv
- hypertension-dataset.csv
- metabolic-syndrome-dataset.csv
- nafld1-dataset.csv
- nafld2-dataset.csv
- nwtco-dataset.csv
- obesity-dataset.csv
- stroke-dataset.csv

Processing: anemia-dataset.csv
Successfully loaded anemia-dataset.csv
First few rows:
   Gender  Hemoglobin   MCH  MCHC   MCV  Result
0       1        14.9  22.7  29.1  83.7       0
1       0        15.9  25.4  28.3  72.0       0
2       0         9.0  21.5  29.6  71.2       1
3       0        14.9  16.0  31.4  87.5       0
4       1        14.7  22.0  28.2  99.5       0

Processing: cholesterol-dataset.csv
Successfully loaded cholesterol-dataset.csv
First few rows:
   age  sex  cp  trestbps  fbs  restecg  thalach  e

In [2]:
# Step 1, preprocess available data and merge it in the end
columns = [
    "height", "weight", "gender", "age", "bp", "bc", "bg", "bmi", "sodium", 
    "fat", "protein", "carbs", "anemia", "cholesterol", "ckd", "diabetes", 
    "heart", "hypertension", "ms", "nafld", "obesity", "stroke"
]

def create_data_dict(**kwargs):
    base_dict = {
        "height": np.nan, "weight": np.nan, "gender": np.nan, "age": np.nan,
        "bp": np.nan, "bc": np.nan, "bg": np.nan, "bmi": np.nan,
        "sodium": np.nan, "fat": np.nan, "protein": np.nan, "carbs": np.nan,
        "anemia": 0, "cholesterol": 0, "ckd": 0, "diabetes": 0,
        "heart": 0, "hypertension": 0, "ms": 0, "nafld": 0, "obesity": 0, "stroke": 0
    }
    base_dict.update({k: v for k, v in kwargs.items() if v is not None})
    return base_dict

all_data = []

# 1. Anemia dataset
anemia_data = pd.read_csv(os.path.join(folder_path, "anemia-dataset.csv"))
all_data.extend([
    create_data_dict(
        gender=1 if row["Gender"] == "Male" else 0,
        bg=round(row["Hemoglobin"] * 7, 1),
        anemia=row["Result"]
    )
    for _, row in anemia_data.iterrows()
])

# 2. Cholesterol dataset
chol_data = pd.read_csv(os.path.join(folder_path, "cholesterol-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=row["sex"],
        bp=row["trestbps"],
        bc=row["chol"],
        bg=120 if row["fbs"] == 1 else 100,
        cholesterol=1 if row["chol"] > 240 else 0
    )
    for _, row in chol_data.iterrows()
])

# 3. Chronic Kidney Disease dataset
ckd_data = pd.read_csv(os.path.join(folder_path, "chronic-kidney-disease-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        bp=row["bp"],
        bg=row["bgr"],
        sodium=row["sod"],
        anemia=1 if row["ane"] == "yes" else 0,
        ckd=1 if row["classification"] == "ckd" else 0,
        diabetes=1 if row["dm"] == "yes" else 0,
        heart=1 if row["cad"] == "yes" else 0,
        hypertension=1 if row["htn"] == "yes" else 0
    )
    for _, row in ckd_data.iterrows()
])

# 4. Diabetes dataset
diabetes_data = pd.read_csv(os.path.join(folder_path, "diabetes-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Sex"] == "Male" else 0,
        bmi=row["BMI"],
        cholesterol=row["HighChol"],
        diabetes=1 if row["Diabetes"] == 1 else 0,
        hypertension=row["HighBP"]
    )
    for _, row in diabetes_data.iterrows()
])

# 5. Heart Disease dataset
heart_data = pd.read_csv(os.path.join(folder_path, "heart-disease-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == 1 else 0,
        bp=row["trestbps"],
        bc=row["chol"],
        bg=120 if row["fbs"] == 1 else 100,
        heart=1 if row["target"] == 1 else 0
    )
    for _, row in heart_data.iterrows()
])

# 6. Hypertension dataset
hypertension_data = pd.read_csv(os.path.join(folder_path, "hypertension-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == 1 else 0,
        bp=row["trestbps"],
        bc=row["chol"],
        hypertension=1 if row["target"] == 1 else 0
    )
    for _, row in hypertension_data.iterrows()
])

# 7. Metabolic Syndrome dataset
ms_data = pd.read_csv(os.path.join(folder_path, "metabolic-syndrome-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Sex"] == "Male" else 0,
        bg=row["BloodGlucose"],
        bmi=row["BMI"],
        ms=1 if row["MetabolicSyndrome"] == 1 else 0
    )
    for _, row in ms_data.iterrows()
])

# 8. NAFLD dataset
nafld_data = pd.read_csv(os.path.join(folder_path, "nafld1-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=row["male"],
        weight=row["weight"],
        height=row["height"],
        bmi=round(row["bmi"],1),
        nafld=row["status"]
    )
    for _, row in nafld_data.iterrows()
])

# 9. Obesity dataset
obesity_data = pd.read_csv(os.path.join(folder_path, "obesity-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Gender"] == "Male" else 0,
        weight=row["Weight"],
        height=row["Height"],
        bmi=row["BMI"],
        obesity=1 if row["Label"] == "Obesity" else 0
    )
    for _, row in obesity_data.iterrows()
])

# 10. Stroke dataset
stroke_data = pd.read_csv(os.path.join(folder_path, "stroke-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == "Male" else 0,
        bc=row["heart_disease"],
        bmi=row["bmi"],
        heart=row["heart_disease"],
        hypertension=row["hypertension"],
        stroke=row["stroke"]
    )
    for _, row in stroke_data.iterrows()
])

combined_data = pd.DataFrame(all_data)

output_path = os.path.join(folder_path, "combined_dataset.csv")
combined_data.to_csv(output_path, index=False)

print(f"Combined dataset succesfully savd on {output_path}")
print("\ncombined dataset inform:")
print(combined_data.info())
print("\ncombined dataset stats:")
print(combined_data.describe())

Combined dataset succesfully savd on dataset/health_data1/combined_dataset.csv

combined dataset inform:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160892 entries, 0 to 160891
Data columns (total 22 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   height        14489 non-null   float64
 1   weight        12871 non-null   float64
 2   gender        160492 non-null  float64
 3   age           159462 non-null  float64
 4   bp            27799 non-null   float64
 5   bc            68321 non-null   float64
 6   bg            5506 non-null    float64
 7   bmi           126673 non-null  float64
 8   sodium        313 non-null     float64
 9   fat           0 non-null       float64
 10  protein       0 non-null       float64
 11  carbs         0 non-null       float64
 12  anemia        160892 non-null  float64
 13  cholesterol   160892 non-null  float64
 14  ckd           160892 non-null  int64  
 15  diabetes      160892 non-null  

In [3]:
def add_engineered_features(X):
    # Menambah fitur interaksi
    X_new = X.copy()
    
    # BMI categories
    X_new['bmi_category'] = pd.cut(X_new['bmi'], 
                                  bins=[0, 18.5, 25, 30, 100],
                                  labels=[0, 1, 2, 3])
    
    # Age categories
    X_new['age_category'] = pd.cut(X_new['age'], 
                                  bins=[0, 30, 45, 60, 100],
                                  labels=[0, 1, 2, 3])
    
    # Blood pressure categories
    X_new['bp_category'] = pd.cut(X_new['blood_pressure'], 
                                 bins=[0, 120, 140, 160, 300],
                                 labels=[0, 1, 2, 3])
    
    # Interaksi antara fitur
    X_new['bmi_age'] = X_new['bmi'] * X_new['age']
    X_new['bp_age'] = X_new['blood_pressure'] * X_new['age']
    X_new['bmi_bp'] = X_new['bmi'] * X_new['blood_pressure']
    
    return X_new

In [4]:
# Step 2, load newly created combinedd dataset (combined_dataset.csv)
def load_and_process_data(file_path='dataset/health_data1/combined_dataset.csv'):
    print("Loading and processing data...")
    
    # Load dataset
    df = pd.read_csv(file_path)
    print(f"Dataset loaded with shape: {df.shape}")
    
    # Define features and targets
    available_features = ['height', 'weight', 'gender', 'age', 'bp', 'bc', 'bg', 'bmi', 'sodium']
    target_variables = ['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart',
                       'hypertension', 'ms', 'nafld', 'obesity', 'stroke']
    
    print("Processing features...")
    df_processed = df.copy()
    
    # Handle missing values first
    print("Handling missing values...")
    for col in available_features:
        if col not in df_processed.columns:
            df_processed[col] = np.nan
    
    # Separate numeric and categorical features
    numeric_features = df_processed[available_features].select_dtypes(include=['float64', 'int64']).columns
    categorical_features = df_processed[available_features].select_dtypes(include=['object']).columns
    
    print(f"Found {len(numeric_features)} numeric features and {len(categorical_features)} categorical features")
    
    # Initialize imputers
    numeric_imputer = IterativeImputer(
        random_state=42,
        max_iter=100,
        sample_posterior=True
    )
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    # Impute numeric features
    print("Imputing numeric features...")
    df_processed[numeric_features] = numeric_imputer.fit_transform(df_processed[numeric_features])
    
    # Impute categorical features if any
    if len(categorical_features) > 0:
        print("Imputing categorical features...")
        df_processed[categorical_features] = categorical_imputer.fit_transform(df_processed[categorical_features])
    
    print("Adding engineered features...")
    # Add engineered features
    df_processed['bmi_category'] = pd.cut(
        df_processed['bmi'],
        bins=[float('-inf'), 18.5, 25, 30, float('inf')],
        labels=[0, 1, 2, 3]
    )
    
    df_processed['age_category'] = pd.cut(
        df_processed['age'],
        bins=[float('-inf'), 30, 45, 60, float('inf')],
        labels=[0, 1, 2, 3]
    )
    
    df_processed['bp_category'] = pd.cut(
        df_processed['bp'],
        bins=[float('-inf'), 120, 140, 160, float('inf')],
        labels=[0, 1, 2, 3]
    )
    
    # Create interaction features
    df_processed['bmi_age'] = df_processed['bmi'] * df_processed['age']
    df_processed['bp_age'] = df_processed['bp'] * df_processed['age']
    df_processed['bmi_bp'] = df_processed['bmi'] * df_processed['bp']
    
    # Normalize features
    print("Normalizing features...")
    scaler = StandardScaler()
    df_processed[numeric_features] = scaler.fit_transform(df_processed[numeric_features])
    
    # Prepare final features
    final_features = (
        available_features + 
        ['bmi_category', 'age_category', 'bp_category', 
         'bmi_age', 'bp_age', 'bmi_bp']
    )
    
    # Handle target variables
    print("Processing target variables...")
    df_processed[target_variables] = df[target_variables].fillna(0)
    
    print("Data processing completed!")
    return df_processed[final_features], df_processed[target_variables]

In [5]:
# Step 3, create the calculation for the derived data from main user input data
def calculate_derived_features(height, weight, gender, age, blood_pressure, cholesterol, blood_glucose):
    # bmi calculation
    height_m = height / 100
    bmi = weight / (height_m ** 2)
    
    # sodium calculation
    sodium = weight * 20
    
    # fat based on gender calculatonn
    fat = weight * (0.15 if gender == 1 else 0.25)
    
    # chols level calc
    cholesterol_level = (bmi * 2) + (age * 0.15) + (blood_pressure * 0.05) + (blood_glucose * 0.02) + 150
    
    # protein calc
    protein = weight * 0.9
    
    # carbo calc
    carbs = weight * 3
    
    return {
        'bmi': bmi,
        'sodium': sodium,
        'fat': fat,
        'cholesterol_level': cholesterol_level,
        'protein': protein,
        'carbs': carbs
    }

In [6]:
# Step 4, model training set + save
def train_disease_models(X, y):
    print("\n=== Starting Model Training Process ===")
    models = {}
    scaler = StandardScaler()
    
    print("1. Initializing StandardScaler...")
    X = np.array(X)
    X_scaled = scaler.fit_transform(X)
    print("   ✓ Data scaling completed")
    
    total_diseases = len(y.columns)
    print(f"\nTotal diseases to process: {total_diseases}")
    
    # Parameter configurations remain the same
    base_params = {
        'n_estimators': 500,
        'max_depth': 20,
        'min_samples_split': 4,
        'min_samples_leaf': 2,
        'random_state': 42,
        'class_weight': 'balanced_subsample',
        'n_jobs': -1
    }
    
    # Disease specific parameters remain the same
    disease_params = {
        'heart': {
            'n_estimators': 1000,
            'max_depth': 25,
            'min_samples_split': 3,
            'class_weight': 'balanced_subsample'
        },
        'hypertension': {
            'n_estimators': 800,
            'max_depth': 22,
            'min_samples_split': 3,
            'class_weight': 'balanced_subsample'
        },
        'diabetes': {
            'n_estimators': 800,
            'max_depth': 22,
            'min_samples_split': 3,
            'class_weight': 'balanced_subsample'
        },
        'cholesterol': {
            'n_estimators': 800,
            'max_depth': 22,
            'min_samples_split': 3,
            'class_weight': 'balanced_subsample'
        }
    }
    
    for idx, disease in enumerate(y.columns, 1):
        print(f"\n[{idx}/{total_diseases}] Processing {disease.upper()} model")
        start_time = time.time()
        
        print(f"   • Getting target values for {disease}")
        y_disease = y[disease].values
        
        if disease in ['heart', 'hypertension', 'diabetes', 'cholesterol']:
            print("   • Applying SMOTE + Tomek Links for balanced sampling")
            smote = SMOTE(random_state=42, sampling_strategy='auto')
            tomek = TomekLinks(sampling_strategy='auto')
            
            print("     - Running SMOTE oversampling...")
            X_res, y_res = smote.fit_resample(X_scaled, y_disease)
            print(f"     ✓ Data resampled from {len(y_disease)} to {len(y_res)} samples")
            
            print("     - Running Tomek Links undersampling...")
            X_res, y_res = tomek.fit_resample(X_res, y_res)
            print(f"     ✓ Final sample size: {len(y_res)}")
        else:
            X_res, y_res = X_scaled, y_disease
        
        print("   • Setting up model parameters")
        current_params = base_params.copy()
        if disease in disease_params:
            current_params.update(disease_params[disease])
            print("     ✓ Using disease-specific parameters")
        
        if disease in ['heart', 'hypertension', 'diabetes', 'cholesterol']:
            print("   • Running GridSearchCV for parameter optimization")
            param_grid = {
                'n_estimators': [current_params['n_estimators'], current_params['n_estimators']+200],
                'max_depth': [current_params['max_depth']-2, current_params['max_depth'], current_params['max_depth']+2],
                'min_samples_split': [2, 3, 4],
                'min_samples_leaf': [1, 2, 3]
            }
            
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            rf = RandomForestClassifier(random_state=42)
            grid_search = GridSearchCV(rf, param_grid, cv=cv, scoring='f1', n_jobs=-1)
            
            print("     - Training model with cross-validation...")
            grid_search.fit(X_res, y_res)
            model = grid_search.best_estimator_
            print(f"     ✓ Best parameters found: {grid_search.best_params_}")
        else:
            print("   • Training RandomForest model...")
            model = RandomForestClassifier(**current_params)
            model.fit(X_res, y_res)
        
        elapsed_time = time.time() - start_time
        print(f"   ✓ {disease.upper()} model completed in {elapsed_time:.2f} seconds")
        
        models[disease] = model
    
    print("\n=== Model Training Completed ===")
    print(f"Total time elapsed: {time.time() - start_time:.2f} seconds")
    return models, scaler

# def save_models(models, scaler, save_dir='models'):
#     if not os.path.exists(save_dir):
#         os.makedirs(save_dir)
    
#     for disease, model in models.items():
#         model_path = os.path.join(save_dir, f'{disease}_model.joblib')
#         joblib.dump(model, model_path)
    
#     scaler_path = os.path.join(save_dir, 'scaler.joblib')
#     joblib.dump(scaler, scaler_path)
#     print(f"Models and scaler saved in {save_dir}/")

def save_models(models, scaler, save_dir='models'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    models_path = os.path.join(save_dir, 'disease-prediction-model.joblib')
    joblib.dump(models, models_path)

    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    joblib.dump(scaler, scaler_path)
    print(f"Models saved in {models_path} and scaler saved in {scaler_path}/")


In [7]:
def evaluate_model_accuracy(models, X, y):
    print("\n=== Starting Model Evaluation ===")
    print("accuracy model eval:\n")
    print("{:<15} {:<10} {:<10} {:<10} {:<10}".format(
        "Disease", "Accurancy", "Precision", "Recall", "F1-Score"
    ))
    print("-" * 55)
    
    overall_metrics = {
        'accuracy': 0,
        'precision': 0,
        'recall': 0,
        'f1': 0
    }
    
    if isinstance(X, pd.DataFrame):
        X = X.values
    
    total_models = len(models)
    print(f"Total models to evaluate: {total_models}")
    
    for idx, (disease, model) in enumerate(models.items(), 1):
        try:
            print(f"\n[{idx}/{total_models}] Evaluating {disease.upper()} model...")
            start_time = time.time()
            
            y_pred = model.predict(X)
            y_true = y[disease]
            
            accuracy = accuracy_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
            
            overall_metrics['accuracy'] += accuracy
            overall_metrics['precision'] += precision
            overall_metrics['recall'] += recall
            overall_metrics['f1'] += f1
            
            print("{:<15} {:<10.2f} {:<10.2f} {:<10.2f} {:<10.2f}".format(
                disease,
                accuracy * 100,
                precision * 100,
                recall * 100,
                f1 * 100
            ))
            
            elapsed_time = time.time() - start_time
            print(f"✓ Evaluation completed in {elapsed_time:.2f} seconds")
            
        except Exception as e:
            print(f"✗ Error evaluating {disease} model: {str(e)}")
    
    n_models = len(models)
    print("\n=== Final Results ===")
    print("Rata-rata Metrik:")
    print("Akurasi   : {:.2f}%".format(overall_metrics['accuracy'] / n_models * 100))
    print("Presisi   : {:.2f}%".format(overall_metrics['precision'] / n_models * 100))
    print("Recall    : {:.2f}%".format(overall_metrics['recall'] / n_models * 100))
    print("F1-Score  : {:.2f}%".format(overall_metrics['f1'] / n_models * 100))
    print("\n=== Evaluation Complete ===")

# Import tambahan yang diperlukan
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Cara penggunaan:
def main():
    # Load dan preprocess data
    X, y = load_and_process_data()
    
    # Split data dengan stratifikasi
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42,
        stratify=y['heart']  # Stratifikasi berdasarkan penyakit dengan imbalance terburuk
    )
    
    # Train model
    models, scaler = train_disease_models(X_train, y_train)
    
    # Evaluasi model
    print("\nHasil Evaluasi pada Data Testing:")
    evaluate_model_accuracy(models, X_test, y_test)

if __name__ == "__main__":
    main()

Loading and processing data...
Dataset loaded with shape: (160892, 22)
Processing features...
Handling missing values...
Found 9 numeric features and 0 categorical features
Imputing numeric features...
Adding engineered features...
Normalizing features...
Processing target variables...
Data processing completed!

=== Starting Model Training Process ===
1. Initializing StandardScaler...
   ✓ Data scaling completed

Total diseases to process: 10

[1/10] Processing ANEMIA model
   • Getting target values for anemia
   • Setting up model parameters
   • Training RandomForest model...
   ✓ ANEMIA model completed in 20.77 seconds

[2/10] Processing CHOLESTEROL model
   • Getting target values for cholesterol
   • Applying SMOTE + Tomek Links for balanced sampling
     - Running SMOTE oversampling...
     ✓ Data resampled from 128713 to 197584 samples
     - Running Tomek Links undersampling...
     ✓ Final sample size: 197036
   • Setting up model parameters
     ✓ Using disease-specific par

KeyboardInterrupt: 

In [7]:
# Step 5 load model
def load_models(save_dir='models'):
    if not os.path.exists(save_dir):
        raise FileNotFoundError(f"Directory {save_dir} not found. Please train the models first.")
    
    models = {}
    for model_file in os.listdir(save_dir):
        if model_file.endswith('_model.joblib'):
            disease = model_file.replace('_model.joblib', '')
            model_path = os.path.join(save_dir, model_file)
            models[disease] = joblib.load(model_path)
    
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    if not os.path.exists(scaler_path):
        raise FileNotFoundError("Scaler file not found. Please train the models first.")
    scaler = joblib.load(scaler_path)
    
    print(f"Models and scaler loaded from {save_dir}/")
    return models, scaler

In [8]:
# Step 6, predictin the data
def predict_health_risks(user_input, models, scaler):
    derived = calculate_derived_features(
        user_input['height'],
        user_input['weight'],
        user_input['gender'],
        user_input['age'],
        user_input['blood_pressure'],
        user_input['cholesterol'],
        user_input['blood_glucose']
    )
    
    features = pd.DataFrame([[
        user_input['height'],
        user_input['weight'],
        user_input['gender'],
        user_input['age'],
        user_input['blood_pressure'],
        user_input['cholesterol'],
        user_input['blood_glucose'],
        derived['bmi'],
        derived['sodium']
    ]])
    
    features = features.fillna(method='ffill').fillna(method='bfill')
    
    features_scaled = scaler.transform(features)
    
    predictions = {}
    for disease, model in models.items():
        try:
            proba = model.predict_proba(features_scaled)[0]
            if len(proba) > 1:
                prob = proba[1]
            else:
                prob = model.predict(features_scaled)[0]
        except:
            prob = model.predict(features_scaled)[0]
        
        predictions[disease] = float(prob) * 100
    
    return predictions, derived

In [9]:
# Step 7, here we can input the user data
def get_average_values(df):
    return {
        'height': df['height'].mean(),
        'weight': df['weight'].mean(),
        'gender': round(df['gender'].mean()),
        'age': df['age'].mean(),
        'blood_pressure': df['bp'].mean(),
        'cholesterol': df['bc'].mean(),
        'blood_glucose': df['bg'].mean()
    }

def main():
    df = pd.read_csv('dataset/health_data1/combined_dataset.csv')
    avg_values = get_average_values(df)
    
    X, y = load_and_process_data()
    models, scaler = train_disease_models(X, y)
    
    save_models(models, scaler)
    
    # default param value is "None"
    user_input = {
        'height': 160,
        'weight': 60,
        'gender': 1, # 1=male, 0=female
        'age': 20,
        'blood_pressure': None,
        'cholesterol': None,
        'blood_glucose': None
    }
    
    for key in user_input:
        if user_input[key] is None:
            if key == 'blood_pressure':
                user_input[key] = avg_values['blood_pressure']
            elif key == 'blood_glucose':
                user_input[key] = avg_values['blood_glucose']
            elif key == 'cholesterol':
                user_input[key] = avg_values['cholesterol']
            else:
                user_input[key] = avg_values[key]
            print(f"Using average value for {key}: {user_input[key]:.2f}")
    
    predictions, derived_features = predict_health_risks(user_input, models, scaler)
    
    print("\nDerived Features:")
    for feature, value in derived_features.items():
        print(f"{feature}: {value:.2f}")
    
    print("\nDisease Risk Predictions:")
    for disease, risk in predictions.items():
        print(f"{disease}: {risk:.2f}%")

if __name__ == "__main__":
    main()

Models saved in models\disease-prediction-model.joblib and scaler saved in models\scaler.joblib/
Using average value for blood_pressure: 130.83
Using average value for cholesterol: 98.87
Using average value for blood_glucose: 105.84

Derived Features:
bmi: 23.44
sodium: 1200.00
fat: 9.00
cholesterol_level: 208.53
protein: 54.00
carbs: 180.00

Disease Risk Predictions:
anemia: 0.00%
cholesterol: 9.93%
ckd: 10.35%
diabetes: 2.23%
heart: 34.67%
hypertension: 5.31%
ms: 7.88%
nafld: 1.40%
obesity: 0.00%
stroke: 0.42%


In [10]:
# Step 8, convert saved model from joblib to hdf5
# def convert_all_joblib_to_hdf5(models_dir='models'):
#     for model_file in os.listdir(models_dir):
#         if model_file.endswith('_model.joblib'):
#             joblib_model_path = os.path.join(models_dir, model_file)
#             hdf5_model_path = os.path.join(models_dir, model_file.replace('.joblib', '.hdf5'))
            
#             model = joblib.load(joblib_model_path)
            
#             model_params = {
#                 'n_estimators': getattr(model, 'n_estimators', None),
#                 'max_depth': getattr(model, 'max_depth', None),
#                 'min_samples_split': getattr(model, 'min_samples_split', None),
#                 'min_samples_leaf': getattr(model, 'min_samples_leaf', None),
#                 'feature_importances': getattr(model, 'feature_importances_', None)
#             }
            
#             with h5py.File(hdf5_model_path, 'w') as hdf:
#                 for key, value in model_params.items():
#                     if value is not None and (not isinstance(value, (list, np.ndarray)) or len(value) > 0):
#                         hdf.create_dataset(key, data=value)
#                     else:
#                         print(f"Warning: Parameter '{key}' is empty or None for model '{model_file}'. Skipping this parameter.")
            
#             print(f"Model '{model_file}' converted and saved to {hdf5_model_path}")

# def convert_all_joblib_to_json(models_dir='models'):
#     for model_file in os.listdir(models_dir):
#         if model_file.endswith('_model.joblib'):
#             joblib_model_path = os.path.join(models_dir, model_file)
#             json_model_path = os.path.join(models_dir, model_file.replace('.joblib', '.json'))
            
#             model = joblib.load(joblib_model_path)
            
#             model_params = {
#                 'n_estimators': getattr(model, 'n_estimators', None),
#                 'max_depth': getattr(model, 'max_depth', None),
#                 'min_samples_split': getattr(model, 'min_samples_split', None),
#                 'min_samples_leaf': getattr(model, 'min_samples_leaf', None),
#                 'feature_importances': getattr(model, 'feature_importances_', None)
#             }
            
#             for key, value in model_params.items():
#                 if isinstance(value, (np.ndarray)):
#                     model_params[key] = value.tolist()
            
#             with open(json_model_path, 'w') as json_file:
#                 json.dump(model_params, json_file, indent=4)
            
#             print(f"Model '{model_file}' converted and saved to {json_model_path}")

# convert_all_joblib_to_hdf5()
# convert_all_joblib_to_json()

def convert_model_to_hdf5_n_json(models_dir='models'):
    models_path = os.path.join(models_dir, 'disease-prediction-model.joblib')
    hdf5_model_path = os.path.join(models_dir, 'disease-prediction-model.hdf5')
    json_model_path = os.path.join(models_dir, 'disease-prediction-model.json')

    models = joblib.load(models_path)

    with h5py.File(hdf5_model_path, 'w') as hdf:
        model_params_dict = {}
        for disease, model in models.items():
            model_params = {
                'n_estimators': getattr(model, 'n_estimators', None),
                'max_depth': getattr(model, 'max_depth', None),
                'min_samples_split': getattr(model, 'min_samples_split', None),
                'min_samples_leaf': getattr(model, 'min_samples_leaf', None),
                'feature_importances': getattr(model, 'feature_importances_', None)
            }
            model_params_dict[disease] = {key: value.tolist() if isinstance(value, np.ndarray) else value for key, value in model_params.items()}

            for key, value in model_params.items():
                if value is not None and (not isinstance(value, (list, np.ndarray)) or len(value) > 0):
                    hdf.create_dataset(f"{disease}/{key}", data=value)
                else:
                    print(f"Warning: Parameter '{key}' is empty or None for model '{disease}'. Skipping this parameter.")

    with open(json_model_path, 'w') as json_file:
        json.dump(model_params_dict, json_file, indent=4)

    print(f"Model converted and saved to {hdf5_model_path} and parameters saved to {json_model_path}")

convert_model_to_hdf5_n_json()

Model converted and saved to models\disease-prediction-model.hdf5 and parameters saved to models\disease-prediction-model.json


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
import os
import json
import joblib
import time

In [3]:
def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

In [4]:
def load_and_process_data(file_path='dataset/health_data1/combined_dataset.csv'):
    print("Loading and processing data...")
    
    df = pd.read_csv(file_path)
    print(f"Dataset loaded with shape: {df.shape}")
    
    available_features = ['height', 'weight', 'gender', 'age', 'bp', 'bc', 'bg', 'bmi', 'sodium']
    target_variables = ['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart',
                       'hypertension', 'ms', 'nafld', 'obesity', 'stroke']
    
    df_processed = df.copy()
    
    print("Processing features...")
    numeric_features = [col for col in available_features if df[col].dtype in ['int64', 'float64']]
    categorical_features = [col for col in available_features if col not in numeric_features]
    
    print("Handling missing values...")
    print(f"Found {len(numeric_features)} numeric features and {len(categorical_features)} categorical features")
    
    numeric_imputer = IterativeImputer(random_state=42, max_iter=100, sample_posterior=True)
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    df_processed[numeric_features] = numeric_imputer.fit_transform(df_processed[numeric_features])
    if categorical_features:
        df_processed[categorical_features] = categorical_imputer.fit_transform(df_processed[categorical_features])
    
    scaler = StandardScaler()
    df_processed[numeric_features] = scaler.fit_transform(df_processed[numeric_features])
    
    df_processed = add_engineered_features(df_processed)
    
    final_features = (
        available_features + 
        ['bmi_category', 'age_category', 'bp_category', 
         'bmi_age', 'bp_age', 'bmi_bp']
    )
    
    print("Processing target variables...")
    df_processed[target_variables] = df[target_variables].fillna(0)
    
    print("Data processing completed!")
    return df_processed[final_features], df_processed[target_variables]

def add_engineered_features(X):
    X_new = X.copy()
    
    X_new['bmi_category'] = pd.cut(X_new['bmi'], 
                                  bins=[float('-inf'), 18.5, 25, 30, float('inf')],
                                  labels=[0, 1, 2, 3])
    
    X_new['age_category'] = pd.cut(X_new['age'], 
                                  bins=[float('-inf'), 30, 45, 60, float('inf')],
                                  labels=[0, 1, 2, 3])
    
    X_new['bp_category'] = pd.cut(X_new['bp'], 
                                 bins=[float('-inf'), 120, 140, 160, float('inf')],
                                 labels=[0, 1, 2, 3])
    
    X_new['bmi_age'] = X_new['bmi'] * X_new['age']
    X_new['bp_age'] = X_new['bp'] * X_new['age']
    X_new['bmi_bp'] = X_new['bmi'] * X_new['bp']
    
    return X_new

In [5]:
def create_model(input_shape, name=None):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(input_shape,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ], name=name)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', 
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                tf.keras.metrics.AUC()]
    )
    return model

def train_disease_models(X, y):
    print("\n=== Starting Model Training Process ===")
    models = {}
    scaler = StandardScaler()
    
    X = np.array(X)
    X_scaled = scaler.fit_transform(X)
    
    # GPU Configuration
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("GPU memory growth enabled")
        except RuntimeError as e:
            print(e)
    
    # Optimizer settings
    tf.config.optimizer.set_jit(True)
    
    BATCH_SIZE = 2560
    BUFFER_SIZE = 10000
    
    # Split data terlebih dahulu
    X_train, X_val = train_test_split(X_scaled, test_size=0.2, random_state=42)
    y_train, y_val = train_test_split(y.values, test_size=0.2, random_state=42)
    
    # Create datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))\
        .cache()\
        .shuffle(BUFFER_SIZE)\
        .batch(BATCH_SIZE)\
        .prefetch(tf.data.AUTOTUNE)
        
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))\
        .cache()\
        .batch(BATCH_SIZE)\
        .prefetch(tf.data.AUTOTUNE)
    
    # Create model
    combined_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(X_scaled.shape[1],)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(len(y.columns), activation='sigmoid')
    ])
    
    combined_model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', 
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall'),
                tf.keras.metrics.AUC(name='auc')]
    )
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=3,
            verbose=1
        ),
        tf.keras.callbacks.TensorBoard(
            log_dir='./logs',
            histogram_freq=1
        )
    ]
    
    # Training
    print("\nStarting combined model training...")
    history = combined_model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=50,
        callbacks=callbacks,
        verbose=1
    )
    
    return combined_model, scaler, history

def evaluate_model_accuracy(models, X, y):
    print("\n=== Starting Model Evaluation ===")
    print("Model Evaluation Results:\n")
    print("{:<15} {:<10} {:<10} {:<10} {:<10}".format(
        "Disease", "Accuracy", "Precision", "Recall", "AUC"
    ))
    print("-" * 55)
    
    if isinstance(X, pd.DataFrame):
        X = X.values
    
    total_models = len(models)
    overall_metrics = {'accuracy': 0, 'precision': 0, 'recall': 0, 'auc': 0}
    
    for idx, (disease, model) in enumerate(models.items(), 1):
        try:
            print(f"\n[{idx}/{total_models}] Evaluating {disease.upper()} model...")
            start_time = time.time()
            
            metrics = model.evaluate(X, y[disease].values, verbose=0)
            
            print("{:<15} {:<10.2f} {:<10.2f} {:<10.2f} {:<10.2f}".format(
                disease,
                metrics[1] * 100,#acc
                metrics[2] * 100,#prec
                metrics[3] * 100,#recall
                metrics[4] * 100 #auc
            ))
            
            overall_metrics['accuracy'] += metrics[1]
            overall_metrics['precision'] += metrics[2]
            overall_metrics['recall'] += metrics[3]
            overall_metrics['auc'] += metrics[4]
            
            elapsed_time = time.time() - start_time
            print(f"✓ Evaluation completed in {elapsed_time:.2f} seconds")
            
        except Exception as e:
            print(f"✗ Error evaluating {disease} model: {str(e)}")
    
    print("\n=== Final Results ===")
    for metric, value in overall_metrics.items():
        print(f"{metric.capitalize():10}: {value/total_models*100:.2f}%")
    print("\n=== Evaluation Complete ===")

In [6]:
def save_models(model, scaler, save_dir='models'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    print("\nSaving model...")
    
    # Save combined model in HDF5 format
    model_path = os.path.join(save_dir, 'disease-prediction-tf-model.h5')
    model.save(model_path)
    print("✓ Saved combined model in HDF5 format")
    
    # Save scaler
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    joblib.dump(scaler, scaler_path)
    print("✓ Saved scaler")
    
    print("\nAll models and scaler saved successfully!")

In [7]:
def load_models(save_dir='models'):
    if not os.path.exists(save_dir):
        raise FileNotFoundError(f"Directory {save_dir} not found")
    
    print("\nLoading model...")
    model_path = os.path.join(save_dir, 'disease-prediction-tf-model.h5')
    model = tf.keras.models.load_model(model_path)
    print("✓ Loaded combined model from HDF5")
    
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    scaler = joblib.load(scaler_path)
    print("✓ Loaded scaler")
    
    return model, scaler

In [8]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
import os
import json
import joblib
import time

def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def load_and_process_data(file_path='dataset/health_data1/combined_dataset.csv'):
    print("Loading and processing data...")
    
    df = pd.read_csv(file_path)
    print(f"Dataset loaded with shape: {df.shape}")
    
    available_features = ['height', 'weight', 'gender', 'age', 'bp', 'bc', 'bg', 'bmi', 'sodium']
    target_variables = ['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart',
                       'hypertension', 'ms', 'nafld', 'obesity', 'stroke']
    
    df_processed = df.copy()
    
    print("Processing features...")
    numeric_features = [col for col in available_features if df[col].dtype in ['int64', 'float64']]
    categorical_features = [col for col in available_features if col not in numeric_features]
    
    print("Handling missing values...")
    print(f"Found {len(numeric_features)} numeric features and {len(categorical_features)} categorical features")
    
    numeric_imputer = IterativeImputer(random_state=42, max_iter=100, sample_posterior=True)
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    df_processed[numeric_features] = numeric_imputer.fit_transform(df_processed[numeric_features])
    if categorical_features:
        df_processed[categorical_features] = categorical_imputer.fit_transform(df_processed[categorical_features])
    
    scaler = StandardScaler()
    df_processed[numeric_features] = scaler.fit_transform(df_processed[numeric_features])
    
    df_processed = add_engineered_features(df_processed)
    
    final_features = (
        available_features + 
        ['bmi_category', 'age_category', 'bp_category', 
         'bmi_age', 'bp_age', 'bmi_bp']
    )
    
    print("Processing target variables...")
    df_processed[target_variables] = df[target_variables].fillna(0)
    
    print("Data processing completed!")
    return df_processed[final_features], df_processed[target_variables]

def add_engineered_features(X):
    X_new = X.copy()
    
    X_new['bmi_category'] = pd.cut(X_new['bmi'], 
                                  bins=[float('-inf'), 18.5, 25, 30, float('inf')],
                                  labels=[0, 1, 2, 3])
    
    X_new['age_category'] = pd.cut(X_new['age'], 
                                  bins=[float('-inf'), 30, 45, 60, float('inf')],
                                  labels=[0, 1, 2, 3])
    
    X_new['bp_category'] = pd.cut(X_new['bp'], 
                                 bins=[float('-inf'), 120, 140, 160, float('inf')],
                                 labels=[0, 1, 2, 3])
    
    X_new['bmi_age'] = X_new['bmi'] * X_new['age']
    X_new['bp_age'] = X_new['bp'] * X_new['age']
    X_new['bmi_bp'] = X_new['bmi'] * X_new['bp']
    
    return X_new

def create_model(input_shape, name=None):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(input_shape,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ], name=name)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', 
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                tf.keras.metrics.AUC()]
    )
    return model

def train_disease_models(X, y):
    print("\n=== Starting Model Training Process ===")
    models = {}
    scaler = StandardScaler()
    
    X = np.array(X)
    X_scaled = scaler.fit_transform(X)
    
    # GPU Configuration
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("GPU memory growth enabled")
        except RuntimeError as e:
            print(e)
    
    # Optimizer settings
    tf.config.optimizer.set_jit(True)
    
    BATCH_SIZE = 2560
    BUFFER_SIZE = 10000
    
    # Split data terlebih dahulu
    X_train, X_val = train_test_split(X_scaled, test_size=0.2, random_state=42)
    y_train, y_val = train_test_split(y.values, test_size=0.2, random_state=42)
    
    # Create datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))\
        .cache()\
        .shuffle(BUFFER_SIZE)\
        .batch(BATCH_SIZE)\
        .prefetch(tf.data.AUTOTUNE)
        
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))\
        .cache()\
        .batch(BATCH_SIZE)\
        .prefetch(tf.data.AUTOTUNE)
    
    # Create model
    combined_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(X_scaled.shape[1],)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(len(y.columns), activation='sigmoid')
    ])
    
    combined_model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', 
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall'),
                tf.keras.metrics.AUC(name='auc')]
    )
    
    # Callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=3,
            verbose=1
        ),
        tf.keras.callbacks.TensorBoard(
            log_dir='./logs',
            histogram_freq=1
        )
    ]
    
    # Training
    print("\nStarting combined model training...")
    history = combined_model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=50,
        callbacks=callbacks,
        verbose=1
    )
    
    return combined_model, scaler, history

def evaluate_model_accuracy(models, X, y):
    print("\n=== Starting Model Evaluation ===")
    print("Model Evaluation Results:\n")
    print("{:<15} {:<10} {:<10} {:<10} {:<10}".format(
        "Disease", "Accuracy", "Precision", "Recall", "AUC"
    ))
    print("-" * 55)
    
    if isinstance(X, pd.DataFrame):
        X = X.values
    
    total_models = len(models)
    overall_metrics = {'accuracy': 0, 'precision': 0, 'recall': 0, 'auc': 0}
    
    for idx, (disease, model) in enumerate(models.items(), 1):
        try:
            print(f"\n[{idx}/{total_models}] Evaluating {disease.upper()} model...")
            start_time = time.time()
            
            metrics = model.evaluate(X, y[disease].values, verbose=0)
            
            print("{:<15} {:<10.2f} {:<10.2f} {:<10.2f} {:<10.2f}".format(
                disease,
                metrics[1] * 100,#acc
                metrics[2] * 100,#prec
                metrics[3] * 100,#recall
                metrics[4] * 100 #auc
            ))
            
            overall_metrics['accuracy'] += metrics[1]
            overall_metrics['precision'] += metrics[2]
            overall_metrics['recall'] += metrics[3]
            overall_metrics['auc'] += metrics[4]
            
            elapsed_time = time.time() - start_time
            print(f"✓ Evaluation completed in {elapsed_time:.2f} seconds")
            
        except Exception as e:
            print(f"✗ Error evaluating {disease} model: {str(e)}")
    
    print("\n=== Final Results ===")
    for metric, value in overall_metrics.items():
        print(f"{metric.capitalize():10}: {value/total_models*100:.2f}%")
    print("\n=== Evaluation Complete ===")

def save_models(model, scaler, save_dir='models'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    print("\nSaving model...")
    
    # Save combined model in HDF5 format
    model_path = os.path.join(save_dir, 'disease-prediction-tf-model.h5')
    model.save(model_path)
    print("✓ Saved combined model in HDF5 format")
    
    # Save scaler
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    joblib.dump(scaler, scaler_path)
    print("✓ Saved scaler")
    
    print("\nAll models and scaler saved successfully!")

def load_models(save_dir='models'):
    if not os.path.exists(save_dir):
        raise FileNotFoundError(f"Directory {save_dir} not found")
    
    print("\nLoading model...")
    model_path = os.path.join(save_dir, 'disease-prediction-tf-model.h5')
    model = tf.keras.models.load_model(model_path)
    print("✓ Loaded combined model from HDF5")
    
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    scaler = joblib.load(scaler_path)
    print("✓ Loaded scaler")
    
    return model, scaler

def predict_diseases(input_data, models, scaler):
    print("\n--- Predicting Diseases ---")
    
    # Ensure input_data is a DataFrame
    if not isinstance(input_data, pd.DataFrame):
        input_data = pd.DataFrame([input_data])
    
    # Scale input data
    X_scaled = scaler.transform(input_data)
    
    predictions = {}
    for disease, model in models.items():
        pred_prob = model.predict(X_scaled, verbose=0)[0][0]
        predictions[disease] = {
            'probability': float(pred_prob),
            'prediction': 1 if pred_prob >= 0.5 else 0
        }
    
    return predictions

def predict_disease_risks(user_input, model, scaler):
    # Ensure input_data is a DataFrame
    if not isinstance(user_input, pd.DataFrame):
        user_input = pd.DataFrame([user_input])
    
    # Scale input data
    X_scaled = scaler.transform(user_input)
    
    # Predict probabilities for each disease
    predictions = model.predict(X_scaled, verbose=0)[0]
    
    # Convert predictions to percentage
    predictions_percent = {f'disease_{i+1}': prob * 100 for i, prob in enumerate(predictions)}
    
    return predictions_percent

def get_average_values(df):
    return {
        'height': df['height'].mean(),
        'weight': df['weight'].mean(),
        'gender': round(df['gender'].mean()),
        'age': df['age'].mean(),
        'blood_pressure': df['bp'].mean(),
        'cholesterol': df['bc'].mean(),
        'blood_glucose': df['bg'].mean()
    }

def main():
    # Load the dataset and calculate average values
    df = pd.read_csv('dataset/health_data1/combined_dataset.csv')
    avg_values = get_average_values(df)
    
    # Load and preprocess data
    X, y = load_and_process_data()
    models, scaler = train_disease_models(X, y)
    
    save_models(models, scaler)
    
    # Default user input with None values
    user_input = {
        'height': 160,
        'weight': 60,
        'gender': 1,  # 1=male, 0=female
        'age': 20,
        'blood_pressure': None,
        'cholesterol': None,
        'blood_glucose': None
    }
    
    # Fill missing user input values with averages
    for key in user_input:
        if user_input[key] is None:
            user_input[key] = avg_values[key]
            print(f"Using average value for {key}: {user_input[key]:.2f}")
    
    # Add derived features to user input
    derived_features = calculate_derived_features(
        user_input['height'],
        user_input['weight'],
        user_input['gender'],
        user_input['age'],
        user_input['blood_pressure'],
        user_input['cholesterol'],
        user_input['blood_glucose']
    )
    
    # Combine user input and derived features
    user_input.update(derived_features)
    
    # Ensure the input data is a DataFrame with the correct columns
    input_df = pd.DataFrame([user_input])
    
    # Predict health risks
    predictions = predict_disease_risks(input_df, models, scaler)
    
    # Display derived features and predictions
    print("\nDerived Features:")
    for feature, value in derived_features.items():
        print(f"{feature}: {value:.2f}")
    
    print("\nDisease Risk Predictions:")
    for disease, risk in predictions.items():
        print(f"{disease}: {risk:.2f}%")

if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/health_data1/combined_dataset.csv'