In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
import os
import json
import joblib
import time

# Step 0, chek dataset availability

def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def check_data_directory():
    data_dir = os.path.join('dataset', 'health_data1')
    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        print("Available directories:", os.listdir('.'))
        return False
    
    print("\nAvailable files in directory:")
    for file in os.listdir(data_dir):
        print(f"- {file}")
    
    return True

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.XPT'):
            return pd.read_sas(file_path)
        else:
            print(f"Unsupported file format: {file_path}")
            return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

set_project_directory()

if check_data_directory():
    data_directory = os.path.join('dataset', 'health_data1')
    
    for filename in os.listdir(data_directory):
        file_path = os.path.join(data_directory, filename)
        print(f"\nProcessing: {filename}")
        data = load_data(file_path)
        if data is not None:
            print(f"Successfully loaded {filename}")
            print("First few rows:")
            print(data.head())
else:
    print("Please check your directory structure and file locations")

folder_path = 'dataset/health_data1/'

try:
    print("File inside the dataset folder:", os.listdir(folder_path))
    required_files = [
        'anemia-dataset.csv',
        'cholesterol-dataset.csv',
        'chronic-kidney-disease-dataset.csv',
        'diabetes-dataset.csv',
        'heart-disease-dataset.csv',
        'hypertension-dataset.csv',
        'metabolic-syndrome-dataset.csv',
        'nafld1-dataset.csv',
        'obesity-dataset.csv',
        'stroke-dataset.csv'
    ]
    
    missing_files = [f for f in required_files if not os.path.isfile(os.path.join(folder_path, f))]
    
    if missing_files:
        print(f"Missing files: {', '.join(missing_files)}")
    else:
        print("All files are available.")

except FileNotFoundError:
    print(f"Folder missing: {folder_path}")


Working directory set to: c:\Users\Dana\Documents\Kuliah\Bangkit\Capstone-C242-PS384_Project01

Available files in directory:
- anemia-dataset.csv
- cholesterol-dataset.csv
- chronic-kidney-disease-dataset.csv
- combined_dataset.csv
- diabetes-dataset.csv
- health_data1_combined.csv
- heart-disease-dataset.csv
- hypertension-dataset.csv
- metabolic-syndrome-dataset.csv
- nafld1-dataset.csv
- nafld2-dataset.csv
- nwtco-dataset.csv
- obesity-dataset.csv
- stroke-dataset.csv

Processing: anemia-dataset.csv
Successfully loaded anemia-dataset.csv
First few rows:
   Gender  Hemoglobin   MCH  MCHC   MCV  Result
0       1        14.9  22.7  29.1  83.7       0
1       0        15.9  25.4  28.3  72.0       0
2       0         9.0  21.5  29.6  71.2       1
3       0        14.9  16.0  31.4  87.5       0
4       1        14.7  22.0  28.2  99.5       0

Processing: cholesterol-dataset.csv
Successfully loaded cholesterol-dataset.csv
First few rows:
   age  sex  cp  trestbps  fbs  restecg  thalach  

  return pd.read_csv(file_path)


Successfully loaded health_data1_combined.csv
First few rows:
  gender  hemoglobin  age  blood_pressure  cholesterol  glucose  bmi  height  \
0    1.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
1    0.0        15.9  NaN             NaN          NaN      NaN  NaN     NaN   
2    0.0         9.0  NaN             NaN          NaN      NaN  NaN     NaN   
3    0.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
4    1.0        14.7  NaN             NaN          NaN      NaN  NaN     NaN   

   weight  HDL  Height  Weight  
0     NaN  NaN     NaN     NaN  
1     NaN  NaN     NaN     NaN  
2     NaN  NaN     NaN     NaN  
3     NaN  NaN     NaN     NaN  
4     NaN  NaN     NaN     NaN  

Processing: heart-disease-dataset.csv
Successfully loaded heart-disease-dataset.csv
First few rows:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2

In [2]:
# preprocess available data and merge it in the end
columns = [
    "height", "weight", "gender", "age", "bp", "bc", "bg", "bmi", "sodium", 
    "fat", "protein", "carbs", "anemia", "cholesterol", "ckd", "diabetes", 
    "heart", "hypertension", "ms", "nafld", "obesity", "stroke"
]

def create_data_dict(**kwargs):
    base_dict = {
        "height": np.nan, "weight": np.nan, "gender": np.nan, "age": np.nan,
        "bp": np.nan, "bc": np.nan, "bg": np.nan, "bmi": np.nan,
        "sodium": np.nan, "fat": np.nan, "protein": np.nan, "carbs": np.nan,
        "anemia": 0, "cholesterol": 0, "ckd": 0, "diabetes": 0,
        "heart": 0, "hypertension": 0, "ms": 0, "nafld": 0, "obesity": 0, "stroke": 0
    }
    base_dict.update({k: v for k, v in kwargs.items() if v is not None})
    return base_dict

all_data = []

# 1. Anemia dataset
anemia_data = pd.read_csv(os.path.join(folder_path, "anemia-dataset.csv"))
all_data.extend([
    create_data_dict(
        gender=1 if row["Gender"] == "Male" else 0,
        bg=round(row["Hemoglobin"] * 7, 1),
        anemia=row["Result"]
    )
    for _, row in anemia_data.iterrows()
])

# 2. Cholesterol dataset
chol_data = pd.read_csv(os.path.join(folder_path, "cholesterol-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=row["sex"],
        bp=row["trestbps"],
        bc=row["chol"],
        bg=120 if row["fbs"] == 1 else 100,
        cholesterol=1 if row["chol"] > 240 else 0
    )
    for _, row in chol_data.iterrows()
])

# 3. Chronic Kidney Disease dataset
ckd_data = pd.read_csv(os.path.join(folder_path, "chronic-kidney-disease-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        bp=row["bp"],
        bg=row["bgr"],
        sodium=row["sod"],
        anemia=1 if row["ane"] == "yes" else 0,
        ckd=1 if row["classification"] == "ckd" else 0,
        diabetes=1 if row["dm"] == "yes" else 0,
        heart=1 if row["cad"] == "yes" else 0,
        hypertension=1 if row["htn"] == "yes" else 0
    )
    for _, row in ckd_data.iterrows()
])

# 4. Diabetes dataset
diabetes_data = pd.read_csv(os.path.join(folder_path, "diabetes-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Sex"] == "Male" else 0,
        bmi=row["BMI"],
        cholesterol=row["HighChol"],
        diabetes=1 if row["Diabetes"] == 1 else 0,
        hypertension=row["HighBP"]
    )
    for _, row in diabetes_data.iterrows()
])

# 5. Heart Disease dataset
heart_data = pd.read_csv(os.path.join(folder_path, "heart-disease-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == 1 else 0,
        bp=row["trestbps"],
        bc=row["chol"],
        bg=120 if row["fbs"] == 1 else 100,
        heart=1 if row["target"] == 1 else 0
    )
    for _, row in heart_data.iterrows()
])

# 6. Hypertension dataset
hypertension_data = pd.read_csv(os.path.join(folder_path, "hypertension-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == 1 else 0,
        bp=row["trestbps"],
        bc=row["chol"],
        bg=row["fbs"],
        hypertension=1 if row["target"] == 1 else 0
    )
    for _, row in hypertension_data.iterrows()
])

# 7. Metabolic Syndrome dataset
ms_data = pd.read_csv(os.path.join(folder_path, "metabolic-syndrome-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Sex"] == "Male" else 0,
        bg=row["BloodGlucose"],
        bmi=row["BMI"],
        ms=1 if row["MetabolicSyndrome"] == 1 else 0
    )
    for _, row in ms_data.iterrows()
])

# 8. NAFLD dataset
nafld_data = pd.read_csv(os.path.join(folder_path, "nafld1-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=row["male"],
        weight=row["weight"],
        height=row["height"],
        bmi=round(row["bmi"],1),
        nafld=row["status"]
    )
    for _, row in nafld_data.iterrows()
])

# 9. Obesity dataset
obesity_data = pd.read_csv(os.path.join(folder_path, "obesity-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Gender"] == "Male" else 0,
        weight=row["Weight"],
        height=row["Height"],
        bmi=row["BMI"],
        obesity=1 if row["Label"] == "Obesity" else 0
    )
    for _, row in obesity_data.iterrows()
])

# 10. Stroke dataset
stroke_data = pd.read_csv(os.path.join(folder_path, "stroke-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == "Male" else 0,
        bc=row["heart_disease"],
        bmi=row["bmi"],
        heart=row["heart_disease"],
        hypertension=row["hypertension"],
        stroke=row["stroke"]
    )
    for _, row in stroke_data.iterrows()
])

combined_data = pd.DataFrame(all_data)

output_path = os.path.join(folder_path, "combined_dataset.csv")
combined_data.to_csv(output_path, index=False)

print(f"Combined dataset succesfully savd on {output_path}")
print("\ncombined dataset inform:")
print(combined_data.info())
print("\ncombined dataset stats:")
print(combined_data.describe())

Combined dataset succesfully savd on dataset/health_data1/combined_dataset.csv

combined dataset inform:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160892 entries, 0 to 160891
Data columns (total 22 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   height        14489 non-null   float64
 1   weight        12871 non-null   float64
 2   gender        160492 non-null  float64
 3   age           159462 non-null  float64
 4   bp            27799 non-null   float64
 5   bc            68321 non-null   float64
 6   bg            31589 non-null   float64
 7   bmi           126673 non-null  float64
 8   sodium        313 non-null     float64
 9   fat           0 non-null       float64
 10  protein       0 non-null       float64
 11  carbs         0 non-null       float64
 12  anemia        160892 non-null  float64
 13  cholesterol   160892 non-null  float64
 14  ckd           160892 non-null  int64  
 15  diabetes      160892 non-null  

In [2]:
def load_and_process_data(file_path='dataset/health_data1/combined_dataset.csv'):
    print("Loading and processing data...")
    
    df = pd.read_csv(file_path)
    print(f"Dataset loaded with shape: {df.shape}")
    
    available_features = ['height', 'weight', 'gender', 'age', 'bp', 'bc', 'bg', 'bmi', 'sodium']
    target_variables = ['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart',
                        'hypertension', 'ms', 'nafld', 'obesity', 'stroke']
    
    df_processed = df.copy()
    
    print("Processing features...")
    numeric_features = [col for col in available_features if pd.api.types.is_numeric_dtype(df[col])]
    categorical_features = [col for col in available_features if col not in numeric_features]
    
    print("Handling missing values...")
    print(f"Found {len(numeric_features)} numeric features and {len(categorical_features)} categorical features")
    
    numeric_imputer = IterativeImputer(random_state=42, max_iter=100, sample_posterior=True)
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    df_processed[numeric_features] = numeric_imputer.fit_transform(df_processed[numeric_features])
    if categorical_features:
        df_processed[categorical_features] = categorical_imputer.fit_transform(df_processed[categorical_features])
    
    scaler = StandardScaler()
    df_processed[numeric_features] = scaler.fit_transform(df_processed[numeric_features])
    
    df_processed = add_engineered_features(df_processed)
    
    final_features = available_features + [
        'bmi_category', 'age_category', 'bp_category',
        'bmi_age', 'bp_age', 'bmi_bp'
    ]
    
    print("Processing target variables...")
    df_processed[target_variables] = df[target_variables].fillna(0)
    
    print("Data processing completed!")
    return df_processed[final_features], df_processed[target_variables]

def add_engineered_features(X):
    X_new = X.copy()
    X_new['bmi_category'] = pd.cut(X_new['bmi'], 
                                  bins=[float('-inf'), 18.5, 25, 30, float('inf')],
                                  labels=[0, 1, 2, 3])
    
    X_new['age_category'] = pd.cut(X_new['age'], 
                                  bins=[float('-inf'), 30, 45, 60, float('inf')],
                                  labels=[0, 1, 2, 3])
    
    X_new['bp_category'] = pd.cut(X_new['bp'], 
                                 bins=[float('-inf'), 120, 140, 160, float('inf')],
                                 labels=[0, 1, 2, 3])
    
    X_new['bmi_age'] = X_new['bmi'] * X_new['age']
    X_new['bp_age'] = X_new['bp'] * X_new['age']
    X_new['bmi_bp'] = X_new['bmi'] * X_new['bp']
    return X_new


def train_disease_models(X, y):
    print("\n=== Starting Model Training Process ===")
    models = {}
    scaler = StandardScaler()
    
    X = np.array(X)
    X_scaled = scaler.fit_transform(X)
    
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("GPU memory growth enabled")
        except RuntimeError as e:
            print(e)
    
    tf.config.optimizer.set_jit(True)
    
    BATCH_SIZE = 2560
    BUFFER_SIZE = 10000
    
    X_train, X_val = train_test_split(X_scaled, test_size=0.2, random_state=42)
    y_train, y_val = train_test_split(y.values, test_size=0.2, random_state=42)
    
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))\
        .cache()\
        .shuffle(BUFFER_SIZE)\
        .batch(BATCH_SIZE)\
        .prefetch(tf.data.AUTOTUNE)
        
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))\
        .cache()\
        .batch(BATCH_SIZE)\
        .prefetch(tf.data.AUTOTUNE)
    
    available_features = ['height', 'weight', 'gender', 'age', 'bp', 'bc', 'bg', 'bmi', 'sodium']
    
    final_features = available_features + [
        'bmi_category', 'age_category', 'bp_category',
        'bmi_age', 'bp_age', 'bmi_bp'
    ]
    
    combined_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(len(final_features),)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(len(y.columns), activation='sigmoid')
    ])
    
    combined_model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', 
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall'),
                tf.keras.metrics.AUC(name='auc')]
    )
    
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=3,
            verbose=1
        )
    ]
    
    print("\nStarting combined model training...")
    history = combined_model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=100,
        callbacks=callbacks,
        verbose=1
    )
    
    return combined_model, scaler, history

def evaluate_model_performance(model, X_test, y_test, scaler):
    """
    Evaluates model performance using various metrics
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
    print("\n=== Model Performance Evaluation ===")
    
    X_test_scaled = scaler.transform(X_test)
    
    y_pred = model.predict(X_test_scaled)
    y_pred_binary = (y_pred > 0.5).astype(int)
    
    disease_names = ['Anemia', 'Cholesterol', 'CKD', 'Diabetes', 'Heart Disease',
                    'Hypertension', 'Metabolic Syndrome', 'NAFLD', 'Obesity', 'Stroke']
    
    print("\nDetailed Performance Metrics:")
    print("-" * 80)
    print(f"{'Disease':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10}")
    print("-" * 80)
    
    overall_metrics = {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}
    
    for i, disease in enumerate(disease_names):
        try:
            accuracy = accuracy_score(y_test.iloc[:, i], y_pred_binary[:, i])
            precision = precision_score(y_test.iloc[:, i], y_pred_binary[:, i], zero_division=0)
            recall = recall_score(y_test.iloc[:, i], y_pred_binary[:, i], zero_division=0)
            f1 = f1_score(y_test.iloc[:, i], y_pred_binary[:, i], zero_division=0)
            
            overall_metrics['accuracy'] += accuracy
            overall_metrics['precision'] += precision
            overall_metrics['recall'] += recall
            overall_metrics['f1'] += f1
            
            print(f"{disease:<20} {accuracy*100:>9.2f}% {precision*100:>9.2f}% {recall*100:>9.2f}% {f1*100:>9.2f}%")
            
        except Exception as e:
            print(f"Warning: Error evaluating {disease}: {str(e)}")
            continue
    
    n_diseases = len(disease_names)
    print("\nAverage Metrics:")
    print("-" * 40)
    for metric, value in overall_metrics.items():
        print(f"{metric.capitalize():<15}: {value/n_diseases*100:.2f}%")
    
    return overall_metrics

def save_models(model, scaler, save_dir='models'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    print("\nSaving model...")
    
    model_path = os.path.join(save_dir, 'disease-prediction-tf-model.h5')
    model.save(model_path)
    print("✓ Saved combined model in HDF5 format")
    
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    joblib.dump(scaler, scaler_path)
    print("✓ Saved scaler")
    
    print("\nAll models and scaler saved successfully!")

def load_models(save_dir='models'):
    if not os.path.exists(save_dir):
        raise FileNotFoundError(f"Directory {save_dir} not found")
    
    print("\nLoading model...")
    model_path = os.path.join(save_dir, 'disease-prediction-tf-model.h5')
    model = tf.keras.models.load_model(model_path)
    print("✓ Loaded combined model from HDF5")
    
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    scaler = joblib.load(scaler_path)
    print("✓ Loaded scaler")
    
    return model, scaler

def predict_diseases(input_data, models, scaler):
    print("\n--- Predicting Diseases ---")
    
    if not isinstance(input_data, pd.DataFrame):
        input_data = pd.DataFrame([input_data])
    
    X_scaled = scaler.transform(input_data)
    
    predictions = {}
    for disease, model in models.items():
        pred_prob = model.predict(X_scaled, verbose=0)[0][0]
        predictions[disease] = {
            'probability': float(pred_prob),
            'prediction': 1 if pred_prob >= 0.5 else 0
        }
    
    return predictions

def predict_disease_risks(user_input, combined_model, prediction_scaler):
    expected_features = [
        'height', 'weight', 'gender', 'age', 'bp', 'bc', 'bg', 'bmi', 'sodium',
        'bmi_category', 'age_category', 'bp_category',
        'bmi_age', 'bp_age', 'bmi_bp'
    ]
    
    if isinstance(user_input, pd.DataFrame):
        for feature in expected_features:
            if feature not in user_input.columns:
                user_input[feature] = 0
        
        user_input = user_input[expected_features]
    
    X_scaled = prediction_scaler.transform(user_input)
    
    predictions = combined_model.predict(X_scaled, verbose=0)[0]
    
    disease_names = ['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart',
                    'hypertension', 'ms', 'nafld', 'obesity', 'stroke']
    
    predictions_percent = {disease: prob * 100 
                         for disease, prob in zip(disease_names, predictions)}
    
    return predictions_percent

def calculate_derived_features(height, weight, gender, age, blood_pressure, cholesterol, blood_glucose):
    # bmi calculation
    height_m = height / 100
    bmi = weight / (height_m ** 2)
    
    # sodium calculation
    sodium = weight * 20
    
    # fat based on gender calculatonn
    fat = weight * (0.15 if gender == 1 else 0.25)
    
    # chols level calc
    cholesterol_level = (bmi * 2) + (age * 0.15) + (blood_pressure * 0.05) + (blood_glucose * 0.02) + 150
    
    # protein calc
    protein = weight * 0.9
    
    # carbo calc
    carbs = weight * 3
    
    return {
        'bmi': bmi,
        'sodium': sodium,
        'fat': fat,
        'cholesterol_level': cholesterol_level,
        'protein': protein,
        'carbs': carbs
    }

def get_average_values(df):
    return {
        'height': df['height'].mean(),
        'weight': df['weight'].mean(),
        'gender': round(df['gender'].mean()),
        'age': df['age'].mean(),
        'blood_pressure': df['bp'].mean(),
        'cholesterol': df['bc'].mean(),
        'blood_glucose': df['bg'].mean()
    }

def main():
    
    df = pd.read_csv('dataset/health_data1/combined_dataset.csv')
    avg_values = get_average_values(df)
    
    X, y = load_and_process_data()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42
    )

    combined_model, scaler, history = train_disease_models(X_train, y_train)

    evaluate_model_performance(combined_model, X_test, y_test, scaler)
    
    save_models(combined_model, scaler)
    
    # Default user input is "None", dont miss type it
    user_input = {
        'height': 160,
        'weight': 60,
        'gender': 1,  # 1=male, 0=female
        'age': 20,
        'blood_pressure': None,
        'cholesterol': None,
        'blood_glucose': None
    }
    
    for key in user_input:
        if user_input[key] is None:
            user_input[key] = avg_values[key]
            print(f"Using average value for {key}: {user_input[key]:.2f}")
    
    derived_features = calculate_derived_features(
        user_input['height'],
        user_input['weight'],
        user_input['gender'],
        user_input['age'],
        user_input['blood_pressure'],
        user_input['cholesterol'],
        user_input['blood_glucose']
    )
    
    user_input.update(derived_features)
    
    input_df = pd.DataFrame([user_input])
    
    expected_features = X.columns
    for feature in expected_features:
        if feature not in input_df:
            input_df[feature] = avg_values.get(feature, 0)
    
    predictions = predict_disease_risks(input_df, combined_model, scaler)
    
    print("\nDerived Features:")
    for feature, value in derived_features.items():
        print(f"{feature}: {value:.2f}")
    
    print("\nDisease Risk Predictions:")
    for disease, risk in predictions.items():
        print(f"{disease}: {risk:.2f}%")

if __name__ == "__main__":
    main()

Loading and processing data...
Dataset loaded with shape: (160892, 22)
Processing features...
Handling missing values...
Found 9 numeric features and 0 categorical features
Processing target variables...
Data processing completed!

=== Starting Model Training Process ===



Starting combined model training...
Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 50: ReduceLROnPlateau reducing learning 




Detailed Performance Metrics:
--------------------------------------------------------------------------------
Disease              Accuracy   Precision  Recall     F1-Score  
--------------------------------------------------------------------------------
Anemia                   99.56%      0.00%      0.00%      0.00%
Cholesterol              83.77%     61.20%     80.58%     69.57%
CKD                      99.88%     90.91%     34.48%     50.00%
Diabetes                 85.82%     66.85%     71.39%     69.05%
Heart Disease            99.38%     94.77%     87.72%     91.11%
Hypertension             74.51%     66.26%     69.96%     68.06%
Metabolic Syndrome       99.47%     55.36%     17.61%     26.72%
NAFLD                    99.16%     46.15%      4.48%      8.16%
Obesity                 100.00%      0.00%      0.00%      0.00%
Stroke                   89.68%     68.23%     36.15%     47.26%

Average Metrics:
----------------------------------------
Accuracy       : 93.12%
Precision

  saving_api.save_model(


In [3]:
def calculate_derived_features(height, weight, gender, age, blood_pressure, cholesterol, blood_glucose):
    bmi = weight / ((height/100) ** 2)
    
    bmi_category = 0 if bmi < 18.5 else 1 if bmi < 25 else 2 if bmi < 30 else 3
    age_category = 0 if age < 30 else 1 if age < 45 else 2 if age < 60 else 3
    bp_category = 0 if blood_pressure < 120 else 1 if blood_pressure < 140 else 2 if blood_pressure < 160 else 3
    
    bmi_age = bmi * age
    bp_age = blood_pressure * age
    bmi_bp = bmi * blood_pressure
    
    return {
        'bmi': bmi,
        'bmi_category': bmi_category,
        'age_category': age_category,
        'bp_category': bp_category,
        'bmi_age': bmi_age,
        'bp_age': bp_age,
        'bmi_bp': bmi_bp
    }

def predict_health_status(user_input):
    # Load average values from dataset
    df = pd.read_csv('dataset/health_data1/combined_dataset.csv')
    
    # Map column names untuk average values
    column_mapping = {
        'height': 'height',
        'weight': 'weight',
        'gender': 'gender',
        'age': 'age',
        'blood_pressure': 'bp',
        'cholesterol': 'bc',
        'blood_glucose': 'bg'
    }
    
    # Calculate average values with proper column mapping
    avg_values = {}
    for key, col in column_mapping.items():
        if col in df.columns:
            avg_values[key] = df[col].mean()
        else:
            print(f"Warning: Column {col} not found in dataset")
            avg_values[key] = 0
    
    # Round gender average
    if 'gender' in avg_values:
        avg_values['gender'] = round(avg_values['gender'])
    
    # Replace None values with averages
    for key in ['blood_pressure', 'cholesterol', 'blood_glucose']:
        if user_input[key] is None:
            user_input[key] = avg_values[key]
            print(f"Using average value for {key}: {user_input[key]:.2f}")
    
    model, scaler = load_models()
    if model is None or scaler is None:
        return None
    
    try:
        features = {
            'height': user_input['height'],
            'weight': user_input['weight'],
            'gender': user_input['gender'],
            'age': user_input['age'],
            'bp': user_input['blood_pressure'],
            'bc': user_input['cholesterol'],
            'bg': user_input['blood_glucose'],
            'bmi': user_input['weight'] / ((user_input['height']/100) ** 2),
            'sodium': 1200  # default value
        }
        
        derived = calculate_derived_features(
            user_input['height'],
            user_input['weight'],
            user_input['gender'],
            user_input['age'],
            user_input['blood_pressure'],
            user_input['cholesterol'],
            user_input['blood_glucose']
        )
        features.update(derived)
        
        input_df = pd.DataFrame([features])
        input_scaled = scaler.transform(input_df)
        
        predictions = model.predict(input_scaled)
        
        diseases = ['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart', 
                   'hypertension', 'ms', 'nafld', 'obesity', 'stroke']
        results = {disease: float(pred*100) for disease, pred in zip(diseases, predictions[0])}
        
        return results, derived['bmi']
        
    except Exception as e:
        print(f"Error during prediction: {str(e)}")
        return None

if __name__ == "__main__":
    # input data user to check
    user_input = {
        'height': 165,
        'weight': 55,
        'gender': 1,  # 1 for male, 0 for female
        'age': 20,
        'blood_pressure': None,
        'cholesterol': None,
        'blood_glucose': None
    }
    
    print("\n=== Health Risk Assessment ===")
    results = predict_health_status(user_input)
    
    if results:
        predictions, bmi = results
        print(f"\nBMI: {bmi:.1f}")
        print("\nDisease Risk Predictions:")
        for disease, risk in predictions.items():
            print(f"{disease.upper()}: {risk:.1f}%")
    else:
        print("Failed to generate predictions. Please check if the model files exist.")


=== Health Risk Assessment ===
Using average value for blood_pressure: 130.83
Using average value for cholesterol: 98.87
Using average value for blood_glucose: 18.57

Loading model...
✓ Loaded combined model from HDF5
✓ Loaded scaler




BMI: 20.2

Disease Risk Predictions:
ANEMIA: 100.0%
CHOLESTEROL: 0.0%
CKD: 100.0%
DIABETES: 0.0%
HEART: 0.0%
HYPERTENSION: 0.0%
MS: 0.0%
NAFLD: 100.0%
OBESITY: 0.0%
STROKE: 0.0%
