In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
import warnings
warnings.filterwarnings('ignore')
import joblib
import h5py
import json
import os

# Step 0, chek dataset availability

def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def check_data_directory():
    data_dir = os.path.join('dataset', 'health_data1')
    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        print("Available directories:", os.listdir('.'))
        return False
    
    print("\nAvailable files in directory:")
    for file in os.listdir(data_dir):
        print(f"- {file}")
    
    return True

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.XPT'):
            return pd.read_sas(file_path)
        else:
            print(f"Unsupported file format: {file_path}")
            return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

set_project_directory()

if check_data_directory():
    data_directory = os.path.join('dataset', 'health_data1')
    
    for filename in os.listdir(data_directory):
        file_path = os.path.join(data_directory, filename)
        print(f"\nProcessing: {filename}")
        data = load_data(file_path)
        if data is not None:
            print(f"Successfully loaded {filename}")
            print("First few rows:")
            print(data.head())
else:
    print("Please check your directory structure and file locations")

folder_path = 'dataset/health_data1/'

try:
    print("Daftar file di dalam folder:", os.listdir(folder_path))
    required_files = [
        'anemia-dataset.csv',
        'cholesterol-dataset.csv',
        'chronic-kidney-disease-dataset.csv',
        'diabetes-dataset.csv',
        'heart-disease-dataset.csv',
        'hypertension-dataset.csv',
        'metabolic-syndrome-dataset.csv',
        'nafld1-dataset.csv',
        'obesity-dataset.csv',
        'stroke-dataset.csv'
    ]
    
    missing_files = [f for f in required_files if not os.path.isfile(os.path.join(folder_path, f))]
    
    if missing_files:
        print(f"File yang hilang: {', '.join(missing_files)}")
    else:
        print("Semua file dataset ditemukan.")

except FileNotFoundError:
    print(f"Folder tidak ditemukan: {folder_path}")

Working directory set to: c:\Users\Dana\Documents\Kuliah\Bangkit\Capstone-C242-PS384_Project01

Available files in directory:
- anemia-dataset.csv
- cholesterol-dataset.csv
- chronic-kidney-disease-dataset.csv
- combined_dataset.csv
- diabetes-dataset.csv
- health_data1_combined.csv
- heart-disease-dataset.csv
- hypertension-dataset.csv
- metabolic-syndrome-dataset.csv
- nafld1-dataset.csv
- nafld2-dataset.csv
- nwtco-dataset.csv
- obesity-dataset.csv
- stroke-dataset.csv

Processing: anemia-dataset.csv
Successfully loaded anemia-dataset.csv
First few rows:
   Gender  Hemoglobin   MCH  MCHC   MCV  Result
0       1        14.9  22.7  29.1  83.7       0
1       0        15.9  25.4  28.3  72.0       0
2       0         9.0  21.5  29.6  71.2       1
3       0        14.9  16.0  31.4  87.5       0
4       1        14.7  22.0  28.2  99.5       0

Processing: cholesterol-dataset.csv
Successfully loaded cholesterol-dataset.csv
First few rows:
   age  sex  cp  trestbps  fbs  restecg  thalach  e

In [23]:
# Step 1, preprocess available data and merge it in the end
columns = [
    "height", "weight", "gender", "age", "bp", "bc", "bg", "bmi", "sodium", 
    "fat", "protein", "carbs", "anemia", "cholesterol", "ckd", "diabetes", 
    "heart", "hypertension", "ms", "nafld", "obesity", "stroke"
]

def create_data_dict(**kwargs):
    base_dict = {
        "height": np.nan, "weight": np.nan, "gender": np.nan, "age": np.nan,
        "bp": np.nan, "bc": np.nan, "bg": np.nan, "bmi": np.nan,
        "sodium": np.nan, "fat": np.nan, "protein": np.nan, "carbs": np.nan,
        "anemia": 0, "cholesterol": 0, "ckd": 0, "diabetes": 0,
        "heart": 0, "hypertension": 0, "ms": 0, "nafld": 0, "obesity": 0, "stroke": 0
    }
    base_dict.update({k: v for k, v in kwargs.items() if v is not None})
    return base_dict

all_data = []

# 1. Anemia dataset
anemia_data = pd.read_csv(os.path.join(folder_path, "anemia-dataset.csv"))
all_data.extend([
    create_data_dict(
        gender=1 if row["Gender"] == "Male" else 0,
        bg=round(row["Hemoglobin"] * 7, 1),
        anemia=row["Result"]
    )
    for _, row in anemia_data.iterrows()
])

# 2. Cholesterol dataset
chol_data = pd.read_csv(os.path.join(folder_path, "cholesterol-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=row["sex"],
        bp=row["trestbps"],
        bc=row["chol"],
        bg=120 if row["fbs"] == 1 else 100,
        cholesterol=1 if row["chol"] > 240 else 0
    )
    for _, row in chol_data.iterrows()
])

# 3. Chronic Kidney Disease dataset
ckd_data = pd.read_csv(os.path.join(folder_path, "chronic-kidney-disease-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        bp=row["bp"],
        bg=row["bgr"],
        sodium=row["sod"],
        anemia=1 if row["ane"] == "yes" else 0,
        ckd=1 if row["classification"] == "ckd" else 0,
        diabetes=1 if row["dm"] == "yes" else 0,
        heart=1 if row["cad"] == "yes" else 0,
        hypertension=1 if row["htn"] == "yes" else 0
    )
    for _, row in ckd_data.iterrows()
])

# 4. Diabetes dataset
diabetes_data = pd.read_csv(os.path.join(folder_path, "diabetes-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Sex"] == "Male" else 0,
        bmi=row["BMI"],
        cholesterol=row["HighChol"],
        diabetes=1 if row["Diabetes"] == 1 else 0,
        hypertension=row["HighBP"]
    )
    for _, row in diabetes_data.iterrows()
])

# 5. Heart Disease dataset
heart_data = pd.read_csv(os.path.join(folder_path, "heart-disease-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == 1 else 0,
        bp=row["trestbps"],
        bc=row["chol"],
        heart=1 if row["target"] == 1 else 0
    )
    for _, row in heart_data.iterrows()
])

# 6. Hypertension dataset
hypertension_data = pd.read_csv(os.path.join(folder_path, "hypertension-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == 1 else 0,
        bp=row["trestbps"],
        bc=row["chol"],
        hypertension=1 if row["target"] == 1 else 0
    )
    for _, row in hypertension_data.iterrows()
])

# 7. Metabolic Syndrome dataset
ms_data = pd.read_csv(os.path.join(folder_path, "metabolic-syndrome-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Sex"] == "Male" else 0,
        bg=row["BloodGlucose"],
        bmi=row["BMI"],
        ms=1 if row["MetabolicSyndrome"] == 1 else 0
    )
    for _, row in ms_data.iterrows()
])

# 8. NAFLD dataset
nafld_data = pd.read_csv(os.path.join(folder_path, "nafld1-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=row["male"],
        weight=row["weight"],
        height=row["height"],
        bmi=round(row["bmi"],1),
        nafld=row["status"]
    )
    for _, row in nafld_data.iterrows()
])

# 9. Obesity dataset
obesity_data = pd.read_csv(os.path.join(folder_path, "obesity-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["Age"],
        gender=1 if row["Gender"] == "Male" else 0,
        weight=row["Weight"],
        height=row["Height"],
        bmi=row["BMI"],
        obesity=1 if row["Label"] == "Obesity" else 0
    )
    for _, row in obesity_data.iterrows()
])

# 10. Stroke dataset
stroke_data = pd.read_csv(os.path.join(folder_path, "stroke-dataset.csv"))
all_data.extend([
    create_data_dict(
        age=row["age"],
        gender=1 if row["sex"] == "Male" else 0,
        bc=row["heart_disease"],
        bmi=row["bmi"],
        heart=row["heart_disease"],
        hypertension=row["hypertension"],
        stroke=row["stroke"]
    )
    for _, row in stroke_data.iterrows()
])

combined_data = pd.DataFrame(all_data)

output_path = os.path.join(folder_path, "combined_dataset.csv")
combined_data.to_csv(output_path, index=False)

print(f"Combined dataset succesfully savd on {output_path}")
print("\ncombined dataset inform:")
print(combined_data.info())
print("\ncombined dataset stats:")
print(combined_data.describe())

Combined dataset succesfully savd on dataset/health_data1/combined_dataset.csv

combined dataset inform:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160892 entries, 0 to 160891
Data columns (total 22 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   height        14489 non-null   float64
 1   weight        12871 non-null   float64
 2   gender        160492 non-null  float64
 3   age           159462 non-null  float64
 4   bp            27799 non-null   float64
 5   bc            68321 non-null   float64
 6   bg            4481 non-null    float64
 7   bmi           126673 non-null  float64
 8   sodium        313 non-null     float64
 9   fat           0 non-null       float64
 10  protein       0 non-null       float64
 11  carbs         0 non-null       float64
 12  anemia        160892 non-null  float64
 13  cholesterol   160892 non-null  float64
 14  ckd           160892 non-null  int64  
 15  diabetes      160892 non-null  

In [24]:
# Step 2, load newly created combinedd dataset (combined_dataset.csv)
def load_and_process_data(file_path='dataset/health_data1/combined_dataset.csv'):
    df = pd.read_csv(file_path)
    
    available_features = ['height', 'weight', 'gender', 'age', 'bp', 'bc', 'bg', 'bmi', 'sodium']
    
    target_variables = ['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart',
                       'hypertension', 'ms', 'nafld', 'obesity', 'stroke']
    
    numeric_imputer = SimpleImputer(strategy='median')
    
    numeric_features = df[available_features].select_dtypes(include=['float64', 'int64'])
    
    imputed_numeric = pd.DataFrame(
        numeric_imputer.fit_transform(numeric_features),
        columns=numeric_features.columns,
        index=numeric_features.index
    )
    
    for col in numeric_features.columns:
        df[col] = imputed_numeric[col]
    
    df[target_variables] = df[target_variables].fillna(0)
    
    return df[available_features], df[target_variables]

In [25]:
# Step 3, create the calculation for the derived data from main user input data
def calculate_derived_features(height, weight, gender, age, blood_pressure, cholesterol, blood_glucose):
    # bmi calculation
    height_m = height / 100
    bmi = weight / (height_m ** 2)
    
    # sodium calculation
    sodium = weight * 20
    
    # fat based on gender calculatonn
    fat = weight * (0.15 if gender == 1 else 0.25)
    
    # chols level calc
    cholesterol_level = (bmi * 2) + (age * 0.15) + (blood_pressure * 0.05) + (blood_glucose * 0.02) + 150
    
    # protein calc
    protein = weight * 0.9
    
    # carbo calc
    carbs = weight * 3
    
    return {
        'bmi': bmi,
        'sodium': sodium,
        'fat': fat,
        'cholesterol_level': cholesterol_level,
        'protein': protein,
        'carbs': carbs
    }

In [26]:
# Step 4, model training set + save
def train_disease_models(X, y):
    """
    Fungsi yang diperbarui dengan parameter yang dioptimalkan
    """
    models = {}
    scaler = StandardScaler()
    
    X = np.array(X)
    
    X_scaled = scaler.fit_transform(X)
    
    base_params = {
        'n_estimators': 200,
        'max_depth': 15,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'random_state': 42,
        'class_weight': 'balanced',
        'n_jobs': -1
    }
    
    disease_params = {
        'heart': {'n_estimators': 300, 'max_depth': 20},
        'hypertension': {'n_estimators': 300, 'max_depth': 20},
        'diabetes': {'n_estimators': 300, 'max_depth': 20},
        'cholesterol': {'n_estimators': 300, 'max_depth': 20}
    }
    
    for disease in y.columns:
        y_disease = y[disease].values
        
        if disease in ['heart', 'hypertension', 'diabetes', 'cholesterol']:
            smote = SMOTE(random_state=42)
            X_balanced, y_balanced = smote.fit_resample(X_scaled, y_disease)
        else:
            X_balanced, y_balanced = X_scaled, y_disease
        
        params = {**base_params, **disease_params.get(disease, {})}
        
        model = RandomForestClassifier(**params)
        model.fit(X_balanced, y_balanced)
        
        models[disease] = model
    
    return models, scaler

# def save_models(models, scaler, save_dir='models'):
#     if not os.path.exists(save_dir):
#         os.makedirs(save_dir)
    
#     for disease, model in models.items():
#         model_path = os.path.join(save_dir, f'{disease}_model.joblib')
#         joblib.dump(model, model_path)
    
#     scaler_path = os.path.join(save_dir, 'scaler.joblib')
#     joblib.dump(scaler, scaler_path)
#     print(f"Models and scaler saved in {save_dir}/")

def save_models(models, scaler, save_dir='models'):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    models_path = os.path.join(save_dir, 'disease-prediction-model.joblib')
    joblib.dump(models, models_path)

    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    joblib.dump(scaler, scaler_path)
    print(f"Models saved in {models_path} and scaler saved in {scaler_path}/")


In [27]:
def evaluate_model_accuracy(models, X, y):
    print("accuracy model eval:\n")
    print("{:<15} {:<10} {:<10} {:<10} {:<10}".format(
        "Disease", "Accurancy", "Precision", "Recall", "F1-Score"
    ))
    print("-" * 55)
    
    overall_metrics = {
        'accuracy': 0,
        'precision': 0,
        'recall': 0,
        'f1': 0
    }
    
    if isinstance(X, pd.DataFrame):
        X = X.values
    
    for disease, model in models.items():
        try:
            y_pred = model.predict(X)
            y_true = y[disease]
            
            accuracy = accuracy_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
            
            overall_metrics['accuracy'] += accuracy
            overall_metrics['precision'] += precision
            overall_metrics['recall'] += recall
            overall_metrics['f1'] += f1
            
            print("{:<15} {:<10.2f} {:<10.2f} {:<10.2f} {:<10.2f}".format(
                disease,
                accuracy * 100,
                precision * 100,
                recall * 100,
                f1 * 100
            ))
            
        except Exception as e:
            print(f"Error evaluating {disease} model: {str(e)}")
    
    n_models = len(models)
    print("\nRata-rata Metrik:")
    print("Akurasi   : {:.2f}%".format(overall_metrics['accuracy'] / n_models * 100))
    print("Presisi   : {:.2f}%".format(overall_metrics['precision'] / n_models * 100))
    print("Recall    : {:.2f}%".format(overall_metrics['recall'] / n_models * 100))
    print("F1-Score  : {:.2f}%".format(overall_metrics['f1'] / n_models * 100))

# Import tambahan yang diperlukan
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Cara penggunaan:
def main():
    # Load dan preprocess data
    X, y = load_and_process_data()
    
    # Split data dengan stratifikasi
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42,
        stratify=y['heart']  # Stratifikasi berdasarkan penyakit dengan imbalance terburuk
    )
    
    # Train model
    models, scaler = train_disease_models(X_train, y_train)
    
    # Evaluasi model
    print("\nHasil Evaluasi pada Data Testing:")
    evaluate_model_accuracy(models, X_test, y_test)

if __name__ == "__main__":
    main()


Hasil Evaluasi pada Data Testing:
accuracy model eval:

Disease         Accurancy  Precision  Recall     F1-Score  
-------------------------------------------------------
anemia          99.54      99.09      99.54      99.32     
cholesterol     60.07      55.89      60.07      57.90     
ckd             99.82      99.63      99.82      99.73     
diabetes        78.31      61.33      78.31      68.79     
heart           16.88      79.57      16.88      27.81     
hypertension    60.71      36.86      60.71      45.87     
ms              99.53      99.07      99.53      99.30     
nafld           99.05      98.11      99.05      98.58     
obesity         100.00     100.00     100.00     100.00    
stroke          87.52      89.08      87.52      81.73     

Rata-rata Metrik:
Akurasi   : 80.14%
Presisi   : 81.86%
Recall    : 80.14%
F1-Score  : 77.90%


In [28]:
# Step 5 load model
def load_models(save_dir='models'):
    if not os.path.exists(save_dir):
        raise FileNotFoundError(f"Directory {save_dir} not found. Please train the models first.")
    
    models = {}
    for model_file in os.listdir(save_dir):
        if model_file.endswith('_model.joblib'):
            disease = model_file.replace('_model.joblib', '')
            model_path = os.path.join(save_dir, model_file)
            models[disease] = joblib.load(model_path)
    
    scaler_path = os.path.join(save_dir, 'scaler.joblib')
    if not os.path.exists(scaler_path):
        raise FileNotFoundError("Scaler file not found. Please train the models first.")
    scaler = joblib.load(scaler_path)
    
    print(f"Models and scaler loaded from {save_dir}/")
    return models, scaler

In [29]:
# Step 6, predictin the data
def predict_health_risks(user_input, models, scaler):
    derived = calculate_derived_features(
        user_input['height'],
        user_input['weight'],
        user_input['gender'],
        user_input['age'],
        user_input['blood_pressure'],
        user_input['cholesterol'],
        user_input['blood_glucose']
    )
    
    features = pd.DataFrame([[
        user_input['height'],
        user_input['weight'],
        user_input['gender'],
        user_input['age'],
        user_input['blood_pressure'],
        user_input['cholesterol'],
        user_input['blood_glucose'],
        derived['bmi'],
        derived['sodium']
    ]])
    
    features = features.fillna(method='ffill').fillna(method='bfill')
    
    features_scaled = scaler.transform(features)
    
    predictions = {}
    for disease, model in models.items():
        try:
            proba = model.predict_proba(features_scaled)[0]
            if len(proba) > 1:
                prob = proba[1]
            else:
                prob = model.predict(features_scaled)[0]
        except:
            prob = model.predict(features_scaled)[0]
        
        predictions[disease] = float(prob) * 100
    
    return predictions, derived

In [30]:
# Step 7, here we can input the user data
def get_average_values(df):
    return {
        'height': df['height'].mean(),
        'weight': df['weight'].mean(),
        'gender': round(df['gender'].mean()),
        'age': df['age'].mean(),
        'blood_pressure': df['bp'].mean(),
        'cholesterol': df['bc'].mean(),
        'blood_glucose': df['bg'].mean()
    }

def main():
    df = pd.read_csv('dataset/health_data1/combined_dataset.csv')
    avg_values = get_average_values(df)
    
    X, y = load_and_process_data()
    models, scaler = train_disease_models(X, y)
    
    save_models(models, scaler)
    
    # default param value is "None"
    user_input = {
        'height': 163,
        'weight': 60,
        'gender': 1, # 1=male, 0=female
        'age': 20,
        'blood_pressure': None,
        'cholesterol': None,
        'blood_glucose': None
    }
    
    for key in user_input:
        if user_input[key] is None:
            if key == 'blood_pressure':
                user_input[key] = avg_values['blood_pressure']
            elif key == 'blood_glucose':
                user_input[key] = avg_values['blood_glucose']
            elif key == 'cholesterol':
                user_input[key] = avg_values['cholesterol']
            else:
                user_input[key] = avg_values[key]
            print(f"Using average value for {key}: {user_input[key]:.2f}")
    
    predictions, derived_features = predict_health_risks(user_input, models, scaler)
    
    print("\nDerived Features:")
    for feature, value in derived_features.items():
        print(f"{feature}: {value:.2f}")
    
    print("\nDisease Risk Predictions:")
    for disease, risk in predictions.items():
        print(f"{disease}: {risk:.2f}%")

if __name__ == "__main__":
    main()

Models saved in models\disease-prediction-model.joblib and scaler saved in models\scaler.joblib/
Using average value for blood_pressure: 130.83
Using average value for cholesterol: 98.87
Using average value for blood_glucose: 106.50

Derived Features:
bmi: 22.58
sodium: 1200.00
fat: 9.00
cholesterol_level: 206.84
protein: 54.00
carbs: 180.00

Disease Risk Predictions:
anemia: 0.00%
cholesterol: 12.55%
ckd: 8.49%
diabetes: 3.03%
heart: 36.93%
hypertension: 6.18%
ms: 7.32%
nafld: 0.49%
obesity: 0.00%
stroke: 0.47%


In [31]:
# Step 8, convert saved model from joblib to hdf5
# def convert_all_joblib_to_hdf5(models_dir='models'):
#     for model_file in os.listdir(models_dir):
#         if model_file.endswith('_model.joblib'):
#             joblib_model_path = os.path.join(models_dir, model_file)
#             hdf5_model_path = os.path.join(models_dir, model_file.replace('.joblib', '.hdf5'))
            
#             model = joblib.load(joblib_model_path)
            
#             model_params = {
#                 'n_estimators': getattr(model, 'n_estimators', None),
#                 'max_depth': getattr(model, 'max_depth', None),
#                 'min_samples_split': getattr(model, 'min_samples_split', None),
#                 'min_samples_leaf': getattr(model, 'min_samples_leaf', None),
#                 'feature_importances': getattr(model, 'feature_importances_', None)
#             }
            
#             with h5py.File(hdf5_model_path, 'w') as hdf:
#                 for key, value in model_params.items():
#                     if value is not None and (not isinstance(value, (list, np.ndarray)) or len(value) > 0):
#                         hdf.create_dataset(key, data=value)
#                     else:
#                         print(f"Warning: Parameter '{key}' is empty or None for model '{model_file}'. Skipping this parameter.")
            
#             print(f"Model '{model_file}' converted and saved to {hdf5_model_path}")

# def convert_all_joblib_to_json(models_dir='models'):
#     for model_file in os.listdir(models_dir):
#         if model_file.endswith('_model.joblib'):
#             joblib_model_path = os.path.join(models_dir, model_file)
#             json_model_path = os.path.join(models_dir, model_file.replace('.joblib', '.json'))
            
#             model = joblib.load(joblib_model_path)
            
#             model_params = {
#                 'n_estimators': getattr(model, 'n_estimators', None),
#                 'max_depth': getattr(model, 'max_depth', None),
#                 'min_samples_split': getattr(model, 'min_samples_split', None),
#                 'min_samples_leaf': getattr(model, 'min_samples_leaf', None),
#                 'feature_importances': getattr(model, 'feature_importances_', None)
#             }
            
#             for key, value in model_params.items():
#                 if isinstance(value, (np.ndarray)):
#                     model_params[key] = value.tolist()
            
#             with open(json_model_path, 'w') as json_file:
#                 json.dump(model_params, json_file, indent=4)
            
#             print(f"Model '{model_file}' converted and saved to {json_model_path}")

# convert_all_joblib_to_hdf5()
# convert_all_joblib_to_json()

def convert_model_to_hdf5_n_json(models_dir='models'):
    models_path = os.path.join(models_dir, 'disease-prediction-model.joblib')
    hdf5_model_path = os.path.join(models_dir, 'disease-prediction-model.hdf5')
    json_model_path = os.path.join(models_dir, 'disease-prediction-model.json')

    models = joblib.load(models_path)

    with h5py.File(hdf5_model_path, 'w') as hdf:
        model_params_dict = {}
        for disease, model in models.items():
            model_params = {
                'n_estimators': getattr(model, 'n_estimators', None),
                'max_depth': getattr(model, 'max_depth', None),
                'min_samples_split': getattr(model, 'min_samples_split', None),
                'min_samples_leaf': getattr(model, 'min_samples_leaf', None),
                'feature_importances': getattr(model, 'feature_importances_', None)
            }
            model_params_dict[disease] = {key: value.tolist() if isinstance(value, np.ndarray) else value for key, value in model_params.items()}

            for key, value in model_params.items():
                if value is not None and (not isinstance(value, (list, np.ndarray)) or len(value) > 0):
                    hdf.create_dataset(f"{disease}/{key}", data=value)
                else:
                    print(f"Warning: Parameter '{key}' is empty or None for model '{disease}'. Skipping this parameter.")

    with open(json_model_path, 'w') as json_file:
        json.dump(model_params_dict, json_file, indent=4)

    print(f"Model converted and saved to {hdf5_model_path} and parameters saved to {json_model_path}")

convert_model_to_hdf5_n_json()

Model converted and saved to models\disease-prediction-model.hdf5 and parameters saved to models\disease-prediction-model.json
