In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import os

# Step 0: Check if datasets exist

def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def calculate_bmi(weight, height):
    if height > 0: # requirement tambahan biar data yang dimasukkan itu harus lebih dari 0
        return weight / ((height / 100) ** 2)
    else:
        return np.nan

def check_data_directory():
    data_dir = os.path.join('dataset', 'health_data1')
    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        print("Available directories:", os.listdir('.'))
        return False
    
    print("\nAvailable files in directory:")
    for file in os.listdir(data_dir):
        print(f"- {file}")
    
    return True

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.XPT'):
            return pd.read_sas(file_path)
        else:
            print(f"Unsupported file format: {file_path}")
            return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

set_project_directory()

if check_data_directory():
    data_directory = os.path.join('dataset', 'health_data1')
    
    for filename in os.listdir(data_directory):
        file_path = os.path.join(data_directory, filename)
        print(f"\nProcessing: {filename}")
        data = load_data(file_path)
        if data is not None:
            print(f"Successfully loaded {filename}")
            print("First few rows:")
            print(data.head())
else:
    print("Please check your directory structure and file locations")

folder_path = 'dataset/health_data1/'  # Ganti dengan jalur folder yang sesuai

# Cek apakah folder dan file-file dataset ada
try:
    print("Daftar file di dalam folder:", os.listdir(folder_path))
    # List semua file yang diperlukan
    required_files = [
        'anemia-dataset.csv',
        'cholesterol-dataset.csv',
        'chronic-kidney-disease-dataset.csv',
        'diabetes-dataset.csv',
        'heart-disease-dataset.csv',
        'hypertension-dataset.csv',
        'metabolic-syndrome-dataset.csv',
        'nafld1-dataset.csv',
        'obesity-dataset.csv',
        'stroke-dataset.csv'
    ]
    
    # Cek keberadaan masing-masing file
    missing_files = [f for f in required_files if not os.path.isfile(os.path.join(folder_path, f))]
    
    if missing_files:
        print(f"File yang hilang: {', '.join(missing_files)}")
    else:
        print("Semua file dataset ditemukan.")

except FileNotFoundError:
    print(f"Folder tidak ditemukan: {folder_path}")

Working directory set to: c:\Users\Dana\Documents\Kuliah\Bangkit\Capstone-C242-PS384_Project01

Available files in directory:
- anemia-dataset.csv
- cholesterol-dataset.csv
- chronic-kidney-disease-dataset.csv
- diabetes-dataset.csv
- health_data1_combined.csv
- heart-disease-dataset.csv
- hypertension-dataset.csv
- metabolic-syndrome-dataset.csv
- nafld1-dataset.csv
- nafld2-dataset.csv
- nwtco-dataset.csv
- obesity-dataset.csv
- stroke-dataset.csv

Processing: anemia-dataset.csv
Successfully loaded anemia-dataset.csv
First few rows:
   Gender  Hemoglobin   MCH  MCHC   MCV  Result
0       1        14.9  22.7  29.1  83.7       0
1       0        15.9  25.4  28.3  72.0       0
2       0         9.0  21.5  29.6  71.2       1
3       0        14.9  16.0  31.4  87.5       0
4       1        14.7  22.0  28.2  99.5       0

Processing: cholesterol-dataset.csv
Successfully loaded cholesterol-dataset.csv
First few rows:
   age  sex  cp  trestbps  fbs  restecg  thalach  exang  oldpeak  slope ca

  return pd.read_csv(file_path)


Successfully loaded health_data1_combined.csv
First few rows:
  gender  hemoglobin  age  blood_pressure  cholesterol  glucose  bmi  height  \
0    1.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
1    0.0        15.9  NaN             NaN          NaN      NaN  NaN     NaN   
2    0.0         9.0  NaN             NaN          NaN      NaN  NaN     NaN   
3    0.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
4    1.0        14.7  NaN             NaN          NaN      NaN  NaN     NaN   

   weight  HDL  Height  Weight  
0     NaN  NaN     NaN     NaN  
1     NaN  NaN     NaN     NaN  
2     NaN  NaN     NaN     NaN  
3     NaN  NaN     NaN     NaN  
4     NaN  NaN     NaN     NaN  

Processing: heart-disease-dataset.csv
Successfully loaded heart-disease-dataset.csv
First few rows:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2

In [9]:
# Step 1: Load all datasets
datasets = {
    "anemia": 'dataset/health_data1/anemia-dataset.csv',
    "cholesterol": 'dataset/health_data1/cholesterol-dataset.csv',
    "kidney_disease": 'dataset/health_data1/chronic-kidney-disease-dataset.csv',
    "diabetes": 'dataset/health_data1/diabetes-dataset.csv',
    "heart_disease": 'dataset/health_data1/heart-disease-dataset.csv',
    "hypertension": 'dataset/health_data1/hypertension-dataset.csv',
    "metabolic_syndrome": 'dataset/health_data1/metabolic-syndrome-dataset.csv',
    "nafld": 'dataset/health_data1/nafld1-dataset.csv',
    "obesity": 'dataset/health_data1/obesity-dataset.csv',
    "stroke": 'dataset/health_data1/stroke-dataset.csv'
}

In [10]:
# Step 2: Calculate derived features based on user input
def calculate_derived_features(user_input):
    # Extract user input
    height = user_input['height']
    weight = user_input['weight']
    age = user_input['age']
    gender = user_input['gender']
    blood_pressure = user_input['blood_pressure']
    cholesterol = user_input['cholesterol']
    blood_sugar = user_input['blood_sugar']

    # Calculate derived features
    bmi = weight / (height / 100) ** 2
    sodium = weight * 20
    if gender == 'Male':
        fat = weight * 0.15
    else:
        fat = weight * 0.25
    cholesterol_level = (bmi * 2) + (age * 0.15) + (blood_pressure * 0.05) + (blood_sugar * 0.02) + 150
    protein = weight * 0.9
    carbs = weight * 3

    # Return calculated features as a dictionary
    derived_features = {
        'bmi': bmi,
        'sodium': sodium,
        'fat': fat,
        'cholesterol': cholesterol_level,
        'protein': protein,
        'carbs': carbs
    }
    
    return derived_features

In [11]:
# Step 3
def prepare_data_for_model(datasets):
    features_list = []
    target_list = []

    for dataset_name, dataset in datasets.items():
        # Load dataset
        df = pd.read_csv(dataset)
        
        # Create temporary dataframe untuk fitur yang diperlukan
        temp_df = pd.DataFrame()
        
        # Set target berdasarkan nama dataset
        if dataset_name == "stroke":
            target_col = 'stroke'
        elif dataset_name == "heart_disease":
            target_col = 'target'
        elif dataset_name == "diabetes":
            target_col = 'Diabetes'
        elif dataset_name == "kidney_disease":
            target_col = 'classification'
        elif dataset_name == "cholesterol":
            target_col = 'num'
        elif dataset_name == "anemia":
            target_col = 'Result'
        elif dataset_name == "hypertension":
            target_col = 'target'
        elif dataset_name == "metabolic_syndrome":
            target_col = 'MetabolicSyndrome'
        elif dataset_name == "nafld":
            target_col = 'status'
        elif dataset_name == "obesity":
            target_col = 'Label'
            
        # Pastikan kolom target ada
        if target_col not in df.columns:
            print(f"Warning: {target_col} not found in {dataset_name}, skipping dataset")
            continue
            
        # Simpan nilai target
        target = df[target_col]
        
        # Pilih dan standardisasi fitur
        if 'age' in df.columns:
            temp_df['age'] = df['age']
        if 'gender' in df.columns or 'sex' in df.columns:
            temp_df['gender'] = df['gender'] if 'gender' in df.columns else df['sex']
        if 'bmi' in df.columns:
            temp_df['bmi'] = df['bmi']
        if 'blood_pressure' in df.columns or 'trestbps' in df.columns:
            temp_df['blood_pressure'] = df['blood_pressure'] if 'blood_pressure' in df.columns else df['trestbps']
        if 'cholesterol' in df.columns or 'chol' in df.columns:
            temp_df['cholesterol'] = df['cholesterol'] if 'cholesterol' in df.columns else df['chol']
            
        # Hanya tambahkan dataset jika memiliki minimal 3 fitur yang tidak null
        if temp_df.notna().sum(axis=1).mean() >= 3:
            features_list.append(temp_df)
            target_list.append(target)

    # Gabungkan semua dataset
    if not features_list:
        raise ValueError("No valid datasets found")
        
    features = pd.concat(features_list, ignore_index=True)
    labels = pd.concat(target_list, ignore_index=True)
    
    # Handle missing values
    features = features.fillna(features.mean())
    
    # Standardisasi fitur numerik
    numeric_cols = ['age', 'bmi', 'blood_pressure', 'cholesterol']
    for col in numeric_cols:
        if col in features.columns:
            features[col] = (features[col] - features[col].mean()) / features[col].std()
    
    # Konversi gender ke numeric
    features['gender'] = features['gender'].map({'Male': 1, 'Female': 0, 'M': 1, 'F': 0, 1: 1, 0: 0})
    features['gender'] = features['gender'].fillna(0)
    
    return features, labels

In [12]:
# Step 4: Train the model and save it to disk
def train_and_save_model(data, labels, model_path='health_disease_predictor_model.pkl'):
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)

    # Initialize RandomForestClassifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Test the model
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # Save the model to disk
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")

    return model


In [13]:
# Step 5: Load model and predict disease risk for a new user
def predict_disease_risk(user_input, model_path='health_disease_predictor_model.pkl'):
    # Load the saved model
    model = joblib.load(model_path)
    print(f"Model loaded from {model_path}")
    
    # Calculate derived features for user
    derived_features = calculate_derived_features(user_input)
    
    # Create a dataframe for the model input
    user_data = pd.DataFrame([derived_features])
    
    # Predict the disease risk
    prediction = model.predict(user_data)
    
    return prediction


# Example of user input
user_input = {
    'height': 170,  # in cm
    'weight': 70,  # in kg
    'age': 30,
    'gender': 'Male',
    'blood_pressure': 120,
    'cholesterol': 200,
    'blood_sugar': 90
}

# Prepare the data for model training
data, labels = prepare_data_for_model(datasets)

# Train and save the model
train_and_save_model(data, labels)

# Predict disease risk for a new user
prediction = predict_disease_risk(user_input)
print("Predicted disease risk:", prediction)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3762
           1       0.96      0.97      0.97      4441
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         9
           4       0.00      0.00      0.00         3

    accuracy                           0.96      8224
   macro avg       0.38      0.38      0.38      8224
weighted avg       0.96      0.96      0.96      8224

Model saved to health_disease_predictor_model.pkl
Model loaded from health_disease_predictor_model.pkl


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- bmi
- carbs
- fat
- protein
- sodium
Feature names seen at fit time, yet now missing:
- age
- blood_pressure
- gender
