In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt
import os
import json
import joblib
import time

# Step 0, chek dataset availability

def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def check_data_directory():
    data_dir = os.path.join('dataset', 'health_data1')
    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        print("Available directories:", os.listdir('.'))
        return False
    
    print("\nAvailable files in directory:")
    for file in os.listdir(data_dir):
        print(f"- {file}")
    
    return True

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.XPT'):
            return pd.read_sas(file_path)
        else:
            print(f"Unsupported file format: {file_path}")
            return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

set_project_directory()

if check_data_directory():
    data_directory = os.path.join('dataset', 'health_data1')
    
    for filename in os.listdir(data_directory):
        file_path = os.path.join(data_directory, filename)
        print(f"\nProcessing: {filename}")
        data = load_data(file_path)
        if data is not None:
            print(f"Successfully loaded {filename}")
            print("First few rows:")
            print(data.head())
else:
    print("Please check your directory structure and file locations")

folder_path = 'dataset/health_data1/'

try:
    print("File inside the dataset folder:", os.listdir(folder_path))
    required_files = [
        'anemia-dataset.csv',
        'cholesterol-dataset.csv',
        'chronic-kidney-disease-dataset.csv',
        'diabetes-dataset.csv',
        'heart-disease-dataset.csv',
        'hypertension-dataset.csv',
        'metabolic-syndrome-dataset.csv',
        'nafld1-dataset.csv',
        'obesity-dataset.csv',
        'stroke-dataset.csv'
    ]
    
    missing_files = [f for f in required_files if not os.path.isfile(os.path.join(folder_path, f))]
    
    if missing_files:
        print(f"Missing files: {', '.join(missing_files)}")
    else:
        print("All files are available.")

except FileNotFoundError:
    print(f"Folder missing: {folder_path}")


Working directory set to: c:\Users\Dana\Documents\Kuliah\Bangkit\Capstone-C242-PS384_Project01

Available files in directory:
- anemia-dataset.csv
- cholesterol-dataset.csv
- chronic-kidney-disease-dataset.csv
- combined_dataset.csv
- diabetes-dataset.csv
- health_data1_combined.csv
- heart-disease-dataset.csv
- hypertension-dataset.csv
- metabolic-syndrome-dataset.csv
- nafld1-dataset.csv
- nafld2-dataset.csv
- nwtco-dataset.csv
- obesity-dataset.csv
- stroke-dataset.csv

Processing: anemia-dataset.csv
Successfully loaded anemia-dataset.csv
First few rows:
   Gender  Hemoglobin   MCH  MCHC   MCV  Result
0       1        14.9  22.7  29.1  83.7       0
1       0        15.9  25.4  28.3  72.0       0
2       0         9.0  21.5  29.6  71.2       1
3       0        14.9  16.0  31.4  87.5       0
4       1        14.7  22.0  28.2  99.5       0

Processing: cholesterol-dataset.csv
Successfully loaded cholesterol-dataset.csv
First few rows:
   age  sex  cp  trestbps  fbs  restecg  thalach  

  return pd.read_csv(file_path)


Successfully loaded health_data1_combined.csv
First few rows:
  gender  hemoglobin  age  blood_pressure  cholesterol  glucose  bmi  height  \
0    1.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
1    0.0        15.9  NaN             NaN          NaN      NaN  NaN     NaN   
2    0.0         9.0  NaN             NaN          NaN      NaN  NaN     NaN   
3    0.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
4    1.0        14.7  NaN             NaN          NaN      NaN  NaN     NaN   

   weight  HDL  Height  Weight  
0     NaN  NaN     NaN     NaN  
1     NaN  NaN     NaN     NaN  
2     NaN  NaN     NaN     NaN  
3     NaN  NaN     NaN     NaN  
4     NaN  NaN     NaN     NaN  

Processing: heart-disease-dataset.csv
Successfully loaded heart-disease-dataset.csv
First few rows:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2

In [7]:
!conda activate py310


EnvironmentNameNotFound: Could not find conda environment: py310
You can list all discoverable environments with `conda info --envs`.




In [6]:
import tensorflow as tf

# Cek apakah GPU tersedia
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPU tersedia: {gpus}")
else:
    print("Tidak ada GPU yang tersedia.")

Tidak ada GPU yang tersedia.


In [9]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Cek apakah GPU tersedia dan atur penggunaan GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU tersedia: {gpus}")
    except RuntimeError as e:
        print(e)
else:
    print("Tidak ada GPU yang tersedia, menggunakan CPU.")

def calculate_parameters(data):
    data['bmi'] = data['weight'] / (data['height'] ** 2)
    data['sodium'] = data['weight'] * 20
    data['fat'] = np.where(data['gender'] == 'Laki-laki', 
                           data['weight'] * 0.15, 
                           data['weight'] * 0.25)
    data['cholesterol'] = (data['bmi'] * 2) + (data['age'] * 0.15) + (data['bp'] * 0.05) + (data['bg'] * 0.02) + 150
    data['protein'] = data['weight'] * 0.9
    data['carbs'] = data['weight'] * 3
    return data

# Load dataset
data_path = 'dataset/health_data1/combined_dataset.csv'
data = pd.read_csv(data_path)

# Hitung parameter tambahan
data = calculate_parameters(data)

# Pilih fitur dan target
features = data[['height', 'weight', 'gender', 'age', 'bp', 
                 'bc', 'bg', 'bmi', 'sodium', 'fat', 
                 'cholesterol', 'protein', 'carbs']]
targets = data[['anemia', 'cholesterol', 'ckd', 'diabetes', 'heart', 
                 'hypertension', 'ms', 'nafld', 'obesity', 'stroke']]

# Encode gender
features['gender'] = features['gender'].map({'Laki-laki': 1, 'Perempuan': 0})

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

# Standardisasi fitur
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Membangun model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(y_train.shape[1], activation='sigmoid')  # Output layer for multi-label classification
])

# Compile model dengan MSE sebagai loss
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# Melatih model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Simpan model
model_dir = 'models/disease-prediction-model-fix'
os.makedirs(model_dir, exist_ok=True)

model.save(os.path.join(model_dir, 'disease-prediction-model-tf.h5'))

# Simpan arsitektur model ke JSON
model_json = model.to_json()
with open(os.path.join(model_dir, 'disease-prediction-model-tf.json'), 'w') as json_file:
    json_file.write(model_json)

print("Model telah disimpan.")

Tidak ada GPU yang tersedia, menggunakan CPU.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['gender'] = features['gender'].map({'Laki-laki': 1, 'Perempuan': 0})
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Epoch 1/100
Epoch 2/100
Epoch 3/100

KeyboardInterrupt: 