In [29]:
import pandas as pd
import numpy as np
import os

In [30]:
def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def calculate_bmi(weight, height):
    if height > 0: # requirement tambahan biar data yang dimasukkan itu harus lebih dari 0
        return weight / ((height / 100) ** 2)
    else:
        return np.nan

def check_data_directory():
    data_dir = os.path.join('dataset', 'health_data1')
    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        print("Available directories:", os.listdir('.'))
        return False
    
    print("\nAvailable files in directory:")
    for file in os.listdir(data_dir):
        print(f"- {file}")
    
    return True

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.XPT'):
            return pd.read_sas(file_path)
        else:
            print(f"Unsupported file format: {file_path}")
            return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

set_project_directory()

if check_data_directory():
    data_directory = os.path.join('dataset', 'health_data1')
    
    for filename in os.listdir(data_directory):
        file_path = os.path.join(data_directory, filename)
        print(f"\nProcessing: {filename}")
        data = load_data(file_path)
        if data is not None:
            print(f"Successfully loaded {filename}")
            print("First few rows:")
            print(data.head())
else:
    print("Please check your directory structure and file locations")

Working directory set to: c:\Users\Dana\Documents\Kuliah\Bangkit\Capstone-C242-PS384_Project01

Available files in directory:
- anemia-dataset.csv
- cholesterol-dataset.csv
- chronic-kidney-disease-dataset.csv
- diabetes-dataset.csv
- health_data1_combined.csv
- heart-disease-dataset.csv
- hypertension-dataset.csv
- metabolic-syndrome-dataset.csv
- nafld1-dataset.csv
- nafld2-dataset.csv
- nwtco-dataset.csv
- obesity-dataset.csv
- stroke-dataset.csv

Processing: anemia-dataset.csv
Successfully loaded anemia-dataset.csv
First few rows:
   Gender  Hemoglobin   MCH  MCHC   MCV  Result
0       1        14.9  22.7  29.1  83.7       0
1       0        15.9  25.4  28.3  72.0       0
2       0         9.0  21.5  29.6  71.2       1
3       0        14.9  16.0  31.4  87.5       0
4       1        14.7  22.0  28.2  99.5       0

Processing: cholesterol-dataset.csv
Successfully loaded cholesterol-dataset.csv
First few rows:
   age  sex  cp  trestbps  fbs  restecg  thalach  exang  oldpeak  slope ca

  return pd.read_csv(file_path)


Successfully loaded health_data1_combined.csv
First few rows:
  gender  hemoglobin  age  blood_pressure  cholesterol  glucose  bmi  height  \
0    1.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
1    0.0        15.9  NaN             NaN          NaN      NaN  NaN     NaN   
2    0.0         9.0  NaN             NaN          NaN      NaN  NaN     NaN   
3    0.0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
4    1.0        14.7  NaN             NaN          NaN      NaN  NaN     NaN   

   weight  HDL  Height  Weight  
0     NaN  NaN     NaN     NaN  
1     NaN  NaN     NaN     NaN  
2     NaN  NaN     NaN     NaN  
3     NaN  NaN     NaN     NaN  
4     NaN  NaN     NaN     NaN  

Processing: heart-disease-dataset.csv
Successfully loaded heart-disease-dataset.csv
First few rows:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2

In [31]:
data_path = 'dataset\health_data1/'

anemia dataset preprocessing

In [32]:
anemia_data = load_data(data_path + 'anemia-dataset.csv')
anemia_data = anemia_data[['Gender', 'Hemoglobin']]
anemia_data.rename(columns={'Gender': 'gender', 'Hemoglobin': 'hemoglobin'}, inplace=True)
anemia_data['age'] = np.nan
anemia_data['blood_pressure'] = np.nan
anemia_data['cholesterol'] = np.nan
anemia_data['glucose'] = np.nan
anemia_data['bmi'] = np.nan

cholesterol dataset preprocessing

In [33]:
cholesterol_data = load_data(data_path + 'cholesterol-dataset.csv')
cholesterol_data = cholesterol_data[['age', 'sex', 'trestbps', 'chol']]
cholesterol_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'trestbps': 'blood_pressure',
    'chol': 'cholesterol'
}, inplace=True)
cholesterol_data['height'] = np.nan
cholesterol_data['weight'] = np.nan
cholesterol_data['glucose'] = np.nan
cholesterol_data['bmi'] = np.nan

chronic kidney disease dataset preprocessing

In [34]:
ckd_data = load_data(data_path + 'chronic-kidney-disease-dataset.csv')
ckd_data = ckd_data[['age', 'bp', 'bgr']]
ckd_data.rename(columns={
    'age': 'age',
    'bp': 'blood_pressure',
    'bgr': 'glucose'
}, inplace=True)
ckd_data['gender'] = np.nan
ckd_data['height'] = np.nan
ckd_data['weight'] = np.nan
ckd_data['cholesterol'] = np.nan
ckd_data['bmi'] = np.nan

diabetes dataset preprocessing

In [35]:
diabetes_data = load_data(data_path + 'diabetes-dataset.csv')
diabetes_data = diabetes_data[['Age', 'Sex', 'BMI', 'HighBP']]
diabetes_data.rename(columns={
    'Age': 'age',
    'Sex': 'gender',
    'BMI': 'bmi',
    'HighBP': 'blood_pressure'
}, inplace=True)
diabetes_data['height'] = np.nan
diabetes_data['weight'] = np.nan
diabetes_data['cholesterol'] = np.nan
diabetes_data['glucose'] = np.nan

heart disease dataset preprocessing

In [36]:
heart_disease_data = load_data(data_path + 'heart-disease-dataset.csv')
heart_disease_data = heart_disease_data[['age', 'sex', 'trestbps', 'chol']]
heart_disease_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'trestbps': 'blood_pressure',
    'chol': 'cholesterol'
}, inplace=True)
heart_disease_data['glucose'] = np.nan
heart_disease_data['height'] = np.nan
heart_disease_data['weight'] = np.nan
heart_disease_data['bmi'] = np.nan

hypertension dataset prprocessing

In [37]:
hypertension_data = load_data(data_path + 'hypertension-dataset.csv')
hypertension_data = hypertension_data[['age', 'sex', 'trestbps', 'chol']]
hypertension_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'trestbps': 'blood_pressure',
    'chol': 'cholesterol'
}, inplace=True)
hypertension_data['glucose'] = np.nan
hypertension_data['height'] = np.nan
hypertension_data['weight'] = np.nan
hypertension_data['bmi'] = np.nan

metabolic syndrome dataset preprocessing

In [38]:
metabolic_data = load_data(data_path + 'metabolic-syndrome-dataset.csv')
metabolic_data = metabolic_data[['Age', 'Sex', 'BloodGlucose', 'HDL', 'Triglycerides']]
metabolic_data.rename(columns={
    'Age': 'age',
    'Sex': 'gender',
    'BloodGlucose': 'glucose',
    'Triglycerides': 'cholesterol'  # asumsikan parameter triglycerides = cholesterol
}, inplace=True)
metabolic_data['height'] = np.nan
metabolic_data['weight'] = np.nan
metabolic_data['blood_pressure'] = np.nan
metabolic_data['bmi'] = np.nan

liver disease dataset preprocessing, using nafld1-dataset

In [39]:
nafld1_data = load_data(data_path + 'nafld1-dataset.csv')
nafld1_data = nafld1_data[['age', 'weight', 'height']]
nafld1_data['bmi'] = nafld1_data.apply(lambda x: calculate_bmi(x['weight'], x['height']), axis=1)
nafld1_data.rename(columns={'age': 'age'}, inplace=True)
nafld1_data['gender'] = np.nan
nafld1_data['blood_pressure'] = np.nan
nafld1_data['cholesterol'] = np.nan
nafld1_data['glucose'] = np.nan

obesity dataset preprocessing

In [40]:
obesity_data = load_data(data_path + 'obesity-dataset.csv')
obesity_data = obesity_data[['Age', 'Gender', 'Height', 'Weight']]
obesity_data['bmi'] = obesity_data.apply(lambda x: calculate_bmi(x['Weight'], x['Height']), axis=1)
obesity_data.rename(columns={'Age': 'age', 'Gender': 'gender'}, inplace=True)
obesity_data['blood_pressure'] = np.nan
obesity_data['cholesterol'] = np.nan
obesity_data['glucose'] = np.nan

stroke dataset preprocessing

In [41]:
stroke_data = load_data(data_path + 'stroke-dataset.csv')
stroke_data = stroke_data[['age', 'sex', 'avg_glucose_level', 'bmi']]
stroke_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'avg_glucose_level': 'glucose',
    'bmi': 'bmi'
}, inplace=True)
stroke_data['height'] = np.nan
stroke_data['weight'] = np.nan
stroke_data['blood_pressure'] = np.nan
stroke_data['cholesterol'] = np.nan

In [42]:
all_datasets = [
    anemia_data,
    cholesterol_data,
    ckd_data,
    diabetes_data,
    heart_disease_data,
    hypertension_data,
    metabolic_data,
    nafld1_data,
    obesity_data,
    stroke_data
]

# merge and save
final_data = pd.concat(all_datasets, ignore_index=True)

final_data.to_csv(data_path + 'health_data1_combined.csv', index=False)
print("Data successfully merged and saved")

Data successfully merged and saved


TRAINING MODEL

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os

In [44]:
# Set working directory dan load data
def load_and_prepare_data():
    # Load dataset
    data_path = os.path.join('dataset', 'health_data1', 'health_data1_combined.csv')
    df = pd.read_csv(data_path)
    
    # Handle missing values
    df = df.fillna(df.mean())
    
    # Select features and target
    features = ['age', 'gender', 'height', 'weight', 'blood_pressure', 'cholesterol', 'glucose']
    target = 'bmi'  # bisa diganti sesuai kebutuhan
    
    X = df[features]
    y = df[target]
    
    # Convert categorical variables
    X = pd.get_dummies(X, columns=['gender'])
    
    return X, y

def train_model():
    # Load and prepare data
    X, y = load_and_prepare_data()
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R2 Score: {r2:.4f}")
    
    # Save model and scaler
    models_dir = 'models'
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    
    joblib.dump(model, os.path.join(models_dir, 'health_model.joblib'))
    joblib.dump(scaler, os.path.join(models_dir, 'scaler.joblib'))
    
    return model, scaler

def predict_health_params(model, scaler, input_data):
    # Prepare input data
    input_df = pd.DataFrame([input_data])
    input_df = pd.get_dummies(input_df, columns=['gender'])
    
    # Scale input
    input_scaled = scaler.transform(input_df)
    
    # Make prediction
    prediction = model.predict(input_scaled)
    
    return prediction[0]

if __name__ == "__main__":
    # Train model
    model, scaler = train_model()
    
    # Example prediction
    sample_input = {
        'age': 30,
        'gender': 'M',
        'height': 170,
        'weight': 70,
        'blood_pressure': 120,
        'cholesterol': 200,
        'glucose': 100
    }
    
    prediction = predict_health_params(model, scaler, sample_input)
    print(f"\nPredicted BMI: {prediction:.2f}")

  df = pd.read_csv(data_path)


TypeError: unsupported operand type(s) for +: 'float' and 'str'