In [6]:
import pandas as pd
import numpy as np
import os

In [7]:
def set_project_directory():
    current_dir = os.getcwd()
    
    if os.path.basename(current_dir) == 'scripts':
        os.chdir('..')
    
    print(f"Working directory set to: {os.getcwd()}")

def calculate_bmi(weight, height):
    if height > 0: # requirement tambahan biar data yang dimasukkan itu harus lebih dari 0
        return weight / ((height / 100) ** 2)
    else:
        return np.nan

def check_data_directory():
    data_dir = os.path.join('dataset', 'health_data1')
    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        print("Available directories:", os.listdir('.'))
        return False
    
    print("\nAvailable files in directory:")
    for file in os.listdir(data_dir):
        print(f"- {file}")
    
    return True

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.XPT'):
            return pd.read_sas(file_path)
        else:
            print(f"Unsupported file format: {file_path}")
            return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return None

set_project_directory()

if check_data_directory():
    data_directory = os.path.join('dataset', 'health_data1')
    
    for filename in os.listdir(data_directory):
        file_path = os.path.join(data_directory, filename)
        print(f"\nProcessing: {filename}")
        data = load_data(file_path)
        if data is not None:
            print(f"Successfully loaded {filename}")
            print("First few rows:")
            print(data.head())
else:
    print("Please check your directory structure and file locations")

Working directory set to: c:\Users\Dana\Documents\Kuliah\Bangkit\Capstone-C242-PS384_Project01

Available files in directory:
- anemia-dataset.csv
- cholesterol-dataset.csv
- chronic-kidney-disease-dataset.csv
- diabetes-dataset.csv
- health_data1_combined.csv
- heart-disease-dataset.csv
- hypertension-dataset.csv
- metabolic-syndrome-dataset.csv
- nafld1-dataset.csv
- nafld2-dataset.csv
- nwtco-dataset.csv
- obesity-dataset.csv
- stroke-dataset.csv

Processing: anemia-dataset.csv
Successfully loaded anemia-dataset.csv
First few rows:
   Gender  Hemoglobin   MCH  MCHC   MCV  Result
0       1        14.9  22.7  29.1  83.7       0
1       0        15.9  25.4  28.3  72.0       0
2       0         9.0  21.5  29.6  71.2       1
3       0        14.9  16.0  31.4  87.5       0
4       1        14.7  22.0  28.2  99.5       0

Processing: cholesterol-dataset.csv
Successfully loaded cholesterol-dataset.csv
First few rows:
   age  sex  cp  trestbps  fbs  restecg  thalach  exang  oldpeak  slope ca

  return pd.read_csv(file_path)


Successfully loaded nafld1-dataset.csv
First few rows:
   Unnamed: 0  id  age  male  weight  height        bmi  case.id  futime  \
0        3631   1   57     0    60.0   163.0  22.690939  10630.0    6261   
1        8458   2   67     0    70.4   168.0  24.884028  14817.0     624   
2        6298   3   53     1   105.8   186.0  30.453537      3.0    1783   
3       15398   4   56     1   109.3   170.0  37.830100   6628.0    3143   
4       13261   5   68     1     NaN     NaN        NaN   1871.0    1836   

   status  
0       0  
1       0  
2       0  
3       0  
4       1  

Processing: nafld2-dataset.csv
Successfully loaded nafld2-dataset.csv
First few rows:
   Unnamed: 0  id  days  test  value
0      135077   1  -459   hdl   75.0
1      313143   1  -459  chol   75.0
2      135078   1   183   hdl   64.0
3      313144   1   183  chol   64.0
4      135079   1  2030   hdl   74.0

Processing: nwtco-dataset.csv
Successfully loaded nwtco-dataset.csv
First few rows:
   Unnamed: 0  seqno  

In [8]:
data_path = 'dataset\health_data1/'

anemia dataset preprocessing

In [9]:
anemia_data = load_data(data_path + 'anemia-dataset.csv')
anemia_data = anemia_data[['Gender', 'Hemoglobin']]
anemia_data.rename(columns={'Gender': 'gender', 'Hemoglobin': 'hemoglobin'}, inplace=True)
anemia_data['age'] = np.nan
anemia_data['blood_pressure'] = np.nan
anemia_data['cholesterol'] = np.nan
anemia_data['glucose'] = np.nan
anemia_data['bmi'] = np.nan

cholesterol dataset preprocessing

In [10]:
cholesterol_data = load_data(data_path + 'cholesterol-dataset.csv')
cholesterol_data = cholesterol_data[['age', 'sex', 'trestbps', 'chol']]
cholesterol_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'trestbps': 'blood_pressure',
    'chol': 'cholesterol'
}, inplace=True)
cholesterol_data['height'] = np.nan
cholesterol_data['weight'] = np.nan
cholesterol_data['glucose'] = np.nan
cholesterol_data['bmi'] = np.nan

chronic kidney disease dataset preprocessing

In [11]:
ckd_data = load_data(data_path + 'chronic-kidney-disease-dataset.csv')
ckd_data = ckd_data[['age', 'bp', 'bgr']]
ckd_data.rename(columns={
    'age': 'age',
    'bp': 'blood_pressure',
    'bgr': 'glucose'
}, inplace=True)
ckd_data['gender'] = np.nan
ckd_data['height'] = np.nan
ckd_data['weight'] = np.nan
ckd_data['cholesterol'] = np.nan
ckd_data['bmi'] = np.nan

diabetes dataset preprocessing

In [12]:
diabetes_data = load_data(data_path + 'diabetes-dataset.csv')
diabetes_data = diabetes_data[['Age', 'Sex', 'BMI', 'HighBP']]
diabetes_data.rename(columns={
    'Age': 'age',
    'Sex': 'gender',
    'BMI': 'bmi',
    'HighBP': 'blood_pressure'
}, inplace=True)
diabetes_data['height'] = np.nan
diabetes_data['weight'] = np.nan
diabetes_data['cholesterol'] = np.nan
diabetes_data['glucose'] = np.nan

heart disease dataset preprocessing

In [13]:
heart_disease_data = load_data(data_path + 'heart-disease-dataset.csv')
heart_disease_data = heart_disease_data[['age', 'sex', 'trestbps', 'chol']]
heart_disease_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'trestbps': 'blood_pressure',
    'chol': 'cholesterol'
}, inplace=True)
heart_disease_data['glucose'] = np.nan
heart_disease_data['height'] = np.nan
heart_disease_data['weight'] = np.nan
heart_disease_data['bmi'] = np.nan

hypertension dataset prprocessing

In [14]:
hypertension_data = load_data(data_path + 'hypertension-dataset.csv')
hypertension_data = hypertension_data[['age', 'sex', 'trestbps', 'chol']]
hypertension_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'trestbps': 'blood_pressure',
    'chol': 'cholesterol'
}, inplace=True)
hypertension_data['glucose'] = np.nan
hypertension_data['height'] = np.nan
hypertension_data['weight'] = np.nan
hypertension_data['bmi'] = np.nan

metabolic syndrome dataset preprocessing

In [15]:
metabolic_data = load_data(data_path + 'metabolic-syndrome-dataset.csv')
metabolic_data = metabolic_data[['Age', 'Sex', 'BloodGlucose', 'HDL', 'Triglycerides']]
metabolic_data.rename(columns={
    'Age': 'age',
    'Sex': 'gender',
    'BloodGlucose': 'glucose',
    'Triglycerides': 'cholesterol'  # asumsikan parameter triglycerides = cholesterol
}, inplace=True)
metabolic_data['height'] = np.nan
metabolic_data['weight'] = np.nan
metabolic_data['blood_pressure'] = np.nan
metabolic_data['bmi'] = np.nan

liver disease dataset preprocessing, using nafld1-dataset

In [16]:
nafld1_data = load_data(data_path + 'nafld1-dataset.csv')
nafld1_data = nafld1_data[['age', 'weight', 'height']]
nafld1_data['bmi'] = nafld1_data.apply(lambda x: calculate_bmi(x['weight'], x['height']), axis=1)
nafld1_data.rename(columns={'age': 'age'}, inplace=True)
nafld1_data['gender'] = np.nan
nafld1_data['blood_pressure'] = np.nan
nafld1_data['cholesterol'] = np.nan
nafld1_data['glucose'] = np.nan

obesity dataset preprocessing

In [17]:
obesity_data = load_data(data_path + 'obesity-dataset.csv')
obesity_data = obesity_data[['Age', 'Gender', 'Height', 'Weight']]
obesity_data['bmi'] = obesity_data.apply(lambda x: calculate_bmi(x['Weight'], x['Height']), axis=1)
obesity_data.rename(columns={'Age': 'age', 'Gender': 'gender'}, inplace=True)
obesity_data['blood_pressure'] = np.nan
obesity_data['cholesterol'] = np.nan
obesity_data['glucose'] = np.nan

stroke dataset preprocessing

In [18]:
stroke_data = load_data(data_path + 'stroke-dataset.csv')
stroke_data = stroke_data[['age', 'sex', 'avg_glucose_level', 'bmi']]
stroke_data.rename(columns={
    'age': 'age',
    'sex': 'gender',
    'avg_glucose_level': 'glucose',
    'bmi': 'bmi'
}, inplace=True)
stroke_data['height'] = np.nan
stroke_data['weight'] = np.nan
stroke_data['blood_pressure'] = np.nan
stroke_data['cholesterol'] = np.nan

In [19]:
all_datasets = [
    anemia_data,
    cholesterol_data,
    ckd_data,
    diabetes_data,
    heart_disease_data,
    hypertension_data,
    metabolic_data,
    nafld1_data,
    obesity_data,
    stroke_data
]

# merge and save
final_data = pd.concat(all_datasets, ignore_index=True)

final_data.to_csv(data_path + 'health_data1_combined.csv', index=False)
print("Data successfully merged and saved")

Data successfully merged and saved


TRAINING MODEL

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os

In [21]:
def prepare_disease_data():
    data_path = os.path.join('dataset', 'health_data1', 'health_data1_combined.csv')
    df = pd.read_csv(data_path, low_memory=False)
    
    disease_columns = ['diabetes', 'heart_disease', 'hypertension', 'stroke', 'kidney_disease']
    disease_targets = {}
    
    for disease in disease_columns:
        if disease in df.columns:
            disease_targets[disease] = df[disease]
            print(f"\nFound '{disease}' column with values: {df[disease].unique()}")
    
    return df, disease_targets

def train_disease_models():
    print("\nStarting model training...")
    main_df, disease_targets = prepare_disease_data()
    
    # Preprocessing
    le = LabelEncoder()
    main_df['gender'] = le.fit_transform(main_df['gender'].astype(str))
    
    numeric_cols = ['age', 'height', 'weight', 'blood_pressure', 'cholesterol', 'glucose']
    feature_cols = numeric_cols + ['gender']
    
    for col in numeric_cols:
        main_df[col] = pd.to_numeric(main_df[col], errors='coerce')
        main_df[col].fillna(main_df[col].mean(), inplace=True)
    
    X = main_df[feature_cols]
    print("\nFeature matrix shape:", X.shape)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    models = {}
    for disease, target in disease_targets.items():
        print(f"\nTraining model for {disease}")
        
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X_scaled, target, test_size=0.2, random_state=42
            )
            
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            
            print(f"Model for {disease}:")
            print(f"Accuracy: {accuracy:.4f}")
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            models[disease] = model
            
        except Exception as e:
            print(f"Error training {disease} model: {e}")
    
    return models, scaler, feature_cols

# Train models
models, scaler, feature_cols = train_disease_models()


Starting model training...

Feature matrix shape: (160892, 7)


In [22]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', linewidth=2)
plt.xlabel("Actual BMI")
plt.ylabel("Predicted BMI")
plt.title("Actual vs Predicted BMI")
plt.show()

NameError: name 'y_test' is not defined

<Figure size 800x600 with 0 Axes>

Coba test user input

In [26]:
def predict_diseases(input_data, models, scaler, feature_cols):
    input_df = pd.DataFrame([input_data], columns=feature_cols)
    input_scaled = scaler.transform(input_df)
    
    predictions = {}
    for disease, model in models.items():
        try:
            prob = model.predict_proba(input_scaled)[0][1]
            predictions[disease] = prob * 100
        except Exception as e:
            print(f"Error predicting {disease}: {e}")
    
    return predictions

def get_user_input():
    print("\n--- Input User Data ---")
    age = float(input("Age (years): "))
    gender = input("Gender (Male/Female): ").strip().lower()
    gender = 1 if gender == 'male' else 0
    height = float(input("Height (cm): "))
    weight = float(input("Weight (kg): "))
    blood_pressure = float(input("Blood Pressure (mmHg): "))
    glucose = float(input("Glucose Level (mg/dL): "))
    cholesterol = float(input("Cholesterol Level (mg/dL): "))

    return [age, height, weight, blood_pressure, cholesterol, glucose, gender]

# Get user input and predict
user_input = get_user_input()
predictions = predict_diseases(user_input, models, scaler, feature_cols)

print("\nPrediction Results:")
for disease, probability in predictions.items():
    print(f"{disease.replace('_', ' ').title()}: {probability:.2f}%")


--- Input User Data ---

Prediction Results:


In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import os

def check_dataset():
    data_path = os.path.join('dataset', 'health_data1', 'health_data1_combined.csv')
    df = pd.read_csv(data_path, low_memory=False)
    
    print("Dataset Info:")
    print("\nColumns:", df.columns.tolist())
    print("\nSample counts:")
    for col in ['diabetes', 'heart_disease', 'hypertension', 'stroke', 'kidney_disease']:
        if col in df.columns:
            print(f"{col}: {df[col].value_counts().to_dict()}")
        else:
            print(f"{col}: Not found in dataset")
    
    return df

def predict_health(input_data, bmi_model, disease_models, scaler, feature_cols):
    # Prepare input data
    input_df = pd.DataFrame([input_data], columns=feature_cols)
    input_scaled = scaler.transform(input_df)
    
    # Predict BMI
    bmi_pred = bmi_model.predict(input_scaled)[0]
    bmi_category = get_bmi_category(bmi_pred)
    
    # Debug print
    print("\nDebug - Available disease models:", list(disease_models.keys()))
    
    # Predict diseases
    disease_predictions = {}
    for disease, model in disease_models.items():
        try:
            prob = model.predict_proba(input_scaled)[0][1]
            disease_predictions[disease] = prob * 100
            print(f"Debug - Successfully predicted {disease}")
        except Exception as e:
            print(f"Debug - Error predicting {disease}: {e}")
    
    return bmi_pred, bmi_category, disease_predictions

def train_models():
    # Load data
    data_path = os.path.join('dataset', 'health_data1', 'health_data1_combined.csv')
    df = pd.read_csv(data_path, low_memory=False)
    
    # Preprocessing
    le = LabelEncoder()
    df['gender'] = le.fit_transform(df['gender'].astype(str))
    
    numeric_cols = ['age', 'height', 'weight', 'blood_pressure', 'cholesterol', 'glucose']
    feature_cols = numeric_cols + ['gender']
    disease_columns = ['diabetes', 'heart_disease', 'hypertension', 'stroke', 'kidney_disease']
    
    # Handle missing values
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col].fillna(df[col].mean(), inplace=True)
    
    # Prepare features
    X = df[feature_cols]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Train BMI model
    df['bmi'].fillna(df['bmi'].mean(), inplace=True)
    bmi_model = RandomForestRegressor(n_estimators=100, random_state=42)
    bmi_model.fit(X_scaled, df['bmi'])
    
    # Train disease models
    disease_models = {}
    print("\nTraining disease models:")
    for disease in disease_columns:
        if disease in df.columns:
            print(f"\nProcessing {disease}...")
            df[disease].fillna(0, inplace=True)
            y = df[disease].astype(int)
            print(f"Value counts: {y.value_counts().to_dict()}")
            
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_scaled, y)
            disease_models[disease] = model
            print(f"Model trained successfully for {disease}")
        else:
            print(f"{disease} not found in dataset")
    
    return bmi_model, disease_models, scaler, feature_cols

def get_bmi_category(bmi):
    if bmi < 18.5:
        return "Underweight"
    elif bmi < 25:
        return "Normal weight"
    elif bmi < 30:
        return "Overweight"
    else:
        return "Obese"

def get_user_input():
    print("\n--- Input User Data ---")
    age = float(input("Age (years): "))
    gender = input("Gender (Male/Female): ").strip().lower()
    gender = 1 if gender == 'male' else 0
    height = float(input("Height (cm): "))
    weight = float(input("Weight (kg): "))
    blood_pressure = float(input("Blood Pressure (mmHg): "))
    glucose = float(input("Glucose Level (mg/dL): "))
    cholesterol = float(input("Cholesterol Level (mg/dL): "))
    
    return [age, height, weight, blood_pressure, cholesterol, glucose, gender]



# Train models
bmi_model, disease_models, scaler, feature_cols = train_models()

# Get user input and predict
user_input = get_user_input()
bmi_pred, bmi_category, predictions = predict_health(user_input, bmi_model, disease_models, scaler, feature_cols)

# Display results
print("\n=== Health Prediction Results ===")
print(f"\nBMI Prediction: {bmi_pred:.1f}")
print(f"BMI Category: {bmi_category}")

print("\nDisease Risk Predictions:")
for disease, probability in predictions.items():
    print(f"{disease.replace('_', ' ').title()}: {probability:.1f}%")


Training disease models:
diabetes not found in dataset
heart_disease not found in dataset
hypertension not found in dataset
stroke not found in dataset
kidney_disease not found in dataset

--- Input User Data ---

Debug - Available disease models: []

=== Health Prediction Results ===

BMI Prediction: 24.4
BMI Category: Normal weight

Disease Risk Predictions:


In [31]:
def analyze_dataset():
    data_path = os.path.join('dataset', 'health_data1', 'health_data1_combined.csv')
    df = pd.read_csv(data_path, low_memory=False)
    
    print("Dataset Info:")
    print("\nColumns available:", df.columns.tolist())
    print("\nSample size:", len(df))
    print("\nMissing values:")
    print(df.isnull().sum())
    print("\nFirst few rows:")
    print(df.head())
    
    return df

# Run analysis
df = analyze_dataset()

Dataset Info:

Columns available: ['gender', 'hemoglobin', 'age', 'blood_pressure', 'cholesterol', 'glucose', 'bmi', 'height', 'weight', 'HDL', 'Height', 'Weight']

Sample size: 160892

Missing values:
gender             17977
hemoglobin        159471
age                 1430
blood_pressure     62401
cholesterol       131080
glucose           117225
bmi                36594
height            146511
weight            148129
HDL               158491
Height            160784
Weight            160784
dtype: int64

First few rows:
  gender  hemoglobin  age  blood_pressure  cholesterol  glucose  bmi  height  \
0      1        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
1      0        15.9  NaN             NaN          NaN      NaN  NaN     NaN   
2      0         9.0  NaN             NaN          NaN      NaN  NaN     NaN   
3      0        14.9  NaN             NaN          NaN      NaN  NaN     NaN   
4      1        14.7  NaN             NaN          NaN      NaN  NaN

In [27]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import os

# Step 0: Check if datasets exist
folder_path = 'dataset/health_data1'  # Ganti dengan jalur folder yang sesuai

# Cek apakah folder dan file-file dataset ada
try:
    print("Daftar file di dalam folder:", os.listdir(folder_path))
    # List semua file yang diperlukan
    required_files = [
        'anemia-dataset.csv',
        'cholesterol-dataset.csv',
        'chronic-kidney-disease-dataset.csv',
        'diabetes-dataset.csv',
        'heart-disease-dataset.csv',
        'hypertension-dataset.csv',
        'metabolic-syndrome-dataset.csv',
        'nafld1-dataset.csv',
        'obesity-dataset.csv',
        'stroke-dataset.csv'
    ]
    
    # Cek keberadaan masing-masing file
    missing_files = [f for f in required_files if not os.path.isfile(os.path.join(folder_path, f))]
    
    if missing_files:
        print(f"File yang hilang: {', '.join(missing_files)}")
    else:
        print("Semua file dataset ditemukan.")

except FileNotFoundError:
    print(f"Folder tidak ditemukan: {folder_path}")

Folder tidak ditemukan: dataset/health_data1


In [28]:
# Step 1: Load all datasets
datasets = {
    "anemia": 'dataset/health_data1/anemia-dataset.csv',
    "cholesterol": 'dataset/health_data1/cholesterol-dataset.csv',
    "kidney_disease": 'dataset/health_data1/chronic-kidney-disease-dataset.csv',
    "diabetes": 'dataset/health_data1/diabetes-dataset.csv',
    "heart_disease": 'dataset/health_data1/heart-disease-dataset.csv',
    "hypertension": 'dataset/health_data1/hypertension-dataset.csv',
    "metabolic_syndrome": 'dataset/health_data1/metabolic-syndrome-dataset.csv',
    "nafld": 'dataset/health_data1/nafld1-dataset.csv',
    "obesity": 'dataset/health_data1/obesity-dataset.csv',
    "stroke": 'dataset/health_data1/stroke-dataset.csv'
}

In [29]:
# Step 2: Calculate derived features based on user input
def calculate_derived_features(user_input):
    # Extract user input
    height = user_input['height']
    weight = user_input['weight']
    age = user_input['age']
    gender = user_input['gender']
    blood_pressure = user_input['blood_pressure']
    cholesterol = user_input['cholesterol']
    blood_sugar = user_input['blood_sugar']

    # Calculate derived features
    bmi = weight / (height / 100) ** 2
    sodium = weight * 20
    if gender == 'Male':
        fat = weight * 0.15
    else:
        fat = weight * 0.25
    cholesterol_level = (bmi * 2) + (age * 0.15) + (blood_pressure * 0.05) + (blood_sugar * 0.02) + 150
    protein = weight * 0.9
    carbs = weight * 3

    # Return calculated features as a dictionary
    derived_features = {
        'bmi': bmi,
        'sodium': sodium,
        'fat': fat,
        'cholesterol': cholesterol_level,
        'protein': protein,
        'carbs': carbs
    }
    
    return derived_features

In [37]:
# Step 3: Prepare the data for model training
def prepare_data_for_model(datasets):
    features_list = []
    target_list = []

    for dataset_name, dataset in datasets.items():
        # Load the dataset
        df = pd.read_csv(dataset)
        
        # Print the column names to debug
        print(f"Columns in {dataset_name}: {df.columns}")

        # Set the target column based on dataset name
        if dataset_name == 'stroke-dataset.csv':
            if 'stroke' in df.columns:
                df['target'] = df['stroke']
            else:
                raise KeyError(f"'stroke' column not found in {dataset_name}")
        elif dataset_name == 'heart-disease-dataset.csv':
            if 'target' in df.columns:
                df['target'] = df['target']
            else:
                raise KeyError(f"'target' column not found in {dataset_name}")
        elif dataset_name == 'diabetes-dataset.csv':
            if 'Diabetes' in df.columns:
                df['target'] = df['Diabetes']
            else:
                raise KeyError(f"'Diabetes' column not found in {dataset_name}")
        elif dataset_name == 'chronic-kidney-disease-dataset.csv':
            if 'classification' in df.columns:
                df['target'] = df['classification']
            else:
                raise KeyError(f"'classification' column not found in {dataset_name}")
        elif dataset_name == 'cholesterol-dataset.csv':
            if 'num' in df.columns:
                df['target'] = df['num']
            else:
                raise KeyError(f"'num' column not found in {dataset_name}")
        elif dataset_name == 'anemia-dataset.csv':
            if 'Result' in df.columns:
                df['target'] = df['Result']
            else:
                raise KeyError(f"'Result' column not found in {dataset_name}")
        elif dataset_name == 'hypertension-dataset.csv':
            if 'target' in df.columns:
                df['target'] = df['target']
            else:
                raise KeyError(f"'target' column not found in {dataset_name}")
        elif dataset_name == 'metabolic-syndrome-dataset.csv':
            if 'MetabolicSyndrome' in df.columns:
                df['target'] = df['MetabolicSyndrome']
            else:
                raise KeyError(f"'MetabolicSyndrome' column not found in {dataset_name}")
        elif dataset_name == 'nafld1-dataset.csv':
            if 'status' in df.columns:
                df['target'] = df['status']
            else:
                raise KeyError(f"'status' column not found in {dataset_name}")
        elif dataset_name == 'obesity-dataset.csv':
            if 'Label' in df.columns:
                df['target'] = df['Label']
            else:
                raise KeyError(f"'Label' column not found in {dataset_name}")

        # Select relevant features
        features = df.drop(['target'], axis=1)
        target = df['target']

        features_list.append(features)
        target_list.append(target)

    # Combine all features and targets
    all_features = pd.concat(features_list, ignore_index=True)
    all_target = pd.concat(target_list, ignore_index=True)

    return all_features, all_target


In [38]:
# Step 4: Train the model
def train_model(data, labels):
    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)

    # Initialize RandomForestClassifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    model.fit(X_train, y_train)

    # Test the model
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(classification_report(y_test, y_pred))

    # Save the model to disk
    joblib.dump(model, 'health_disease_predictor_model.pkl')

    return model

In [39]:
# Step 5: Predict the disease risk for a new user
def predict_disease_risk(user_input, model):
    # Calculate derived features for user
    derived_features = calculate_derived_features(user_input)
    
    # Create a dataframe for the model input
    user_data = pd.DataFrame([derived_features])
    
    # Predict the disease risk (percentage of potential diseases)
    prediction = model.predict(user_data)
    
    return prediction

# Example of user input
user_input = {
    'height': 170,  # in cm
    'weight': 70,  # in kg
    'age': 30,
    'gender': 'Male',
    'blood_pressure': 120,
    'cholesterol': 200,
    'blood_sugar': 90
}

# Prepare the data for model training
data, labels = prepare_data_for_model(datasets)

# Train the model
model = train_model(data, labels)

# Predict disease risk for a new user
prediction = predict_disease_risk(user_input, model)
print("Predicted disease risk:", prediction)


Columns in anemia: Index(['Gender', 'Hemoglobin', 'MCH', 'MCHC', 'MCV', 'Result'], dtype='object')


KeyError: "['target'] not found in axis"