# In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import joblib


# Train a RandomForest health-risk classifier from the synthetic CSV and
# persist the fitted model, label encoders, and scaler with joblib.

# Load updated dataset
data = pd.read_csv("updated_synthetic_health_data.csv")

# Prepare data for training: every column except 'risk' is a feature.
X = data.drop(columns=['risk'])
y = data['risk']

# Label encode categorical features. Each fitted encoder is kept so the same
# string -> integer mapping can be reapplied to new inputs later.
categorical_features = ['gender', 'activity_level', 'smoking_status', 'alcohol_intake']
label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Label encode the target variable; stored under the 'risk' key alongside the
# feature encoders so predictions can be mapped back with inverse_transform.
risk_le = LabelEncoder()
y = risk_le.fit_transform(y)
label_encoders['risk'] = risk_le

# Train-test split. This is done BEFORE scaling so that the held-out rows do
# not contribute to the scaler's fitted mean/variance (avoids data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below never write into a
# view of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numeric features: fit on the training rows only, then apply the same
# fitted transform to the test rows.
scaler = StandardScaler()
numeric_features = ['age', 'weight_kg', 'height_cm', 'heart_rate', 'calories']
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Save the updated model, encoders, and scaler
joblib.dump(model, "health_risk_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model Saved")
