In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Health-risk classification walkthrough: load a CSV, train a random forest on
# three lifestyle features, report hold-out metrics, and classify one new sample.

# Column names are defined once so that the training features and the ad-hoc
# prediction input at the bottom cannot drift apart.
FEATURE_COLUMNS = ['bmi', 'exercise_hours', 'junk_food_freq']
TARGET_COLUMN = 'risk_level'

# Step 1: Load the dataset
df = pd.read_csv("/health_risk.csv")  # Replace with your actual CSV file name

# Step 2: Display the first few rows
print("Sample Data:")
print(df.head())

# Step 3: Report missing values, then drop incomplete rows
print("\nMissing values:\n", df.isnull().sum())
# Only rows missing a value the model actually uses are dropped; a gap in some
# unrelated column should not cost us a training example.
df = df.dropna(subset=FEATURE_COLUMNS + [TARGET_COLUMN])

# Step 4: Encode the target column (risk_level) as integer class codes
le_risk = LabelEncoder()
df['risk_level_encoded'] = le_risk.fit_transform(df[TARGET_COLUMN])

# Step 5: Define features (X) and target (y)
X = df[FEATURE_COLUMNS]
y = df['risk_level_encoded']

# Step 6: Split the data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 7: Train the model using Random Forest
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 8: Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Step 9: Evaluate the model
# The encoder's integer codes are the positions in le_risk.classes_. Passing
# them explicitly keeps every name paired with its own label even when a class
# happens to be absent from the test split; without `labels`, the report
# derives the label set from y_test/y_pred and target_names can stop matching.
all_labels = list(range(len(le_risk.classes_)))
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=le_risk.classes_,
))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Step 10 (Optional): Predict risk for a new person
# Example: A person with BMI=29, exercises 5 hours/week, eats junk 2 times/week
# Built from FEATURE_COLUMNS so the columns match the ones the model was fit on.
new_input = pd.DataFrame([[29, 5, 2]], columns=FEATURE_COLUMNS)

predicted_risk = model.predict(new_input)
print("\nPredicted Risk Level for new input:", le_risk.inverse_transform(predicted_risk)[0])


Sample Data:
         bmi  exercise_hours  junk_food_freq risk_level
0  28.730279              13               1       high
1  31.301442              12               4     medium
2  32.549043               9               0     medium
3  30.463670               2               1     medium
4  28.431755               2               1        low

Missing values:
 bmi               0
exercise_hours    0
junk_food_freq    0
risk_level        0
dtype: int64

Classification Report:
              precision    recall  f1-score   support

        high       0.20      0.20      0.20         5
         low       0.14      0.20      0.17         5
      medium       0.62      0.50      0.56        10

    accuracy                           0.35        20
   macro avg       0.32      0.30      0.31        20
weighted avg       0.40      0.35      0.37        20

Accuracy Score: 0.35

Predicted Risk Level for new input: medium
