In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, classification_report, f1_score, confusion_matrix
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import joblib

# Load dataset
df = pd.read_csv("Personalized_Diet_RecommendationsDC.csv")

# Define top 15 features based on average importance
selected_features = [
    'Caloric_Intake', 'Daily_Steps', 'Cholesterol_Level', 'Carbohydrate_Intake',
    'Protein_Intake', 'Fat_Intake', 'BMI', 'Age', 'Blood_Pressure_Systolic',
    'Blood_Sugar_Level', 'Sleep_Hours', 'Weight_kg', 'Food_Aversions_Sweet',
    'Height_cm', 'Dietary_Habits_Vegan'
]
X = df[selected_features]
y = df['Recommended_Meal_Plan']

# Define numeric columns for scaling
numeric_cols = [
    'Caloric_Intake', 'Daily_Steps', 'Cholesterol_Level', 'Carbohydrate_Intake',
    'Protein_Intake', 'Fat_Intake', 'BMI', 'Age', 'Blood_Pressure_Systolic',
    'Blood_Sugar_Level', 'Sleep_Hours', 'Weight_kg', 'Height_cm'
]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Define base models with hyperparameter grids
catboost = CatBoostClassifier(verbose=0, random_state=42)
catboost_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.05],
    'depth': [6, 8]
}
catboost_search = GridSearchCV(catboost, catboost_grid, cv=3, scoring='balanced_accuracy', n_jobs=-1)

xgboost = XGBClassifier(random_state=42, eval_metric='mlogloss')
xgboost_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6]
}
xgboost_search = GridSearchCV(xgboost, xgboost_grid, cv=3, scoring='balanced_accuracy', n_jobs=-1)

rf = RandomForestClassifier(random_state=42)
rf_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}
rf_search = GridSearchCV(rf, rf_grid, cv=3, scoring='balanced_accuracy', n_jobs=-1)

# Define stacking ensemble
estimators = [
    ('catboost', catboost_search),
    ('xgboost', xgboost_search),
    ('rf', rf_search)
]
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(multi_class='multinomial', max_iter=1000),
    cv=3
)

# Train stacking model
stacking_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = stacking_model.predict(X_test)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("Stacking Ensemble Balanced Accuracy:", balanced_acc)
print("Macro F1-Score:", macro_f1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importance (average from base models)
catboost_importance = stacking_model.named_estimators_['catboost'].best_estimator_.get_feature_importance()
xgboost_importance = stacking_model.named_estimators_['xgboost'].best_estimator_.feature_importances_
rf_importance = stacking_model.named_estimators_['rf'].best_estimator_.feature_importances_
avg_importance = (catboost_importance + xgboost_importance + rf_importance) / 3
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': avg_importance
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance (Top 10):")
print(feature_importance.head(10))

# Save model and scaler
#joblib.dump(stacking_model, "stacking_classifier_model.pkl")
#joblib.dump(scaler, "scaler_stacking.pkl")
#print("\nModel saved as 'stacking_classifier_model.pkl'")

Stacking Ensemble Balanced Accuracy: 0.23027266122303724
Macro F1-Score: 0.2134778873377931

Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.27      0.25       248
           1       0.23      0.12      0.16       259
           2       0.27      0.45      0.34       282
           3       0.17      0.08      0.11       211

    accuracy                           0.24      1000
   macro avg       0.22      0.23      0.21      1000
weighted avg       0.23      0.24      0.22      1000


Confusion Matrix:
[[ 67  40 114  27]
 [ 73  32 124  30]
 [ 88  42 126  26]
 [ 70  27  97  17]]

Feature Importance (Top 10):
                    Feature  Importance
1               Daily_Steps    3.098077
2         Cholesterol_Level    2.935792
0            Caloric_Intake    2.876690
4            Protein_Intake    2.832074
8   Blood_Pressure_Systolic    2.697610
5                Fat_Intake    2.661577
9         Blood_Sugar_Level    2.656062
3  