In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter
from imblearn.over_sampling import SMOTE

In [7]:
# Load the cleaned dataset
df = pd.read_csv("nutrition_dataset_cleaned.csv")
print(df.columns)

Index(['Age', 'Gender', 'Height', 'Weight', 'Daily Calorie Target', 'Protein',
       'Carbohydrates', 'Fat', 'Breakfast Suggestion', 'Lunch Suggestion',
       'Dinner Suggestion', 'Snack Suggestion', 'Breakfast Category',
       'Activity_Lightly Active', 'Activity_Moderately Active',
       'Activity_Sedentary', 'Activity_Very Active', 'Goal_Maintenance',
       'Goal_Muscle Gain', 'Goal_Weight Loss', 'Goal_Weight Maintenance',
       'Diet_Omnivore', 'Diet_Vegan', 'Diet_Vegetarian',
       'Breakfast Category Encoded'],
      dtype='object')


In [9]:


# Define features (X) and target (y)
X = df[['Age', 'Gender', 'Height', 'Weight','Activity_Lightly Active', 'Activity_Moderately Active',
       'Activity_Sedentary', 'Activity_Very Active', 'Goal_Maintenance','Goal_Muscle Gain',
       'Goal_Weight Loss', 'Goal_Weight Maintenance','Diet_Omnivore', 'Diet_Vegan', 'Diet_Vegetarian']]
y = df["Breakfast Category Encoded"]

scaler = StandardScaler()
X[['Age', 'Height', 'Weight']] = scaler.fit_transform(X[['Age', 'Height', 'Weight']])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE(random_state=42, k_neighbors=1)  # k_neighbors=1 برای دسته‌های خیلی کم‌تعداد
X_train, y_train = smote.fit_resample(X_train, y_train)

# Compute class weights for imbalanced classes
class_counts = Counter(y)
total_samples = sum(class_counts.values())
class_weights = {cls: total_samples / (len(class_counts) * count) for cls, count in class_counts.items()}

# Initialize models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight=class_weights),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100, class_weight=class_weights),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000, class_weight=class_weights),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=6, scale_pos_weight=class_weights)
}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Print results
    print(f"\nResults for {name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=df["Breakfast Category"].unique(), zero_division=0))




Results for Decision Tree:
Accuracy: 0.48
Classification Report:
               precision    recall  f1-score   support

oatmeal-based       0.31      0.29      0.30        17
   tofu-based       0.25      0.12      0.17        24
 yogurt-based       0.00      0.00      0.00         3
    egg-based       0.00      0.00      0.00         2
pancake-based       0.84      0.89      0.86        35
        other       0.33      0.47      0.39        19

     accuracy                           0.48       100
    macro avg       0.29      0.30      0.29       100
 weighted avg       0.47      0.48      0.47       100


Results for Random Forest:
Accuracy: 0.52
Classification Report:
               precision    recall  f1-score   support

oatmeal-based       0.33      0.29      0.31        17
   tofu-based       0.46      0.25      0.32        24
 yogurt-based       0.00      0.00      0.00         3
    egg-based       0.00      0.00      0.00         2
pancake-based       0.80      0.94     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Results for Logistic Regression:
Accuracy: 0.4
Classification Report:
               precision    recall  f1-score   support

oatmeal-based       0.54      0.41      0.47        17
   tofu-based       0.25      0.12      0.17        24
 yogurt-based       0.00      0.00      0.00         3
    egg-based       0.09      0.50      0.15         2
pancake-based       0.74      0.66      0.70        35
        other       0.26      0.32      0.29        19

     accuracy                           0.40       100
    macro avg       0.31      0.33      0.29       100
 weighted avg       0.46      0.40      0.42       100



Parameters: { "scale_pos_weight" } are not used.




Results for XGBoost:
Accuracy: 0.51
Classification Report:
               precision    recall  f1-score   support

oatmeal-based       0.35      0.41      0.38        17
   tofu-based       0.27      0.17      0.21        24
 yogurt-based       0.00      0.00      0.00         3
    egg-based       0.00      0.00      0.00         2
pancake-based       0.80      0.94      0.87        35
        other       0.32      0.37      0.34        19

     accuracy                           0.51       100
    macro avg       0.29      0.31      0.30       100
 weighted avg       0.47      0.51      0.48       100



In [None]:
# Save the best model (XGBoost) for later use
best_model = models["XGBoost"]
best_model.fit(X_train, y_train)  # Re-train on full training data
import joblib
joblib.dump(best_model, "best_breakfast_model.pkl")
print("\nBest model (XGBoost) saved as 'best_breakfast_model.pkl'")