In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import joblib

In [2]:
# Load the cleaned nutrition dataset (path is relative to the working directory).
df = pd.read_csv("nutrition_dataset_cleaned.csv")

# Fix duplicate goal columns: when both spellings of the maintenance goal are
# present, fold them into 'Goal_Maintenance' using the row-wise maximum and drop
# the redundant column.
# NOTE(review): presumably both are 0/1 indicator columns, in which case max()
# acts as a logical OR — confirm against the dataset.
if 'Goal_Weight Maintenance' in df.columns and 'Goal_Maintenance' in df.columns:
    df['Goal_Maintenance'] = df[['Goal_Maintenance', 'Goal_Weight Maintenance']].max(axis=1)
    df = df.drop(columns=['Goal_Weight Maintenance'])

# Create new feature: BMI = weight / (height in metres)^2.
# NOTE(review): the division by 100 assumes Height is stored in centimetres
# (and Weight in kilograms) — confirm.
df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2

# Merge low-sample categories into a single 'other' class, then encode the
# target as integer labels. `le` is kept so that labels can be decoded later.
df['Breakfast Category'] = df['Breakfast Category'].replace(['yogurt-based', 'egg-based'], 'other')
le = LabelEncoder()
df['Breakfast Category Encoded'] = le.fit_transform(df['Breakfast Category'])

# Define features and target. The column lists are named once so that the
# selection and the scaling step below cannot drift apart.
feature_columns = [
    'Age', 'Gender', 'Height', 'Weight',
    'Activity_Lightly Active', 'Activity_Moderately Active',
    'Activity_Sedentary', 'Activity_Very Active',
    'Goal_Maintenance', 'Goal_Muscle Gain', 'Goal_Weight Loss',
    'Diet_Omnivore', 'Diet_Vegan', 'Diet_Vegetarian',
    'BMI',
]
numeric_columns = ['Age', 'Height', 'Weight', 'BMI']

# Take an explicit copy: assigning scaled values into a frame that was derived
# from `df` by column selection otherwise triggers pandas'
# SettingWithCopyWarning. With the copy, `df` keeps the unscaled values and
# `X` unambiguously owns the scaled ones.
X = df[feature_columns].copy()
y = df['Breakfast Category Encoded']

# Scale numerical features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split has been made in this cell, so statistics of rows that later end up in
# a held-out set influence the scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

NameError: name 'LabelEncoder' is not defined

In [None]:
# Split data: 80/20 hold-out, stratified on the encoded target so that class
# proportions are preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training partition only (the test partition keeps its
# original distribution). k_neighbors=1 lets SMOTE interpolate even for
# classes that have very few training samples.
smote = SMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Compute class weights inversely proportional to class frequency:
#   weight(c) = total / (n_classes * count(c))
# NOTE(review): this is computed on the already-resampled training set; if
# SMOTE has equalised the class counts, every weight evaluates to 1.0 and the
# weighting has no effect. Kept so that the training call is unchanged.
class_counts = Counter(y_train)
total_samples = sum(class_counts.values())
class_weights = {cls: total_samples / (len(class_counts) * count) for cls, count in class_counts.items()}

# One weight per training row. The loop variable is deliberately not called
# `y`, to avoid confusion with the target Series of that name.
train_sample_weights = np.asarray([class_weights[label] for label in y_train])

# Initialize and train XGBoost
xgb_model = XGBClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=6)
xgb_model.fit(X_train, y_train, sample_weight=train_sample_weights)

# Predict and evaluate on the untouched test partition.
y_pred = xgb_model.predict(X_test)
print("\nXGBoost Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
# Pass the full label set explicitly: without `labels`, classification_report
# infers the labels from y_test/y_pred and raises ValueError when that set is
# smaller than `target_names` (i.e. when a rare class is missing from both).
# LabelEncoder assigns the codes 0..n_classes-1 in the order of `le.classes_`.
encoded_labels = np.arange(len(le.classes_))
print(classification_report(y_test, y_pred, labels=encoded_labels,
                            target_names=le.classes_, zero_division=0))

# Feature importance, highest first.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': xgb_model.feature_importances_}
).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

In [None]:
# Save model
joblib.dump(xgb_model, "xgboost_breakfast_model.pkl")
print("\nXGBoost model saved as 'xgboost_breakfast_model.pkl'")

# Also persist the fitted preprocessing objects. The model was trained on
# scaled features and integer-encoded labels, so loading it later is only
# useful together with the same scaler (to transform new inputs) and the same
# label encoder (to map predicted codes back to category names).
joblib.dump(scaler, "xgboost_breakfast_scaler.pkl")
joblib.dump(le, "xgboost_breakfast_label_encoder.pkl")
print("Scaler saved as 'xgboost_breakfast_scaler.pkl'")
print("Label encoder saved as 'xgboost_breakfast_label_encoder.pkl'")