In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import ADASYN
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load dataset
df = pd.read_csv("nutrition_dataset_cleaned.csv")

# Fix duplicate goal columns
if 'Goal_Weight Maintenance' in df.columns and 'Goal_Maintenance' in df.columns:
    df['Goal_Maintenance'] = df[['Goal_Maintenance', 'Goal_Weight Maintenance']].max(axis=1)
    df = df.drop(columns=['Goal_Weight Maintenance'])

# Create new feature: BMI
df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2

# Merge low-sample categories
df['Breakfast Category'] = df['Breakfast Category'].replace(['yogurt-based', 'egg-based', 'pancake-based'], 'other')
le = LabelEncoder()
df['Breakfast Category Encoded'] = le.fit_transform(df['Breakfast Category'])

# Define features and target
X = df[['Age', 'Height', 'BMI', 'Diet_Omnivore', 'Diet_Vegan', 'Diet_Vegetarian', 'Goal_Muscle Gain', 'Goal_Weight Loss']]
y = df['Breakfast Category Encoded']

# Scale numerical features
scaler = StandardScaler()
X[['Age', 'Height', 'BMI']] = scaler.fit_transform(X[['Age', 'Height', 'BMI']])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply ADASYN
adasyn = ADASYN(random_state=42, sampling_strategy='auto')
X_train, y_train = adasyn.fit_resample(X_train, y_train)

# Initialize LightGBM
lgbm_model = LGBMClassifier(random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 20, 31],
    'min_child_samples': [10, 20]
}
grid_search = GridSearchCV(lgbm_model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_lgbm = grid_search.best_estimator_
print("\nBest LightGBM Parameters:", grid_search.best_params_)

# Predict and evaluate
y_pred = best_lgbm.predict(X_test)
print("\nLightGBM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))

# Feature importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': best_lgbm.feature_importances_}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Age', 'Height', 'BMI']] = scaler.fit_transform(X[['Age', 'Height', 'BMI']])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83
[LightGBM] [Info] Number of data points in the train set: 159, number of used features: 8
[LightGBM] [Info] Start training from score -1.542544
[LightGBM] [Info] Start training from score -1.431318
[LightGBM] [Info] Start training from score -0.602996

Best LightGBM Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'num_leaves': 20}

LightGBM Results:
Accuracy: 0.66
Classification Report:
               precision    recall  f1-score   support

oatmeal-based       0.39      0.38      0.38        24
        other       0.72      0.63      0.68        41
   tofu-based       0.76      0.89      0.82        35

     accuracy                           0.66       100
    macro avg       0.62      0.63      0.62       

In [None]:
# Save model
joblib.dump(best_lgbm, "lightgbm_optimized_breakfast_model.pkl")
print("\nLightGBM model saved as 'lightgbm_optimized_breakfast_model.pkl'")