In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, classification_report, f1_score
import lightgbm as lgb
import joblib

# Load dataset
df = pd.read_csv("Personalized_Diet_RecommendationsDC.csv")

# Define features (37) and target
features = ['Age', 'Height_cm', 'Weight_kg', 'BMI', 'Blood_Pressure_Systolic',
            'Blood_Pressure_Diastolic', 'Cholesterol_Level', 'Blood_Sugar_Level',
            'Daily_Steps', 'Exercise_Frequency', 'Sleep_Hours', 'Caloric_Intake',
            'Protein_Intake', 'Carbohydrate_Intake', 'Fat_Intake', 'Gender_Female',
            'Gender_Male', 'Chronic_Disease_Diabetes', 'Chronic_Disease_Heart_Disease',
            'Chronic_Disease_Hypertension', 'Chronic_Disease_Obesity',
            'Allergies_Gluten_Intolerance', 'Allergies_Lactose_Intolerance',
            'Allergies_Nut_Allergy', 'Dietary_Habits_Keto', 'Dietary_Habits_Vegan',
            'Dietary_Habits_Vegetarian', 'Preferred_Cuisine_Asian',
            'Preferred_Cuisine_Indian', 'Preferred_Cuisine_Mediterranean',
            'Preferred_Cuisine_Western', 'Food_Aversions_Salty', 'Food_Aversions_Spicy',
            'Food_Aversions_Sweet', 'Genetic_Risk_Factor', 'Alcohol_Consumption',
            'Smoking_Habit']
X = df[features]
y = df['Recommended_Meal_Plan']

# Define numeric columns for scaling
numeric_cols = ['Age', 'Height_cm', 'Weight_kg', 'BMI', 'Blood_Pressure_Systolic',
                'Blood_Pressure_Diastolic', 'Cholesterol_Level', 'Blood_Sugar_Level',
                'Daily_Steps', 'Exercise_Frequency', 'Sleep_Hours', 'Caloric_Intake',
                'Protein_Intake', 'Carbohydrate_Intake', 'Fat_Intake']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Compute class weights
class_weights = dict(zip(
    y_train.value_counts().index,
    len(y_train) / (y_train.value_counts() * len(y_train.unique()))
))
weight_array = y_train.map(class_weights).values

# Train LightGBM
train_data = lgb.Dataset(X_train, label=y_train, weight=weight_array)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y_train)),
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'random_state': 42
}
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
)

# Predict and evaluate
y_pred_proba = model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("LightGBM Balanced Accuracy:", balanced_acc)
print("Macro F1-Score:", macro_f1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature Importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importance(importance_type='gain')
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance (Top 10):")
print(feature_importance.head(10))

# Save model and scaler
#model.save_model("lightgbm_model.txt")
#joblib.dump(scaler, "scaler_lightgbm.pkl")
#print("\nModel saved as 'lightgbm_model.txt'")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2079
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.44966
Early stopping, best iteration is:
[1]	valid_0's multi_logloss: 1.38664
LightGBM Balanced Accuracy: 0.22576149896090889
Macro F1-Score: 0.224311769336614

Classification Report:
              precision    recall  f1-score   support

           0       0.27      0.24      0.25       248
           1       0.23      0.19      0.21       259
           2       0.25      0.22      0.23       282
