In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the cleaned gym-and-diet recommendation table.
df = pd.read_csv("GymAndDietRecommendationCleaned.csv")

# Predictor columns fed to the model; the target is the Diet_Label column.
feature_columns = [
    'Sex',
    'Hypertension',
    'Diabetes',
    'Level',
    'Fitness Goal',
    'Fitness Type',
    'Age',
    'Height',
    'Weight',
    'BMI',
]
X = df[feature_columns]
y = df['Diet_Label']

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the minority classes of the training split with SMOTE.
# k_neighbors is lowered to 1 because class 3 has only 5 instances
# (per the original author's note).
smote = SMOTE(k_neighbors=1, random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Base Random Forest and the hyper-parameter grid explored by GridSearchCV.
rf = RandomForestClassifier(random_state=42)
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

In [3]:
# Resample inside each cross-validation fold instead of before the search.
# Searching over data that was SMOTE-resampled up front puts synthetic rows,
# interpolated from a fold's own real rows, into the other folds; the CV
# scores are then optimistic and can pick the wrong hyper-parameters.
# An imblearn Pipeline runs the sampler during fit only, never during predict.
# NOTE(review): every training fold must still hold at least k_neighbors + 1
# rows of each class for SMOTE to run -- confirm for the rarest class.
smote_rf = Pipeline(steps=[('smote', smote), ('rf', rf)])
pipeline_grid = {f'rf__{name}': values for name, values in param_grid.items()}

grid_rf = GridSearchCV(smote_rf, pipeline_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_rf.fit(X_train, y_train)

# Best model: the search refits the winning pipeline (SMOTE + forest) on the
# whole training split; pull out the fitted forest so best_rf is still a
# RandomForestClassifier exposing feature_importances_.
best_rf = grid_rf.best_estimator_.named_steps['rf']
rf_pred = best_rf.predict(X_test)

# Report the tuned values under the forest's own parameter names
# (drop the 'rf__' pipeline-step prefix).
best_params = {name.split('__', 1)[1]: value for name, value in grid_rf.best_params_.items()}

# Evaluate on the untouched (never resampled) test split.
print("Random Forest Results for Diet_Label (SMOTE + Tuning):")
print("Best Parameters:", best_params)
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_pred, zero_division=0))

# Feature importance, most influential feature first.
rf_importance = pd.DataFrame({'Feature': X.columns, 'Importance': best_rf.feature_importances_}).sort_values(by='Importance', ascending=False)
print("\nRandom Forest Feature Importance:\n", rf_importance)

KeyboardInterrupt: 