In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [3]:


# Step 1: Load the Dataset
data = pd.read_csv('synthetic_dish_data.csv')

# Step 2: Data Cleaning
# Dropping duplicates
data_cleaned = data.drop_duplicates()

# Dropping rows with missing critical values
data_cleaned = data_cleaned.dropna(subset=['Dish', 'Dieting_Level', 'Spice_Level', 'Time_of_Day', 'Age_Group', 'Preferred_Cuisine'])

# Step 3: Feature Encoding
label_encoder = LabelEncoder()

data_cleaned['Dieting_Level'] = label_encoder.fit_transform(data_cleaned['Dieting_Level'])
data_cleaned['Spice_Level'] = label_encoder.fit_transform(data_cleaned['Spice_Level'])
data_cleaned['Time_of_Day'] = label_encoder.fit_transform(data_cleaned['Time_of_Day'])
data_cleaned['Age_Group'] = label_encoder.fit_transform(data_cleaned['Age_Group'])
data_cleaned['Preferred_Cuisine'] = label_encoder.fit_transform(data_cleaned['Preferred_Cuisine'])
data_cleaned['Dish'] = label_encoder.fit_transform(data_cleaned['Dish'])

# Step 4: Splitting data into features (X) and target (y)
X = data_cleaned[['Dieting_Level', 'Spice_Level', 'Time_of_Day', 'Age_Group', 'Preferred_Cuisine']]
y = data_cleaned['Dish']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Testing Multiple Classifiers

# Logistic Regression
logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)

# K-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Support Vector Classifier (SVM)
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Print the accuracies of each model
print(f"Logistic Regression Accuracy: {accuracy_logistic * 100:.2f}%")
print(f"K-Nearest Neighbors Accuracy: {accuracy_knn * 100:.2f}%")
print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(f"XGBoost Accuracy: {accuracy_xgb * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.



Logistic Regression Accuracy: 53.92%
K-Nearest Neighbors Accuracy: 38.24%
SVM Accuracy: 54.90%
Random Forest Accuracy: 35.29%
XGBoost Accuracy: 40.20%


### The accuracy results you're getting indicate that the models might not be learning the patterns effectively from the current feature set, or the dataset might need further refinement. Here are a few steps you can take to improve the model performance:

#### 1. Feature Engineering
Add more relevant features or refine existing ones. For example, you could create new features based on existing data, such as interaction terms (combinations of Dieting_Level and Spice_Level).
Normalize or scale features if they vary widely in range.
#### 2. Handling Imbalanced Data
If the target variable (Dish) is imbalanced, some models may perform poorly. Use techniques like oversampling, undersampling, or SMOTE (Synthetic Minority Over-sampling Technique) to balance the dataset.
#### 3. Model Hyperparameter Tuning
Use techniques like GridSearchCV or RandomizedSearchCV to optimize hyperparameters of each model. For instance:
K-Nearest Neighbors: Tune the number of neighbors.
SVM: Tune the regularization parameter C and kernel type.
Random Forest: Tune the number of estimators and max depth.
XGBoost: Tune learning rate, depth, and estimators.
#### 4. Cross-Validation
Implement cross-validation to get a more robust estimate of the model’s performance and reduce variance due to random splits in train/test data.
#### 5. Examine Feature Importance
For models like Random Forest and XGBoost, you can plot the feature importance to see which features are most impactful. You may also try removing less important features.
#### 6. Data Augmentation or Enrichment
If you feel the dataset might be lacking, consider gathering more data or performing data augmentation if possible.

### The dataset contains 1,000 entries with the following six columns, all of which are categorical:

Dish – Target variable (the type of dish).
Dieting_Level – Level of dieting (Low, Medium, High).
Spice_Level – Spice preference (Low, Medium, Spicy).
Time_of_Day – Time of meal (Breakfast, Lunch, Dinner).
Age_Group – Age group of the person (Child, Adult, Senior).
Preferred_Cuisine – Preferred cuisine (Indian, Chinese, Italian, etc.).
Let's proceed with the following steps:

Feature Engineering: Explore interactions between features and scaling if necessary.
Handling Imbalanced Data: Check for imbalance in the target (Dish) and consider techniques if needed.
Hyperparameter Tuning: Implement GridSearchCV for model tuning.
Cross-Validation: Use cross-validation for a better performance estimate.
Feature Importance: Analyze the importance of features using Random Forest and XGBoost.
Data Augmentation/Enrichment: Explore ways to increase dataset richness if necessary.

In [4]:
# Adding interaction terms
data['Diet_Spice_Interaction'] = data['Dieting_Level'] + '_' + data['Spice_Level']
data['Time_Cuisine_Interaction'] = data['Time_of_Day'] + '_' + data['Preferred_Cuisine']


In [5]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Initialize label encoder
label_encoder = LabelEncoder()

# Encoding categorical features
data_encoded = data.copy()
categorical_columns = ['Dieting_Level', 'Spice_Level', 'Time_of_Day', 'Age_Group', 'Preferred_Cuisine', 'Diet_Spice_Interaction', 'Time_Cuisine_Interaction']

for col in categorical_columns:
    data_encoded[col] = label_encoder.fit_transform(data_encoded[col])

# Splitting data into features and target variable
X = data_encoded.drop(columns=['Dish'])
y = label_encoder.fit_transform(data_encoded['Dish'])

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)



In [6]:
from sklearn.model_selection import GridSearchCV

# Example for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None]
}

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_resampled, y_resampled)

best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_


In [7]:
from sklearn.model_selection import cross_val_score

# Example for Logistic Regression
logistic_model = LogisticRegression(max_iter=200)
cv_scores = cross_val_score(logistic_model, X_resampled, y_resampled, cv=5)

print(f"Cross-validated scores: {cv_scores}")
print(f"Mean accuracy: {cv_scores.mean()}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validated scores: [0.63461538 0.62019231 0.68269231 0.67307692 0.71875   ]
Mean accuracy: 0.6658653846153847


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Random Forest feature importance
rf_model.fit(X_resampled, y_resampled)
importances = rf_model.feature_importances_
feature_names = X_resampled.columns
sorted_importances = sorted(zip(importances, feature_names), reverse=True)

# Display feature importance
for importance, name in sorted_importances:
    print(f"{name}: {importance:.4f}")


Time_Cuisine_Interaction: 0.3471
Preferred_Cuisine: 0.2736
Time_of_Day: 0.1468
Age_Group: 0.0849
Diet_Spice_Interaction: 0.0751
Spice_Level: 0.0430
Dieting_Level: 0.0294


### Train and Evaluate Models
Now that your data is balanced, you can train and evaluate different machine learning models to compare their performance. Below is a guide to proceed:

Split the Data: You should split the resampled dataset into training and testing sets.
Train Multiple Classifiers: Use models like Logistic Regression, K-Nearest Neighbors, Random Forest, SVM, and XGBoost.
Evaluate Performance: Compare the accuracy of each model.
Here’s the code to implement these steps:

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Step 1: Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 2: Train and evaluate multiple classifiers

# Logistic Regression
logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)

# K-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Support Vector Classifier (SVM)
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Step 3: Print the accuracies of each model
print(f"Logistic Regression Accuracy: {accuracy_logistic * 100:.2f}%")
print(f"K-Nearest Neighbors Accuracy: {accuracy_knn * 100:.2f}%")
print(f"SVM Accuracy: {accuracy_svm * 100:.2f}%")
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(f"XGBoost Accuracy: {accuracy_xgb * 100:.2f}%")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



Logistic Regression Accuracy: 67.79%
K-Nearest Neighbors Accuracy: 69.71%
SVM Accuracy: 64.18%
Random Forest Accuracy: 71.88%
XGBoost Accuracy: 73.80%


In [13]:
# Sample 1 with 7 features (Dieting_Level, Spice_Level, Time_of_Day, Age_Group, Preferred_Cuisine, Diet_Spice_Interaction, Time_Cuisine_Interaction)
sample_1 = [[1, 2, 1, 0, 3, 2, 3]]

# Sample 2 with 7 features
sample_2 = [[0, 3, 2, 1, 0, 3, 1]]

# Sample 3 with 7 features
sample_3 = [[2, 1, 0, 2, 4, 1, 2]]

# Assuming you have the original LabelEncoder instance for 'Dish'
# If you've already defined it as 'label_encoder' for 'Dish', you can use that

# Predictions
pred_1 = xgb_model.predict(sample_1)
pred_2 = xgb_model.predict(sample_2)
pred_3 = xgb_model.predict(sample_3)

# Convert the encoded labels back to dish names
dish_name_1 = label_encoder.inverse_transform(pred_1)
dish_name_2 = label_encoder.inverse_transform(pred_2)
dish_name_3 = label_encoder.inverse_transform(pred_3)

# Output the dish names
print(f"Prediction for Sample 1: {dish_name_1[0]}")
print(f"Prediction for Sample 2: {dish_name_2[0]}")
print(f"Prediction for Sample 3: {dish_name_3[0]}")


Prediction for Sample 1: Frittata
Prediction for Sample 2: Burger
Prediction for Sample 3: Tacos
