In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from that mount
# further down. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install LightGBM (if not installed)
!pip install lightgbm



In [None]:
# Import Libraries
import json
import re
import string

import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

In [None]:
# Read the recipe CSV from the mounted Drive (assumed to be the RecipeNLG export)
dataset_path = "/content/drive/MyDrive/Colab Notebooks/full_dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows to see which columns were loaded
df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [None]:
# Count missing values per column
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
title,1
ingredients,0
directions,0
link,0
source,0
NER,0


In [None]:
# Keep only the recipes that have a title (drops rows where 'title' is null)
has_title = df['title'].notna()
df = df[has_title]

In [None]:
# Re-count missing values per column after dropping the untitled row
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
ingredients,0
directions,0
link,0
source,0
NER,0


In [None]:
# Restrict the frame to the columns used downstream and discard incomplete rows
relevant_columns = ['title', 'ingredients', 'NER']
df = df.loc[:, relevant_columns].dropna()

In [None]:
# Preview the first ten rows of the reduced frame
df.head(10)

Unnamed: 0,title,ingredients,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""peanut butter"", ""graham cracker crumbs"", ""bu..."
5,Cheeseburger Potato Soup,"[""6 baking potatoes"", ""1 lb. of extra lean gro...","[""baking potatoes"", ""extra lean ground beef"", ..."
6,Rhubarb Coffee Cake,"[""1 1/2 c. sugar"", ""1/2 c. butter"", ""1 egg"", ""...","[""sugar"", ""butter"", ""egg"", ""buttermilk"", ""flou..."
7,Scalloped Corn,"[""1 can cream-style corn"", ""1 can whole kernel...","[""cream-style corn"", ""whole kernel corn"", ""cra..."
8,Nolan'S Pepper Steak,"[""1 1/2 lb. round steak (1-inch thick), cut in...","[""tomatoes"", ""water"", ""onions"", ""Worcestershir..."
9,Millionaire Pie,"[""1 large container Cool Whip"", ""1 large can c...","[""pineapple"", ""condensed milk"", ""lemons"", ""pec..."


In [None]:
# Build one compiled, case-insensitive, whole-word pattern per keyword
def get_regex_patterns(items):
    """Compile a whole-word, case-insensitive regex for every keyword in ``items``.

    Besides the keyword itself, each pattern accepts its regular English
    plural, so a singular keyword such as "potato" also matches "potatoes",
    "egg" matches "eggs" and "anchovy" matches "anchovies". Without this the
    surrounding ``\\b`` anchors reject plural ingredient names entirely.

    Args:
        items: iterable of keyword strings (single- or multi-word).

    Returns:
        list of compiled ``re.Pattern`` objects, in the same order as ``items``.
    """
    patterns = []
    for item in items:
        ends_in_consonant_y = (
            len(item) > 1
            and item[-1] in 'yY'
            and item[-2].lower() not in 'aeiou'
        )
        if ends_in_consonant_y:
            # consonant + y pluralises as "ies" (anchovy -> anchovies)
            body = re.escape(item[:-1]) + r'(?:y|ies)'
        else:
            # optional "s" / "es" suffix (egg -> eggs, potato -> potatoes)
            body = re.escape(item) + r'(?:e?s)?'
        patterns.append(re.compile(r'\b' + body + r'\b', re.IGNORECASE))
    return patterns

# Helper: does any pattern match any of the (lower-cased) ingredient strings?
def contains_any(ingredients, patterns):
    """Return True when at least one pattern matches at least one ingredient.

    Every ingredient string is lower-cased before it is searched.

    Args:
        ingredients: iterable of ingredient strings.
        patterns: iterable of compiled regex patterns.

    Returns:
        bool: True on the first match found, False when nothing matches
        (including when either argument is empty).
    """
    lowered = [entry.lower() for entry in ingredients]
    for pattern in patterns:
        for entry in lowered:
            if pattern.search(entry):
                return True
    return False

# Keyword rules for every diet tag, in the order the tags are reported.
# Each entry is (required, forbidden):
#   required  - at least one of these keywords must appear (empty = no requirement)
#   forbidden - none of these keywords may appear (empty = nothing is excluded)
_TAG_KEYWORDS = {
    "high-protein": (["chicken", "beef", "tofu", "lentil", "egg", "salmon", "tuna", "protein", "chickpea", "bean", "yogurt"], []),
    "low-carb": ([], ["bread", "rice", "pasta", "potato", "flour", "sugar", "corn", "tortilla", "cracker"]),
    "vegan": ([], ["chicken", "beef", "pork", "fish", "salmon", "tuna", "shrimp", "crab", "lobster", "anchovy", "meat", "egg", "milk", "cheese", "butter", "cream", "yogurt", "honey", "gelatin", "lard"]),
    "vegetarian": ([], ["chicken", "beef", "pork", "fish", "salmon", "tuna", "shrimp", "crab", "lobster", "anchovy", "meat", "gelatin", "lard", "bacon", "ham"]),
    "gluten-free": ([], ["wheat", "flour", "barley", "bread", "pasta", "rye", "malt", "couscous"]),
    "dairy-free": ([], ["milk", "cheese", "butter", "cream", "yogurt", "ghee", "condensed milk"]),
    "keto": (["avocado", "egg", "cheese", "chicken", "beef", "butter", "olive oil", "almond", "bacon", "salmon", "tuna"],
             ["sugar", "honey", "bread", "rice", "potato", "corn", "flour", "bean"]),
    "nut-free": ([], ["almond", "cashew", "walnut", "peanut", "pecan", "hazelnut", "pistachio", "macadamia"]),
    "soy-free": ([], ["soy", "tofu", "soybean", "soy sauce", "edamame"]),
    "halal": ([], ["pork", "bacon", "ham", "gelatin"]),
    "pescatarian": (["fish", "salmon", "tuna", "shrimp", "crab", "lobster"], ["chicken", "beef", "meat"]),
    "low-fat": ([], ["butter", "oil", "cream", "fat", "mayonnaise", "cheese", "bacon"]),
    "sugar-free": ([], ["sugar", "honey", "syrup", "chocolate", "maple syrup"]),
    "low-sodium": ([], ["salt", "soy sauce", "bacon", "ham", "processed cheese", "bouillon"]),
    "diabetic-friendly": ([], ["sugar", "syrup", "white bread", "pastry", "dessert"]),
    "paleo": (["meat", "fish", "egg", "vegetable", "fruit", "nut", "seed"],
              ["bread", "pasta", "rice", "bean", "peanut", "corn", "cheese", "milk", "butter"]),
    "egg-free": ([], ["egg", "egg white", "egg yolk"]),
    "kosher": ([], ["pork", "bacon", "ham", "shrimp", "crab", "lobster", "shellfish"]),
    "high-fiber": (["oat", "whole grain", "bran", "lentil", "bean", "broccoli", "spinach", "apple", "banana", "chickpea"], []),
}

# Compile every rule a single time when this cell runs, instead of rebuilding
# all keyword patterns again for each recipe that is tagged.
_TAG_RULES = [
    (tag, get_regex_patterns(required), get_regex_patterns(forbidden))
    for tag, (required, forbidden) in _TAG_KEYWORDS.items()
]

# Function to assign tags based on the ingredients
def get_tags(ingredients):
    """Return the diet tags whose keyword rules are satisfied by ``ingredients``.

    A tag is assigned when at least one of its required keywords is found (if
    it has any) and none of its forbidden keywords is found. These are keyword
    heuristics over the ingredient text, not verified dietary facts.

    Args:
        ingredients: iterable of ingredient strings for one recipe.

    Returns:
        list of tag names, in the order the rules are declared.
    """
    ingredients = [i.lower() for i in ingredients]  # Lowercase everything

    tags = []
    for tag, required, forbidden in _TAG_RULES:
        if required and not contains_any(ingredients, required):
            continue  # none of the required keywords is present
        if contains_any(ingredients, forbidden):
            continue  # a forbidden keyword is present
        tags.append(tag)
    return tags

# Only keep relevant columns
df = df[['title', 'ingredients', 'NER']].dropna()


def _parse_ingredient_list(raw):
    """Decode one raw 'ingredients' cell into a list of ingredient strings.

    The cells shown in the preview look like JSON-encoded lists of strings
    (e.g. '["1 c. sugar", "1 egg"]'). Splitting such a string on commas leaves
    brackets and quotes in the pieces and breaks ingredients that themselves
    contain a comma, so the cell is decoded as JSON instead.

    Args:
        raw: the string stored in the 'ingredients' column.

    Returns:
        list of str. If ``raw`` is not valid JSON, the previous
        comma-splitting behaviour is used as a fallback.
    """
    try:
        parsed = json.loads(raw)
    except ValueError:
        return raw.split(',')
    if isinstance(parsed, list):
        return [str(entry) for entry in parsed]
    # Valid JSON but not a list (e.g. a bare string or number): treat as one item
    return [str(parsed)]


# Apply the get_tags function to the parsed 'ingredients' column
df['tags'] = df['ingredients'].apply(lambda x: get_tags(_parse_ingredient_list(x)))


In [None]:
# Show the assigned tags next to each recipe title
df.loc[:, ['title', 'tags']].head(10)

Unnamed: 0,title,tags
0,No-Bake Nut Cookies,"[vegetarian, gluten-free, nut-free, soy-free, ..."
1,Jewell Ball'S Chicken,"[high-protein, low-carb, gluten-free, keto, nu..."
2,Creamy Corn,"[vegetarian, gluten-free, nut-free, soy-free, ..."
3,Chicken Funny,"[high-protein, low-carb, gluten-free, keto, nu..."
4,Reeses Cups(Candy),"[vegetarian, gluten-free, soy-free, halal, low..."
5,Cheeseburger Potato Soup,"[high-protein, low-carb, gluten-free, keto, nu..."
6,Rhubarb Coffee Cake,"[high-protein, vegetarian, nut-free, soy-free,..."
7,Scalloped Corn,"[high-protein, vegetarian, gluten-free, nut-fr..."
8,Nolan'S Pepper Steak,"[low-carb, vegan, vegetarian, gluten-free, dai..."
9,Millionaire Pie,"[vegetarian, gluten-free, nut-free, soy-free, ..."


In [None]:
# Preview the recipes that received at least one tag
df.loc[df['tags'].map(len) > 0].head(10)

Unnamed: 0,title,ingredients,NER,tags
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[vegetarian, gluten-free, nut-free, soy-free, ..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""beef"", ""chicken breasts"", ""cream of mushroom...","[high-protein, low-carb, gluten-free, keto, nu..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[vegetarian, gluten-free, nut-free, soy-free, ..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""chicken"", ""chicken gravy"", ""cream of mushroo...","[high-protein, low-carb, gluten-free, keto, nu..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""peanut butter"", ""graham cracker crumbs"", ""bu...","[vegetarian, gluten-free, soy-free, halal, low..."
5,Cheeseburger Potato Soup,"[""6 baking potatoes"", ""1 lb. of extra lean gro...","[""baking potatoes"", ""extra lean ground beef"", ...","[high-protein, low-carb, gluten-free, keto, nu..."
6,Rhubarb Coffee Cake,"[""1 1/2 c. sugar"", ""1/2 c. butter"", ""1 egg"", ""...","[""sugar"", ""butter"", ""egg"", ""buttermilk"", ""flou...","[high-protein, vegetarian, nut-free, soy-free,..."
7,Scalloped Corn,"[""1 can cream-style corn"", ""1 can whole kernel...","[""cream-style corn"", ""whole kernel corn"", ""cra...","[high-protein, vegetarian, gluten-free, nut-fr..."
8,Nolan'S Pepper Steak,"[""1 1/2 lb. round steak (1-inch thick), cut in...","[""tomatoes"", ""water"", ""onions"", ""Worcestershir...","[low-carb, vegan, vegetarian, gluten-free, dai..."
9,Millionaire Pie,"[""1 large container Cool Whip"", ""1 large can c...","[""pineapple"", ""condensed milk"", ""lemons"", ""pec...","[vegetarian, gluten-free, nut-free, soy-free, ..."


In [None]:
# Drop recipes whose tag list is empty, then preview the result
tagged_mask = df['tags'].map(len) > 0
df = df[tagged_mask]
df.head()

Unnamed: 0,title,ingredients,NER,tags
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[vegetarian, gluten-free, nut-free, soy-free, ..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""beef"", ""chicken breasts"", ""cream of mushroom...","[high-protein, low-carb, gluten-free, keto, nu..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[vegetarian, gluten-free, nut-free, soy-free, ..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""chicken"", ""chicken gravy"", ""cream of mushroo...","[high-protein, low-carb, gluten-free, keto, nu..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""peanut butter"", ""graham cracker crumbs"", ""bu...","[vegetarian, gluten-free, soy-free, halal, low..."


In [None]:
# Label names. The order of this list is the column order of the label matrix
# and is also used to decode model predictions back into tag names.
diet_tags = [
    'vegan',
    'vegetarian',
    'gluten-free',
    'low-carb',
    'keto',
    'paleo',
    'high-protein',
    'low-fat',
    'dairy-free',
    'nut-free',
    'soy-free',
    'egg-free',
    'sugar-free',
    'halal',
    'kosher',
    'high-fiber',
    'pescatarian',
    'diabetic-friendly',
    'low-sodium',
]

In [None]:
# Add one 0/1 indicator column per diet tag (1 when the recipe carries that tag)
for tag in diet_tags:
    df.loc[:, tag] = df['tags'].map(lambda recipe_tags: int(tag in recipe_tags))

In [None]:
# Model input: recipe title followed by the raw ingredients string.
# Model targets: the 0/1 indicator columns, in diet_tags order.
title_and_ingredients = df['title'] + ' ' + df['ingredients']
df['text'] = title_and_ingredients
X, y = df['text'], df[diet_tags]

In [None]:
# Train-test split on the raw text, done BEFORE the vectorizer is fitted.
# With the same random_state and row count the rows land in the same split as
# when the split was applied to the already-vectorized matrix.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization (reduced features).
# The vocabulary and IDF weights are learned from the training split only, so
# held-out recipes cannot influence the features they are later evaluated on.
# (The full-corpus matrix X_tfidf is therefore no longer built.)
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

In [None]:
# LightGBM base estimator with fixed hyper-parameters
lgbm_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': 5,
    'random_state': 42,
    'n_jobs': -1,  # use all CPU cores
}
lgbm_base = LGBMClassifier(**lgbm_params)

In [None]:
# Wrap the base estimator so that one classifier is trained per label column
model = MultiOutputClassifier(estimator=lgbm_base)

In [None]:
# Fit model: trains the wrapped LightGBM estimator once for every label column
# in y_train (the log below shows one training run per diet tag).
model.fit(X_train, y_train)



[LightGBM] [Info] Number of positive: 416409, number of negative: 1368503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 21.615614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233294 -> initscore=-1.189805
[LightGBM] [Info] Start training from score -1.189805




[LightGBM] [Info] Number of positive: 1218581, number of negative: 566331
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 18.963684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.682712 -> initscore=0.766264
[LightGBM] [Info] Start training from score 0.766264




[LightGBM] [Info] Number of positive: 1188393, number of negative: 596519
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 18.482398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.665799 -> initscore=0.689246
[LightGBM] [Info] Start training from score 0.689246




[LightGBM] [Info] Number of positive: 690784, number of negative: 1094128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 18.594096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.387013 -> initscore=-0.459886
[LightGBM] [Info] Start training from score -0.459886




[LightGBM] [Info] Number of positive: 486143, number of negative: 1298769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 18.822222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.272362 -> initscore=-0.982669
[LightGBM] [Info] Start training from score -0.982669




[LightGBM] [Info] Number of positive: 140428, number of negative: 1644484
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.677310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078675 -> initscore=-2.460487
[LightGBM] [Info] Start training from score -2.460487




[LightGBM] [Info] Number of positive: 604435, number of negative: 1180477
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 19.545497 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.338636 -> initscore=-0.669380
[LightGBM] [Info] Start training from score -0.669380




[LightGBM] [Info] Number of positive: 468377, number of negative: 1316535
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 19.519617 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.262409 -> initscore=-1.033485
[LightGBM] [Info] Start training from score -1.033485




[LightGBM] [Info] Number of positive: 694048, number of negative: 1090864
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.143172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388842 -> initscore=-0.452184
[LightGBM] [Info] Start training from score -0.452184




[LightGBM] [Info] Number of positive: 1684115, number of negative: 100797
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 20.622430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.943528 -> initscore=2.815887
[LightGBM] [Info] Start training from score 2.815887




[LightGBM] [Info] Number of positive: 1709530, number of negative: 75382
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.774830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.957767 -> initscore=3.121405
[LightGBM] [Info] Start training from score 3.121405




[LightGBM] [Info] Number of positive: 1572626, number of negative: 212286
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.936490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.881066 -> initscore=2.002568
[LightGBM] [Info] Start training from score 2.002568




[LightGBM] [Info] Number of positive: 1002745, number of negative: 782167
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.808619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.561790 -> initscore=0.248428
[LightGBM] [Info] Start training from score 0.248428




[LightGBM] [Info] Number of positive: 1615739, number of negative: 169173
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 19.831159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.905221 -> initscore=2.256626
[LightGBM] [Info] Start training from score 2.256626




[LightGBM] [Info] Number of positive: 1593352, number of negative: 191560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.011059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.892678 -> initscore=2.118394
[LightGBM] [Info] Start training from score 2.118394




[LightGBM] [Info] Number of positive: 163419, number of negative: 1621493
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 19.337560 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.091556 -> initscore=-2.294785
[LightGBM] [Info] Start training from score -2.294785




[LightGBM] [Info] Number of positive: 67454, number of negative: 1717458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 19.329961 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037791 -> initscore=-3.237155
[LightGBM] [Info] Start training from score -3.237155




[LightGBM] [Info] Number of positive: 1061338, number of negative: 723574
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 21.952374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.594616 -> initscore=0.383083
[LightGBM] [Info] Start training from score 0.383083




[LightGBM] [Info] Number of positive: 820371, number of negative: 964541
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 20.885886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 239822
[LightGBM] [Info] Number of data points in the train set: 1784912, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.459614 -> initscore=-0.161896
[LightGBM] [Info] Start training from score -0.161896


In [None]:
# Score the held-out split and print per-tag precision / recall / F1
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=diet_tags)
print(report)



                   precision    recall  f1-score   support

            vegan       0.99      0.97      0.98    103703
       vegetarian       1.00      0.99      0.99    304415
      gluten-free       1.00      0.99      1.00    297491
         low-carb       1.00      0.98      0.99    172821
             keto       0.98      0.99      0.98    121936
            paleo       0.92      0.95      0.94     35477
     high-protein       0.98      0.99      0.99    151501
          low-fat       1.00      0.98      0.99    116625
       dairy-free       1.00      0.99      1.00    173046
         nut-free       1.00      1.00      1.00    421017
         soy-free       1.00      1.00      1.00    427121
         egg-free       1.00      1.00      1.00    392903
       sugar-free       1.00      1.00      1.00    251218
            halal       1.00      1.00      1.00    404075
           kosher       1.00      1.00      1.00    398426
       high-fiber       0.98      0.96      0.97     40

In [None]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer to the working directory
joblib.dump(value=model, filename="best_diet_model.pkl")
joblib.dump(value=tfidf, filename="tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [None]:
# Real-life Test Scenario
def predict_tags(dish_description):
    """Predict diet tags for a free-text dish description.

    The saved classifier and TF-IDF vectorizer are loaded from disk on every
    call, the description is vectorized, and each label predicted as 1 is
    mapped back to its name through the module-level ``diet_tags`` list.

    Args:
        dish_description: text describing the dish.

    Returns:
        str: "Predicted tags: ..." with the comma-separated tag names, or
        "No specific diet category matched." when no label is predicted.
    """
    classifier = joblib.load("best_diet_model.pkl")
    tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
    features = tfidf_vectorizer.transform([dish_description])
    flags = classifier.predict(features)[0]  # single input row -> first prediction row

    matched = []
    for name, flag in zip(diet_tags, flags):
        if flag == 1:
            matched.append(name)

    if not matched:
        return "No specific diet category matched."
    return "Predicted tags: " + ", ".join(matched)

# Example: run the saved pipeline on one hand-written description
example_dish = "Lentils with olive oil, spinach, and avocado"
example_result = predict_tags(example_dish)
print(example_result)




Predicted tags: vegan, vegetarian, gluten-free, low-carb, keto, dairy-free, nut-free, soy-free, egg-free, sugar-free, halal, kosher, high-fiber, diabetic-friendly, low-sodium


In [None]:
# New sample
new_dish = {
    'title': 'Galaxy Smoothie Bowl',
    'ingredients': 'banana, blue spirulina, coconut milk, chia seeds, dragonfruit, blueberries'
}

# The model was trained on "<title> <ingredients>", so the inference text is
# built the same way; feeding the ingredients alone would not match training.
new_text = new_dish['title'] + ' ' + new_dish['ingredients']

# Vectorize with the saved TF-IDF vectorizer (same preprocessing as training)
vectorizer = joblib.load("tfidf_vectorizer.pkl")
new_text_transformed = vectorizer.transform([new_text])

# Predict tags. predict() already returns hard 0/1 labels per tag, so no
# probability threshold is applied here.
predicted_tags = model.predict(new_text_transformed)

# Label names, in the same order as the model's output columns
labels = diet_tags

# Decode predicted tags
predicted_labels = [label for label, tag in zip(labels, predicted_tags[0]) if tag == 1]
print(predicted_labels)


['vegetarian', 'gluten-free', 'low-carb', 'low-fat', 'nut-free', 'soy-free', 'egg-free', 'sugar-free', 'halal', 'kosher', 'high-fiber', 'diabetic-friendly', 'low-sodium']


