In [88]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, hamming_loss


In [90]:
# Load the dataset
df = pd.read_csv("open_food_facts_india_300.csv")

# Create a copy and handle missing values
df2 = df.copy()
df2['Ingredients'].fillna("", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2['Ingredients'].fillna("", inplace=True)


In [94]:
print(df2.shape)
print(df2)

(300, 14)
           Barcode                            Product Name            Brand  \
0    8901491103800                                    Oats           Quaker   
1    8906112580183                             KIMEA DATES              AMS   
2    8901030900303  International Mexican Tomato Corn Soup            Knorr   
3    8906020580084        id Organic Rice Rava Idly Batter               id   
4    8906117361008                Country Delight Cow Milk  Country Delight   
..             ...                                     ...              ...   
295  8906002006113                          Veg Mayonnaise       Dr. Oetker   
296  8904063200051                    Haldiram Bhujia 200g       Haldiram's   
297  8906017290125                                  Vedica          Bisleri   
298  8906029790033                                 Snakker        Priyagold   
299  8906006722194                           Arabian dates             Lion   

                                         

In [98]:
allergen_synonyms = {
    "milk": [
        "casein", "whey", "lactose", "lactalbumin", "curds", "ghee", "cream",
        "butter", "cheese", "yogurt", "kefir", "caseinate", "rennet casein"
    ],
    "eggs": [
        "albumin", "egg white", "egg yolk", "globulin", "livetin", "lysozyme",
        "ovoglobulin", "ovalbumin", "ovomucoid", "ovovitellin", "silici albuminate"
    ],
    "peanuts": [
        "groundnut", "arachis oil", "monkey nut", "goober", "earthnut",
        "beer nuts", "peanut flour", "peanut protein"
    ],
    "tree nuts": [
        "almond", "brazil nut", "cashew", "chestnut", "hazelnut", "macadamia",
        "pecan", "pine nut", "pistachio", "walnut", "nut butters", "nut meal",
        "nut paste"
    ],
    "fish": [
        "anchovy", "bass", "catfish", "cod", "flounder", "grouper", "haddock",
        "hake", "halibut", "herring", "mackerel", "mahi mahi", "perch", "pike",
        "pollock", "salmon", "sardine", "snapper", "sole", "swordfish", "tilapia",
        "trout", "tuna", "fish sauce", "fish oil", "fish gelatin"
    ],
    "shellfish": [
        "crab", "crayfish", "lobster", "prawns", "shrimp", "clams", "cockle",
        "cuttlefish", "limpet", "mussels", "octopus", "oysters", "scallops",
        "snails", "squid", "whelk", "periwinkle", "barnacle"
    ],
    "soy": [
        "soybean", "edamame", "miso", "natto", "shoyu", "soya", "soy sauce",
        "tamari", "textured vegetable protein (TVP)", "tofu", "yuba", "soy lecithin",
        "hydrolyzed soy protein"
    ],
    "wheat": [
    "wheat", "whole wheat", "wholemeal", "bread flour", "bulgur", "couscous",
    "cracker meal", "durum", "einkorn", "emmer", "farina", "farro", "graham flour",
    "kamut", "matzo", "matza", "matzah", "matzoh", "seitan", "semolina", "spelt",
    "triticale", "atta", "maida", "refined wheat flour", "whole grain flour"
    ],
    "gluten": [
        "wheat", "barley", "rye", "malt", "triticale", "spelt", "semolina",
        "einkorn", "emmer", "farro", "kamut", "gluten", "vital wheat gluten"
    ],
    "sesame": [
        "benne", "benne seed", "gingelly", "sesame flour", "sesame oil", "sesame paste",
        "sesame seed", "tahini", "til"
    ],
    "mustard": [
        "mustard seed", "mustard flour", "mustard oil", "mustard greens",
        "yellow mustard", "brown mustard", "black mustard"
    ],
    "celery": [
        "celery stalk", "celery seed", "celery root", "celeriac"
    ],
    "sulfites": [
        "sulfur dioxide", "potassium bisulfite", "potassium metabisulfite",
        "sodium bisulfite", "sodium metabisulfite", "sodium sulfite"
    ],
    "lupin": [
        "lupine", "lupin flour", "lupin seed", "lupinus"
    ],
    "mollusks": [
        "clam", "cockle", "cuttlefish", "limpet", "mussels", "octopus", "oyster",
        "periwinkle", "scallop", "snail", "squid", "whelk"
    ]
}

common_allergens = list(allergen_synonyms.keys())


In [100]:
def clean_ingredients(ingredients):
    ingredients = ingredients.lower()
    ingredients = re.sub(r'[^\w\s]', ' ', ingredients)
    ingredients = re.sub(r'\s+', ' ', ingredients).strip()
    return ingredients

def detect_allergens(ingredients_text):
    detected = {}
    ingredients_lower = ingredients_text.lower()
    for allergen, synonyms in allergen_synonyms.items():
        found = False
        if allergen in ingredients_lower:
            found = True
        else:
            for synonym in synonyms:
                if synonym in ingredients_lower:
                    found = True
                    break
        detected[allergen] = int(found)
    return pd.Series(detected)


In [108]:
# Clean Ingredients column
df2['Ingredients'] = df2['Ingredients'].apply(clean_ingredients)

# Apply allergen detection - FIXED!
df2[common_allergens] = df2['Ingredients'].apply(detect_allergens)

# Create a binary column for presence of any allergen
df2['Allergen_Present'] = df2[common_allergens].max(axis=1)


In [110]:
# TF-IDF Vectorizer for text data
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df2['Ingredients'])

# Targets for classification
y_binary = df2['Allergen_Present']
y_multilabel = df2[common_allergens]


In [112]:
X_train, X_test, y_train_binary, y_test_binary = train_test_split(X, y_binary, test_size=0.2, random_state=42)
_, _, y_train_multilabel, y_test_multilabel = train_test_split(X, y_multilabel, test_size=0.2, random_state=42)


In [114]:
# Binary Classification Model
rf_binary = RandomForestClassifier(random_state=42)
rf_binary.fit(X_train, y_train_binary)

# Multi-Label Classification Model
rf_multi_label = MultiOutputClassifier(RandomForestClassifier(random_state=42))
rf_multi_label.fit(X_train, y_train_multilabel)


In [116]:
# Binary Model Evaluation
y_pred_binary = rf_binary.predict(X_test)
binary_accuracy = accuracy_score(y_test_binary, y_pred_binary)
print(f"✅ Binary Classification Accuracy (Any Allergen): {binary_accuracy * 100:.2f}%")

# Multi-Label Model Evaluation
y_pred_multilabel = rf_multi_label.predict(X_test)
multilabel_accuracy = 1 - hamming_loss(y_test_multilabel, y_pred_multilabel)
print(f"✅ Multi-Label Classification Accuracy (Specific Allergens): {multilabel_accuracy * 100:.2f}%")


✅ Binary Classification Accuracy (Any Allergen): 91.67%
✅ Multi-Label Classification Accuracy (Specific Allergens): 94.56%


In [118]:
def improved_allergens_detection(ingredient_text):
    cleaned_ingredient = clean_ingredients(ingredient_text)

    # Rule-based detection
    detected_by_synonyms = detect_allergens(cleaned_ingredient)
    allergens_from_synonyms = {allergen for allergen, present in detected_by_synonyms.items() if present}

    # Model-based detection
    input_vectorized = vectorizer.transform([cleaned_ingredient])
    specific_allergens_pred = rf_multi_label.predict(input_vectorized)
    allergens_from_model = {allergen for allergen, present in zip(common_allergens, specific_allergens_pred[0]) if present}

    # Merge both detections
    combined_allergens = sorted(allergens_from_synonyms.union(allergens_from_model))

    if not combined_allergens:
        return "No allergens present."
    return f"Allergens present: {', '.join(combined_allergens)}"


In [120]:
test_sample = {
    "Ingredients": "OATS (16.9%), SUGAR, WHEAT FLAKES (14.3%), CORN GRITS (10.5%), DRIED PAPAYA, SOYA LECITHIN, ALMONDS"
}

print("\n🔍 Test Sample Prediction:")
print(improved_allergens_detection(test_sample['Ingredients']))



🔍 Test Sample Prediction:
Allergens present: gluten, soy, tree nuts, wheat


In [55]:
X_train, X_test, y_train_binary, y_test_binary = train_test_split(X, y_binary, test_size=0.2, random_state=42)
_, _, y_train_multilabel, y_test_multilabel = train_test_split(X, y_multilabel, test_size=0.2, random_state=42)

In [57]:
rf_binary = RandomForestClassifier(random_state=42)
rf_binary.fit(X_train, y_train_binary)

In [59]:
y_pred_binary = rf_binary.predict(X_test)
binary_accuracy = accuracy_score(y_test_binary, y_pred_binary)
print(f"Binary Classification Accuracy: {binary_accuracy * 100:.2f}%")

Binary Classification Accuracy: 94.12%


In [60]:
rf_multi_label = MultiOutputClassifier(RandomForestClassifier(random_state=42))
rf_multi_label.fit(X_train, y_train_multilabel)

In [61]:
y_pred_multilabel = rf_multi_label.predict(X_test)
multilabel_accuracy = 1 - hamming_loss(y_test_multilabel, y_pred_multilabel)
print(f"Multi-Label Classification Accuracy: {multilabel_accuracy * 100:.2f}%")

Multi-Label Classification Accuracy: 94.64%


In [63]:
def allergens_detection(input_data):
    # Vectorize the input ingredients
    input_ingredients = [input_data['Ingredients']]
    input_vectorized = vectorizer.transform(input_ingredients)

    # Predict if any allergen is present
    allergen_present = rf_binary.predict(input_vectorized)[0]

    if allergen_present == 0:
        return "No allergens present."

    # Predict specific allergens
    specific_allergens = rf_multi_label.predict(input_vectorized)
    detected_allergens = [
        allergen for allergen, present in zip(common_allergens, specific_allergens[0]) if present
    ]

    return f"Allergens present: {', '.join(detected_allergens)}" if detected_allergens else "No specific allergens identified."


In [64]:
import random

# Step 1: Pick a random index from test data
random_idx = random.randint(0, X_test.shape[0] - 1)

# Step 2: Get the vectorized data
sample_vectorized = X_test[random_idx]

# Step 3: Reverse map to get the raw ingredient text (trick: we use the indices)
# Get the corresponding ingredient text
# This works because train_test_split preserves order for y_train_multilabel
original_ingredients = df2.iloc[y_test_multilabel.index[random_idx]]['Ingredients']

# Step 4: Create a test input dictionary for the allergens_detection function
test_input = {"Ingredients": original_ingredients}

# Step 5: Run detection
predicted_result = allergens_detection(test_input)

# Step 6: Show the actual allergens from the test data
actual_allergens = [
    allergen for allergen, present in zip(common_allergens, y_test_multilabel.iloc[random_idx]) if present
]

# Step 7: Output comparison
print("📝 Ingredients:")
print(original_ingredients)
print("\n✅ Predicted Allergens:")
print(predicted_result)
print("\n🎯 Actual Allergens in Dataset:")
print(", ".join(actual_allergens) if actual_allergens else "No allergens listed")


📝 Ingredients:
No ingredients available

✅ Predicted Allergens:
No allergens present.

🎯 Actual Allergens in Dataset:
No allergens listed


In [125]:
import random

# Run predictions on multiple random samples (change number as needed)
for i in range(5):
    # Random index from the test set
    random_idx = random.randint(0, X_test.shape[0] - 1)
    
    # Get the original ingredients text from df2 using index from y_test_multilabel
    sample_index = y_test_multilabel.index[random_idx]
    original_ingredients = df2.loc[sample_index, 'Ingredients']
    
    # Create input for the detection function
    test_input = {"Ingredients": original_ingredients}
    
    # Predict allergens using the improved detection function
    predicted_result = improved_allergens_detection(test_input['Ingredients'])
    
    # Get actual allergens from the test labels
    actual_allergens = [
        allergen for allergen, present in zip(common_allergens, y_test_multilabel.iloc[random_idx]) if present
    ]

    # Display the results
    print(f"\n🔹 Test Sample {i + 1}")
    print("📝 Ingredients:", original_ingredients)
    print("✅ Predicted:", predicted_result)
    print("🎯 Actual:", ", ".join(actual_allergens) if actual_allergens else "No allergens listed")
    print("------------------------------------------------------------")



🔹 Test Sample 1
📝 Ingredients: milk solids sugar water liquid glucose fructose lodised salt humectant 422 emulsifier 471 stabilizers 410 412 407 color 133 natural nature identical flavoring substances alls
✅ Predicted: Allergens present: milk
🎯 Actual: milk
------------------------------------------------------------

🔹 Test Sample 2
📝 Ingredients: wheat semolina allergen advice contains wheat gluten
✅ Predicted: Allergens present: gluten, wheat
🎯 Actual: wheat, gluten
------------------------------------------------------------

🔹 Test Sample 3
📝 Ingredients: wheat solids 46 refined wheat flour maida 33 3 whole wheat flour atta 12 6 sugar corn flour oat flour rice flour cocoa powder 2 6 liquid glucose wheat bran palmolein oil malt extract iodised salt artificial flavouring sustances chocolate vanilla honey calcium carbonate soya lecithin ins 322 i minerals and vitamins natural colour ins 150d tocopherol ins 307b
✅ Predicted: Allergens present: gluten, soy, wheat
🎯 Actual: soy, wheat,

In [127]:
test_examples = [
    {
        "Ingredients": "Rolled Oats (75%), Maltodextrin, Spices and Condiments (5.8%) (Onion, Pepper, Garlic, Coriander, Cumin, Fenugreek, Ginger, Turmeric, Cinnamon, Clove), Salt, Sugar, Dried Vegetables (Carrot, French Beans, Tomato), Hydrolyzed Vegetable Protein, Wheat Powder, Flavour Enhancer, Antioxidant, Contains Added Flavour."
    },
    {
        "Ingredients": "OATS (16.9%), SUGAR, WHEAT FLAKES (14.3%), CORN GRITS (10.5%), DRIED PAPAYA (8.5%), BARLEY FLAKES (6.8%), RICE (6.3%), BLACK RAISINS (5%), SLICED ALMONDS (5%), SOYA LECITHIN (INS 322 (i))"
    },
    {
        "Ingredients": "Cocoa (Cocoa Nibs 51%, Cocoa Butter 20%), Date Powder (29%). Contains Milk Derivatives."
    },
    {
        "Ingredients": "Refined Wheat Flour (Maida), Sugar, Edible Vegetable Oil (Palm), Invert Sugar Syrup, Milk Solids, Cocoa Solids, Emulsifiers (Soy Lecithin), Raising Agents, Salt, Artificial Flavors."
    },
    {
        "Ingredients": "Water, Sugar, Dextrose, Orange Juice (10%), Trisodium Citrate, Potassium Chloride, Sodium Chloride, Acidity Regulator, Ascorbic Acid, Natural and Artificial Flavouring Substances, Colours, Preservative INS 211."
    },
    {
        "Ingredients": "Peanuts, Cashew Nuts, Almonds, Raisins, Salt, Edible Vegetable Oil, Spices (Red Chili Powder, Turmeric), Preservatives (INS 211)"
    },
    {
        "Ingredients": "Soy Protein Isolate, Maltodextrin, Cocoa Powder, Natural Flavours, Stevia, Whey Protein Concentrate, Emulsifier (Soy Lecithin)"
    },
    {
        "Ingredients": "Fresh Atlantic Salmon, Sea Salt, Natural Smoked Flavor"
    },
    {
        "Ingredients": "Organic Flax Seeds, Chia Seeds, Pumpkin Seeds, Sunflower Seeds"
    },
    {
        "Ingredients": "Prawns (Crustaceans), Garlic, Butter (Milk), Parsley, Lemon Juice"
    }
]


In [129]:
print("✅ Allergen Detection Test on Custom Examples")

for idx, sample in enumerate(test_examples):
    ingredients = sample["Ingredients"]
    predicted_allergens = improved_allergens_detection(ingredients)

    print(f"\n🔹 Test Example {idx + 1}")
    print(f"📝 Ingredients: {ingredients}")
    print(f"✅ Predicted: {predicted_allergens}")
    print("------------------------------------------------------------")


✅ Allergen Detection Test on Custom Examples

🔹 Test Example 1
📝 Ingredients: Rolled Oats (75%), Maltodextrin, Spices and Condiments (5.8%) (Onion, Pepper, Garlic, Coriander, Cumin, Fenugreek, Ginger, Turmeric, Cinnamon, Clove), Salt, Sugar, Dried Vegetables (Carrot, French Beans, Tomato), Hydrolyzed Vegetable Protein, Wheat Powder, Flavour Enhancer, Antioxidant, Contains Added Flavour.
✅ Predicted: Allergens present: gluten, wheat
------------------------------------------------------------

🔹 Test Example 2
📝 Ingredients: OATS (16.9%), SUGAR, WHEAT FLAKES (14.3%), CORN GRITS (10.5%), DRIED PAPAYA (8.5%), BARLEY FLAKES (6.8%), RICE (6.3%), BLACK RAISINS (5%), SLICED ALMONDS (5%), SOYA LECITHIN (INS 322 (i))
✅ Predicted: Allergens present: gluten, soy, tree nuts, wheat
------------------------------------------------------------

🔹 Test Example 3
📝 Ingredients: Cocoa (Cocoa Nibs 51%, Cocoa Butter 20%), Date Powder (29%). Contains Milk Derivatives.
✅ Predicted: Allergens present: milk
-

In [131]:
import joblib


In [133]:
import os

# Create a 'model' folder if it doesn't exist
if not os.path.exists("model"):
    os.makedirs("model")


In [135]:
# Save the Multi-Label Classification Model
joblib.dump(rf_multi_label, "model/allergen_model.pkl")

# Save the TF-IDF Vectorizer
joblib.dump(vectorizer, "model/allergen_vectorizer.pkl")

# (Optional) Save the Binary Classification Model if you want
joblib.dump(rf_binary, "model/binary_allergen_model.pkl")

print("✅ Models and Vectorizer saved successfully!")


✅ Models and Vectorizer saved successfully!
