In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import pandas as pd

drive_link = '/content/drive/My Drive/openfoodfacts.tsv'

# Specify tab as the separator
input_file = pd.read_csv(drive_link, sep='\t', low_memory=False)

print("File loaded.")
print(input_file.head())

File loaded.
            code                                                url  \
0  0000000003087  http://world-en.openfoodfacts.org/product/0000...   
1  0000000004530  http://world-en.openfoodfacts.org/product/0000...   
2  0000000004559  http://world-en.openfoodfacts.org/product/0000...   
3  0000000016087  http://world-en.openfoodfacts.org/product/0000...   
4  0000000016094  http://world-en.openfoodfacts.org/product/0000...   

                      creator   created_t      created_datetime  \
0  openfoodfacts-contributors  1474103866  2016-09-17T09:17:46Z   
1             usda-ndb-import  1489069957  2017-03-09T14:32:37Z   
2             usda-ndb-import  1489069957  2017-03-09T14:32:37Z   
3             usda-ndb-import  1489055731  2017-03-09T10:35:31Z   
4             usda-ndb-import  1489055653  2017-03-09T10:34:13Z   

  last_modified_t last_modified_datetime                    product_name  \
0      1474103893   2016-09-17T09:18:13Z              Farine de blé noir   
1    

In [13]:
import pandas as pd

drive_link = '/content/drive/My Drive/openfoodfacts.tsv'
output_file = 'output.csv'

df = pd.read_csv(drive_link, sep='\t', low_memory=False)
df.to_csv(output_file, index=False)

print(f"Conversion completed. Saved as: {output_file}")

Conversion completed. Saved as: output.csv


In [16]:
# =============================================================
# === FOOD SAFETY ENSEMBLE MODEL - UPDATED FOR OUTPUT.CSV ===
# =============================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

# =============================================================
# 1. Load Dataset
# =============================================================
print("Loading dataset...")
df = pd.read_csv("/content/output.csv", low_memory=False)
print("Total rows:", len(df))

# =============================================================
# 2. Basic Cleaning and Label Generation
# =============================================================

# Use nutrition-score-fr_100g as a health proxy
df = df[~df["nutrition-score-fr_100g"].isna()].copy()

# Convert to numeric (force errors to NaN)
df["nutrition-score-fr_100g"] = pd.to_numeric(df["nutrition-score-fr_100g"], errors="coerce")
df = df.dropna(subset=["nutrition-score-fr_100g"])

# Create categorical safety labels
# NutriScore ranges: -15 (best) to +40 (worst)
def label_from_nutriscore(score):
    if score <= 0:
        return "Very Healthy"
    elif score <= 5:
        return "Healthy"
    elif score <= 15:
        return "Less Healthy"
    else:
        return "Unhealthy"

df["label"] = df["nutrition-score-fr_100g"].apply(label_from_nutriscore)

# =============================================================
# 3. Feature Selection
# =============================================================

feature_cols = [
    "energy_100g", "fat_100g", "saturated-fat_100g",
    "carbohydrates_100g", "sugars_100g", "fiber_100g",
    "proteins_100g", "salt_100g", "sodium_100g"
]

# Ensure all features exist
for col in feature_cols:
    if col not in df.columns:
        df[col] = np.nan

df = df.dropna(subset=feature_cols)
df = df.fillna(0)

# Encode labels
label_map = {"Unhealthy": 0, "Less Healthy": 1, "Healthy": 2, "Very Healthy": 3}
df["label_encoded"] = df["label"].map(label_map)

# Shuffle for balanced learning
df = shuffle(df, random_state=42)

X = df[feature_cols]
y = df["label_encoded"]

# =============================================================
# 4. Split Data
# =============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining on {len(X_train)} samples...")
print("Label distribution:", np.bincount(y_train))

# =============================================================
# 5. Model 1 - LightGBM
# =============================================================
print("\nTraining LightGBM...")
lgb_model = lgb.LGBMClassifier(objective="multiclass", num_class=4, n_estimators=200)
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)
acc_lgb = accuracy_score(y_test, y_pred_lgb)
print("LightGBM Test Accuracy:", round(acc_lgb, 4))

# =============================================================
# 6. Model 2 - Hybrid Neural Network
# =============================================================
print("\nTraining Hybrid Neural Network...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

hybrid_nn = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(4, activation='softmax')
])

hybrid_nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
hybrid_nn.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=0)
acc_nn = hybrid_nn.evaluate(X_test_scaled, y_test, verbose=0)[1]
print("Hybrid NN Test Accuracy:", round(acc_nn, 4))

# =============================================================
# 7. Model 3 - Logistic Regression
# =============================================================
print("\nTraining Logistic Regression...")
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_scaled, y_train)
acc_lr = accuracy_score(y_test, log_reg.predict(X_test_scaled))
print("Logistic Regression Test Accuracy:", round(acc_lr, 4))

# =============================================================
# 8. Meta-Learner (Stacked Ensemble)
# =============================================================
print("\nTraining Meta-Learner (Stacking)...")

train_meta = np.column_stack([
    lgb_model.predict_proba(X_train),
    hybrid_nn.predict(X_train_scaled),
    log_reg.predict_proba(X_train_scaled)
])

test_meta = np.column_stack([
    lgb_model.predict_proba(X_test),
    hybrid_nn.predict(X_test_scaled),
    log_reg.predict_proba(X_test_scaled)
])

meta_model = lgb.LGBMClassifier(objective="multiclass", num_class=4)
meta_model.fit(train_meta, y_train)
acc_meta = accuracy_score(y_test, meta_model.predict(test_meta))
print("Meta-Learner (Stack) Test Accuracy:", round(acc_meta, 4))

# =============================================================
# 9. Final Blended Model Evaluation
# =============================================================
blend_probs = (
    0.4 * lgb_model.predict_proba(X_test)
    + 0.4 * hybrid_nn.predict(X_test_scaled)
    + 0.2 * log_reg.predict_proba(X_test_scaled)
)
blend_preds = np.argmax(blend_probs, axis=1)
acc_blend = accuracy_score(y_test, blend_preds)
print("Blended Model Test Accuracy:", round(acc_blend, 4))

# =============================================================
# 10. Print Summary
# =============================================================
print("\n==================================================")
print("TRAINING COMPLETE - Model Performance Summary")
print("==================================================")
print(f"LightGBM:              {acc_lgb:.4f}")
print(f"Hybrid NN:             {acc_nn:.4f}")
print(f"Logistic Regression:   {acc_lr:.4f}")
print(f"Meta-Learner (Stack):  {acc_meta:.4f}")
print(f"Blended Average:       {acc_blend:.4f}")
print("==================================================")

# =============================================================
# 11. Safety Prediction Function
# =============================================================
def predict_safety(product_name, ingredients_text, energy, fat, sat_fat, carbs, sugars, fiber, protein, salt, sodium):
    features = pd.DataFrame([{
        "energy_100g": energy,
        "fat_100g": fat,
        "saturated-fat_100g": sat_fat,
        "carbohydrates_100g": carbs,
        "sugars_100g": sugars,
        "fiber_100g": fiber,
        "proteins_100g": protein,
        "salt_100g": salt,
        "sodium_100g": sodium
    }])

    X_scaled = scaler.transform(features)

    probs = (
        0.4 * lgb_model.predict_proba(features)
        + 0.4 * hybrid_nn.predict(X_scaled)
        + 0.2 * log_reg.predict_proba(X_scaled)
    )

    pred_class = np.argmax(probs)
    pred_prob = np.max(probs)

    reverse_label_map = {v: k for k, v in label_map.items()}
    category = reverse_label_map[pred_class]

    # Convert to safety score (0–10)
    safety_score = probs[0][3] * 10  # Probability of 'Very Healthy'
    safety_score = round(float(np.clip(safety_score, 0, 10)), 2)

    print(f"\n=== EXAMPLE PREDICTION ===")
    print(f"Product: {product_name}")
    print(f"Predicted Category: {category}")
    print(f"Model Confidence: {pred_prob:.3f}")
    print(f"Safety Score: {safety_score}/10")

    return category, safety_score


Loading dataset...
Total rows: 356027


  df = df.fillna(0)



Training on 157728 samples...
Label distribution: [39917 52105 27868 37838]

Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2294
[LightGBM] [Info] Number of data points in the train set: 157728, number of used features: 9
[LightGBM] [Info] Start training from score -1.374070
[LightGBM] [Info] Start training from score -1.107611
[LightGBM] [Info] Start training from score -1.733393
[LightGBM] [Info] Start training from score -1.427558
LightGBM Test Accuracy: 0.9638

Training Hybrid Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Hybrid NN Test Accuracy: 0.8594

Training Logistic Regression...
Logistic Regression Test Accuracy: 0.7506

Training Meta-Learner (Stacking)...
[1m4929/4929[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step
[1m1233/1233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 157728, number of used features: 12
[LightGBM] [Info] Start training from score -1.374070
[LightGBM] [Info] Start training from score -1.107611
[LightGBM] [Info] Start training from score -1.733393
[LightGBM] [Info] Start training from score -1.427558




Meta-Learner (Stack) Test Accuracy: 0.9658
[1m1233/1233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Blended Model Test Accuracy: 0.936

TRAINING COMPLETE - Model Performance Summary
LightGBM:              0.9638
Hybrid NN:             0.8594
Logistic Regression:   0.7506
Meta-Learner (Stack):  0.9658
Blended Average:       0.9360


In [17]:
category, score = predict_safety(
    product_name="Organic Whole Wheat Bread",
    ingredients_text="whole wheat flour, water, yeast, salt, honey",
    energy=240, fat=3, sat_fat=0.5, carbs=45, sugars=4, fiber=5, protein=9, salt=1, sodium=0.4
)

category, score = predict_safety(
    product_name="Steamed Quinoa with Vegetables",
    ingredients_text="quinoa, spinach, broccoli, carrot, olive oil, lemon juice",
    energy=120, fat=2, sat_fat=0.3, carbs=21, sugars=1.5, fiber=4.5, protein=6, salt=0.3, sodium=0.12
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step

=== EXAMPLE PREDICTION ===
Product: Organic Whole Wheat Bread
Predicted Category: Very Healthy
Model Confidence: 0.913
Safety Score: 9.13/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step

=== EXAMPLE PREDICTION ===
Product: Steamed Quinoa with Vegetables
Predicted Category: Very Healthy
Model Confidence: 0.970
Safety Score: 9.7/10


In [18]:
category, score = predict_safety(
    product_name="Organic Whole Wheat Bread",
    ingredients_text="whole wheat flour, water, yeast, salt, honey",
    energy=240, fat=3, sat_fat=0.5, carbs=45, sugars=4, fiber=5, protein=9, salt=1, sodium=0.4
)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step

=== EXAMPLE PREDICTION ===
Product: Organic Whole Wheat Bread
Predicted Category: Very Healthy
Model Confidence: 0.913
Safety Score: 9.13/10


In [19]:
category, score = predict_safety(
    product_name="Steamed Quinoa with Vegetables",
    ingredients_text="quinoa, spinach, broccoli, carrot, olive oil, lemon juice",
    energy=120, fat=2, sat_fat=0.3, carbs=21, sugars=1.5, fiber=4.5, protein=6, salt=0.3, sodium=0.12
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step

=== EXAMPLE PREDICTION ===
Product: Steamed Quinoa with Vegetables
Predicted Category: Very Healthy
Model Confidence: 0.970
Safety Score: 9.7/10
