In [8]:
! pip install spacy

Collecting spacy
  Using cached spacy-3.8.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.12-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.11-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)

In [14]:
from flask import Flask, render_template, request
import pandas as pd
import numpy as np
import spacy
# Load the "en_core_web_sm" English pipeline. The model has to be installed in
# addition to spaCy itself; when it is missing, spacy.load raises
# OSError [E050] ("Can't find model 'en_core_web_sm'").
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [12]:
# spaCy English pipeline that parse_food_name runs food names through
nlp = spacy.load("en_core_web_sm")

# Read the dataset; the first CSV column becomes the row index
path = 'clean_data_with_same_units.csv'
df = pd.read_csv(filepath_or_buffer=path, index_col=0)

# Independent snapshot of the table as loaded, taken before any columns are
# added to df; the numeric nutrition data is read from this copy later on
df_original = df.copy(deep=True)

# Function to process food names using NLP
def parse_food_name(food_name):
    """Extract the component lemmas of a food name using the spaCy pipeline.

    The name is lower-cased, run through the module-level ``nlp`` pipeline,
    and every token that is not a stop word is kept as its lemma.

    Args:
        food_name: Food name text. Non-string values (e.g. ``None`` or the
            float ``NaN`` that stands in for a missing cell) and blank
            strings are treated as "no name".

    Returns:
        list[str]: Lemmas of the non-stop-word tokens, in document order;
        an empty list when there is no usable name.
    """
    # A missing value is not a str and has no .lower(); returning no components
    # keeps DataFrame.apply from crashing on incomplete rows.
    if not isinstance(food_name, str) or not food_name.strip():
        return []

    doc = nlp(food_name.lower())
    return [token.lemma_ for token in doc if not token.is_stop]  # Lemmatization & stop-word removal

# Apply NLP processing
df["parsed_food"] = df["food"].apply(parse_food_name)

# Extract unique components. A set comprehension avoids the quadratic
# sum(list_of_lists, []) concatenation, and sorting fixes the column order so
# every derived matrix has the same layout on every run (iteration order of a
# set of strings is not stable across interpreter sessions).
all_components = sorted(
    {component for components in df["parsed_food"] for component in components}
)

# Create the binary matrix (food × components), filled by POSITION.
# df.index comes from the CSV's first column, so it is not guaranteed to be
# 0..n-1; pairing an enumerate() counter with label-based .at would then write
# to the wrong rows or silently append new ones.
column_position = {component: j for j, component in enumerate(all_components)}
indicator = np.zeros((len(df), len(all_components)), dtype=np.int64)
for row_position, components in enumerate(df["parsed_food"]):
    for component in set(components):  # presence only, not occurrence count
        indicator[row_position, column_position[component]] = 1
component_df = pd.DataFrame(indicator, index=df.index, columns=all_components)

# Verify numeric columns
all_nutrition_columns = df_original.select_dtypes(include=[np.number]).columns.tolist()
# Exclude target variable (raises ValueError if the column is absent)
all_nutrition_columns.remove("Nutrition Density")

# Convert each numeric column in the original dataset; unparseable or missing
# values become 0 so the least-squares solve receives a NaN-free matrix
for col in all_nutrition_columns:
    df_original[col] = pd.to_numeric(df_original[col], errors='coerce').fillna(0)

# Solve least squares problem: find W minimising ||X @ W - Y||
X = component_df.values  # One-hot matrix
Y = df_original[all_nutrition_columns].values.astype(np.float64)  # Nutrient matrix

W, residuals, rank, s = np.linalg.lstsq(X, Y, rcond=None)

# Convert to DataFrame: one row per component, one column per nutrient
component_nutrition_all = pd.DataFrame(W, index=component_df.columns, columns=all_nutrition_columns)

# Create prediction matrices
component_nutrition_prediction = component_nutrition_all.copy()  # Keep negatives for model
component_nutrition_display = component_nutrition_all.clip(lower=0)  # Clip negatives for display

# Initialize Flask app
# Module-level application object: the "/" route is registered on it and it
# is started by the __main__ guard at the end of the cell.
app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def index():
    """Render the landing page and, on POST, a prediction message.

    GET renders ``index.html`` with ``predicted_density=None``.
    POST reads the ``food_name`` form field, parses it with
    ``parse_food_name`` and keeps the components that have a row in
    ``component_nutrition_prediction``. If none are known, a "not available"
    message is shown; otherwise the per-nutrient contributions of the known
    components are summed and the mean of that sum is reported.

    Returns:
        The rendered ``index.html`` template.
    """
    predicted_density = None

    if request.method == "POST":
        food_name = request.form["food_name"]
        components = parse_food_name(food_name)

        # Check for known components. dict.fromkeys drops repeats while keeping
        # first-seen order: the fitted matrix encodes each component as
        # present/absent per food, so a lemma typed twice must count once.
        known_components = list(dict.fromkeys(
            c for c in components if c in component_nutrition_prediction.index
        ))

        if not known_components:
            predicted_density = f"⚠️ '{food_name}' is not available in our database yet. Please wait for future updates."
        else:
            # Sum nutrition contributions
            summed_nutrition = component_nutrition_prediction.loc[known_components].sum()

            # Predict Nutrition Density (Mocking it here, replace with your ML model)
            predicted_density = f"Predicted Nutrition Density for '{food_name}': {summed_nutrition.mean():.4f}"

    return render_template("index.html", predicted_density=predicted_density)

if __name__ == "__main__":
    # Debug mode normally starts the auto-reloader, which re-executes the
    # process; disable it so the server can start from inside a notebook kernel.
    # NOTE: debug=True also enables the interactive debugger; turn it off
    # before exposing this server beyond local development.
    app.run(debug=True, use_reloader=False)


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.