In [2]:
# 1. Install Dependencies

!pip install pandas scikit-learn joblib nltk



In [3]:
# 2. Imports & Setup

import pandas as pd
import numpy as np
import joblib
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download NLTK resources
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# Setup Lemmatizer and Stop Words
lemmatizer = WordNetLemmatizer()

# Create a set of standard English stop words
english_stop_words = set(stopwords.words('english'))

# Add common recipe/ingredient modifiers/units that don't help classification
custom_recipe_stop_words = {
    'fresh', 'dry', 'frozen', 'cup', 'ounce', 'pound', 'tablespoon', 'teaspoon',
    'clove', 'package', 'can', 'jar', 'diced', 'chopped', 'minced', 'sliced',
    'pinch', 'large', 'small', 'medium', 'container', 'dash', 'serving', 'to',
    'oz', 'tbsp', 'tsp', 'g', 'kg', 'ml', 'l', 'of', 'and', 'with'
}

# Combine stop words for the final list
ALL_STOP_WORDS = english_stop_words.union(custom_recipe_stop_words)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# 3. Load Dataset

# NOTE: Ensure 'train.json' is uploaded to the Colab environment or mount via Kaggle/Drive.
DATA_PATH = "train.json"
df = pd.read_json(DATA_PATH)
print(f"Number of recipes loaded: {len(df)}")
df.head()

Number of recipes loaded: 39774


Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [7]:
# 4. Preprocess Ingredients (Cleaning, Lemmatization, Stop Word Removal)

def clean_ingredient(ing: str) -> str:
    """Normalizes, cleans, and removes units/modifiers from a single ingredient string."""
    ing = ing.lower()
    # Remove non-alphabet chars (like numbers and punctuation)
    ing = re.sub(r'[^a-z\s]', ' ', ing)
    # Collapse multiple spaces to a single space and strip whitespace
    ing = re.sub(r'\s+', ' ', ing).strip()

    # Tokenize, Lemmatize, and remove Stop Words
    words = [
        lemmatizer.lemmatize(w)
        for w in ing.split()
        if w not in ALL_STOP_WORDS
    ]
    return " ".join(words)

def preprocess_ingredients(ings_list: list) -> list:
    """Applies clean_ingredient to every ingredient in a list."""
    return [clean_ingredient(ing) for ing in ings_list]

# Apply cleaning to the list of ingredients
df["clean_ings"] = df["ingredients"].apply(preprocess_ingredients)

# Combine the cleaned ingredient lists into a single string for TF-IDF
df["clean_ings_str"] = df["clean_ings"].apply(lambda ings: " ".join(ings))

# Inspect a sample
print("\n--- Sample Preprocessing ---")
print(f"Original: {df.loc[0, 'ingredients']}")
print(f"Cleaned String: {df.loc[0, 'clean_ings_str']}")



--- Sample Preprocessing ---
Original: ['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper', 'purple onion', 'seasoning', 'garbanzo beans', 'feta cheese crumbles']
Cleaned String: romaine lettuce black olive grape tomato garlic pepper purple onion seasoning garbanzo bean feta cheese crumbles


In [8]:
# 5. Vectorize Recipes using TFâ€‘IDF

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the cleaned ingredient strings
tfidf_matrix = vectorizer.fit_transform(df["clean_ings_str"])

print(f"\nTF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of unique features (ingredients): {len(vectorizer.get_feature_names_out())}")



TF-IDF matrix shape: (39774, 2733)
Number of unique features (ingredients): 2733


In [9]:
# 6. Save Model Artifacts & Cleaned Data for Backend

# 1. Save the cleaned dataframe (for fast loading of metadata in backend)
df_metadata = df[["id", "cuisine", "ingredients", "clean_ings_str"]]
df_metadata.to_csv("cleaned_recipes.csv", index=False)
print("Saved cleaned_recipes.csv")

# 2. Save the trained vectorizer and recipe vectors (ML artifacts)
# These files should ideally be moved to the 'backend/model/' directory for deployment
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(tfidf_matrix, "recipe_vectors.pkl")

# 3. Save the recipe metadata (optional but often useful)
metadata_cols = ["id", "cuisine", "ingredients"]
joblib.dump(df[metadata_cols].to_dict(orient="records"), "recipes_metadata.pkl")

print("ML artifacts (vectorizer, vectors, metadata) saved!")


Saved cleaned_recipes.csv
ML artifacts (vectorizer, vectors, metadata) saved!


In [10]:
# 7. Test Recommendation Function (for validation)

# NOTE: This function uses the objects (df, vectorizer, tfidf_matrix) loaded in this notebook.

def recommend_recipes(user_ingredients: str, top_n=5):
    """Generates recommendations based on cosine similarity to user input ingredients."""
    # Clean & preprocess user input similar to training
    user_ings = [i.strip() for i in user_ingredients.split(",") if i.strip()]
    user_clean = [clean_ingredient(ing) for ing in user_ings]
    user_text = " ".join(user_clean)

    # Transform user input using the trained vectorizer
    user_vec = vectorizer.transform([user_text])

    # Calculate Cosine Similarity between user vector and all recipe vectors
    #
    cos_sim = cosine_similarity(user_vec, tfidf_matrix).flatten()

    # Get the indices of the top N highest similarity scores
    top_indices = np.argsort(cos_sim)[::-1][:top_n]

    recs = []
    for idx in top_indices:
        rec = {
            "id": df.iloc[idx]["id"],
            "cuisine": df.iloc[idx]["cuisine"],
            "ingredients": df.iloc[idx]["ingredients"],
            "score": cos_sim[idx]
        }
        recs.append(rec)
    return recs

In [11]:
# 8. Run Test Recommendation

user_input = "chicken, garlic, onion, rice, soy sauce"
recs = recommend_recipes(user_input, top_n=5)

print(f"--- Top 5 Recommendations for: {user_input} ---")
for i, r in enumerate(recs, 1):
    print(f"\n{i}. Recipe ID: {r['id']} ({r['cuisine']})")
    print(f"   Match Score: {r['score']:.4f}")
    print(f"   Ingredients: {r['ingredients']}")

--- Top 5 Recommendations for: chicken, garlic, onion, rice, soy sauce ---

1. Recipe ID: 21333 (japanese)
   Match Score: 0.6473
   Ingredients: ['brown rice', 'white rice', 'soy sauce']

2. Recipe ID: 23111 (vietnamese)
   Match Score: 0.5916
   Ingredients: ['soy sauce', 'rice', 'onions', 'salt', 'garlic cloves', 'vinegar', 'oil', 'eggs', 'chili sauce', 'celery']

3. Recipe ID: 33669 (chinese)
   Match Score: 0.5821
   Ingredients: ['spring onions', 'rice', 'brown sugar', 'sesame oil', 'chicken stock', 'chicken breasts', 'soy sauce', 'garlic']

4. Recipe ID: 10348 (chinese)
   Match Score: 0.5818
   Ingredients: ['chili oil', 'rice vinegar', 'soy sauce']

5. Recipe ID: 38969 (japanese)
   Match Score: 0.5818
   Ingredients: ['rice vinegar', 'soy sauce', 'chili oil']
