In [98]:
import pandas as pd
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import csr_matrix
import numpy as np
import joblib 
from sklearn.preprocessing import LabelEncoder

In [16]:
api=KaggleApi()
api.authenticate()

api.dataset_download_file(
    'paultimothymooney/recipenlg',
    file_name='RecipeNLG_dataset.csv',
    path='../data'
)

original_path = '../data/RecipeNLG_dataset.csv'
zip_path = '../data/RecipeNLG_dataset.zip'
os.rename(original_path, zip_path)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('../data')
os.remove(zip_path)

input_path='../data/RecipeNLG_dataset.csv'
output_path='../data/train-data.csv'

Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/recipenlg


In [103]:
print(f'Reading dataset from {input_path}')
data = pd.read_csv(input_path)
print(f'Dataset imported. Shape: {data.shape}')

Reading dataset from ../data/RecipeNLG_dataset.csv
Dataset imported. Shape: (2231142, 7)


In [104]:
print(f'Cleaning dataset...')
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)
columns_to_drop = ["Unnamed: 0", "ingredients", "directions", "link", "source"]
data.drop(columns=columns_to_drop, inplace=True)
data.rename(columns={"NER": "ingredients"}, inplace=True)
print(f'Shape: {data.shape}')

Cleaning dataset...
Shape: (2231141, 2)


In [105]:
data = data.head(10000)
print(data)

                         title  \
0          No-Bake Nut Cookies   
1        Jewell Ball'S Chicken   
2                  Creamy Corn   
3                Chicken Funny   
4         Reeses Cups(Candy)     
...                        ...   
9995          Pink Fruit Salad   
9996            Peppered Steak   
9997         Chicken Casserole   
9998  Sweet Potatoes Casserole   
9999             7 Layer Salad   

                                            ingredients  
0     ["brown sugar", "milk", "vanilla", "nuts", "bu...  
1     ["beef", "chicken breasts", "cream of mushroom...  
2     ["frozen corn", "cream cheese", "butter", "gar...  
3     ["chicken", "chicken gravy", "cream of mushroo...  
4     ["peanut butter", "graham cracker crumbs", "bu...  
...                                                 ...  
9995  ["cherry pie filling", "condensed milk", "pine...  
9996  ["bell pepper", "onion", "tomatoes", "salt", "...  
9997  ["fryer", "onion", "green pepper", "celery", "...  
9998  ["yam

In [None]:
print('Modifing dataset...')
data['ingredients'] = data['ingredients'].apply(ast.literal_eval)

mlb = MultiLabelBinarizer(sparse_output=True)
ingredient_matrix = mlb.fit_transform(data['ingredients'])

ingredient_df = pd.DataFrame.sparse.from_spmatrix(
    ingredient_matrix, 
    index=data.index, 
    columns=mlb.classes_
)

data = pd.concat([data, ingredient_df], axis=1)
data.drop(columns=['ingredients'], inplace=True)
print(f'Final shape: {data.shape}')

print('Exporting new dataset...')
data.to_csv(output_path)
print(f'New dataset exported to {output_path}')


top_recipes = data['title'].value_counts().nlargest(1000).index
data = data[data['title'].isin(top_recipes)]



Modifing dataset...
Final shape: (10000, 4182)
Exporting new dataset...
New dataset exported to ../data/train-data.csv


ValueError: Found input variables with inconsistent numbers of samples: [10000, 3801]

In [110]:
mlb = MultiLabelBinarizer()

X_sparse = ingredient_df.sparse.to_coo()
print(X_sparse.shape)
le = LabelEncoder()
y_sparse = data['title']
print(len(data['title']))
print(y_sparse.shape)
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y_sparse, test_size=0.2, random_state=42)


model = OneVsRestClassifier(
    LogisticRegression(
        solver='saga',  # good for large sparse data
        max_iter=1000,
        class_weight='balanced',  # optional, if classes are unbalanced
        n_jobs=-1
    )
)

(10000, 4181)
3801
(3801,)


ValueError: Found input variables with inconsistent numbers of samples: [10000, 3801]

In [None]:
print("Training model...")
model.fit(X_train, y_train)
print("Done.")

# Predict probabilities (sigmoid output)
y_pred_proba = model.predict_proba(X_test)

# Example: get top-10 recommended recipes for each test sample
top_k = 10
top_10_preds = np.argsort(-y_pred_proba, axis=1)[:, :top_k]  # sort and take top-k

# Decode recipe names
for i, row in enumerate(top_10_preds[:5]):
    print(f"Sample {i+1} top predictions: {[mlb.classes_[j] for j in row]}")

# Optional: Save model and MultiLabelBinarizer
joblib.dump(model, 'recipe_recommender_model.joblib')
joblib.dump(mlb, 'title_mlb.joblib')

Training model...




Done.
Sample 1 top predictions: ['Blueberry Surprise', 'Chicken Divan', 'Quick Peppermint Puffs', "Dave'S Corn Casserole", 'Brickle Bars', 'Fast Real Good Fudge', 'Chicken Casserole', 'Vegetable-Burger Soup', 'Monkey Bread', 'Angel Biscuits']
Sample 2 top predictions: ['Sweet-N-Sour Chicken', 'Crazy Peanut Butter Cookies', 'Beer Bread', 'Broccoli Salad', 'Strawberry Pie', "Moist Devil'S Food Cake", 'Vegetable-Burger Soup', 'Mexican Cookie Rings', 'Fresh Strawberry Pie', "Nolan'S Pepper Steak"]
Sample 3 top predictions: ['Eggless Milkless Applesauce Cake', "Grandma Hanrath'S Banana Breadfort Collins, Colorado  ", 'Apple Crisp', 'Mexican Cookie Rings', 'Sweet-N-Sour Chicken', "Moist Devil'S Food Cake", 'Beer Bread', 'Mulled Cider', 'Quick Coffee Cake(6 Servings)  ', 'Monkey Bread']
Sample 4 top predictions: ['Beer Bread', 'Apple Crisp', 'Mexican Cookie Rings', "Grandma Hanrath'S Banana Breadfort Collins, Colorado  ", 'Pound Cake', "Moist Devil'S Food Cake", 'Gooey Coffee Cake', 'Crazy Pe



['title_mlb.joblib']