In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [4]:
# Read the raw recipe table from the CSV file in the working directory
recipes = pd.read_csv(
    "recipes.csv",
)

In [5]:
# Quick sanity check of what was loaded: (rows, columns) and the column labels
print(f"Dataset shape: {recipes.shape}")
print(f"Columns: {recipes.columns}")

Dataset shape: (231637, 12)
Columns: Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')


In [6]:
# Step 2: Data cleaning and preprocessing
# Keep only the columns used downstream: 'minutes' and 'ingredients' are turned
# into model features, while 'name' and 'n_steps' are carried along into the
# processed CSV that is written at the end of the notebook.
# .copy() makes the subset an independent frame, so the later in-place cleanup
# and the new-column assignment (recipes['scaled_minutes'] = ...) are never
# treated by pandas as writes into a slice of the original table
# (which is what triggers SettingWithCopyWarning).
recipes = recipes[['name', 'minutes', 'ingredients', 'n_steps']].copy()

In [7]:
# Discard every row that has a missing value in any remaining column.
# Rebinding the name (instead of inplace=True) keeps the cell's data flow explicit.
recipes = recipes.dropna()

In [8]:
# Renumber rows 0..n-1 after the drop; drop=True discards the old index
# instead of keeping it as a column.
recipes = recipes.reset_index(drop=True)

In [9]:
# Step 3: Standardise the numeric feature ('minutes')
# The scaler is fitted on the 'minutes' column (passed as a one-column frame,
# since scikit-learn expects 2-D input) and kept so it can be saved later.
# NOTE(review): StandardScaler uses the column mean and standard deviation, so
# extreme 'minutes' values would dominate the scaling -- confirm the data has none.
scaler = StandardScaler().fit(recipes[['minutes']])
recipes['scaled_minutes'] = scaler.transform(recipes[['minutes']])

In [10]:
# Step 4: Vectorise the text feature ('ingredients')
# TF-IDF over the ingredient text, ignoring English stop words and capping the
# vocabulary at 5000 terms. The result is a sparse matrix, one row per recipe.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
ingredients_tfidf = tfidf_vectorizer.fit_transform(recipes['ingredients'])

In [11]:
# Step 5: Combine features
# Build one dense matrix whose first column is the scaled 'minutes' value and
# whose remaining columns are the TF-IDF weights for 'ingredients'.
def combine_features(minutes_column, tfidf_matrix, chunk_rows=10_000):
    """Return a dense array laid out as [minutes | tfidf columns].

    The result equals
    ``np.hstack([minutes_column.reshape(-1, 1), tfidf_matrix.toarray()])``
    but is assembled without ever holding two full-size dense matrices:
    the output is allocated once and the sparse matrix is densified
    ``chunk_rows`` rows at a time directly into it.

    Parameters
    ----------
    minutes_column : 1-D numpy array with one value per row.
    tfidf_matrix : scipy sparse matrix with the same number of rows.
    chunk_rows : number of sparse rows densified per step (memory/speed knob).

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_terms + 1).
    """
    # CSR supports cheap row slicing; tocsr() is a no-op if it already is CSR.
    tfidf_matrix = tfidf_matrix.tocsr()
    n_rows, n_terms = tfidf_matrix.shape
    # Same dtype promotion np.hstack would apply to the two inputs.
    out_dtype = np.result_type(minutes_column.dtype, tfidf_matrix.dtype)
    combined = np.empty((n_rows, n_terms + 1), dtype=out_dtype)
    combined[:, 0] = minutes_column
    for start in range(0, n_rows, chunk_rows):
        stop = min(start + chunk_rows, n_rows)
        combined[start:stop, 1:] = tfidf_matrix[start:stop].toarray()
    return combined


# NOTE(review): the dense result is still rows x (terms + 1) values; if every
# consumer of the saved matrix accepts scipy sparse input, keeping it sparse
# would be far smaller -- confirm before changing the saved format.
combined_features = combine_features(
    recipes['scaled_minutes'].to_numpy(), ingredients_tfidf
)

In [12]:
# Step 6: Persist the fitted transformers, the feature matrix and the cleaned table
# Each entry maps an output file name to the object pickled into it.
# (Pickle files should only ever be loaded back from a trusted location.)
artifacts = {
    "recipemodel_tfidf.pkl": tfidf_vectorizer,
    "recipemodel_scaler.pkl": scaler,
    "recipemodel_combined_features.pkl": combined_features,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

# Write the cleaned recipe table (including 'scaled_minutes') without the index column
recipes.to_csv("processed_recipes.csv", index=False)

print("Preprocessing complete. Models and data saved.")

Preprocessing complete. Models and data saved.
