In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the first recipe dataset (recipes.csv) from the Drive dataset folder.
df1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/recipes.csv')

In [None]:
# First attempt at loading RAW_recipes.csv. quoting=3 is csv.QUOTE_NONE, so
# double quotes are not treated as field delimiters; rows pandas cannot parse
# are skipped (on_bad_lines='skip'). df2 is re-read in later cells.
df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/RAW_recipes.csv', quoting=3, escapechar='\\', on_bad_lines='skip')

In [None]:
# Load recipes_data.csv, passing the three list-like text columns through
# `str` so they arrive as plain strings for the literal parsing done later.
df3 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/dataset/recipes_data.csv',
    converters=dict.fromkeys(('ingredients', 'directions', 'NER'), str),
)

In [None]:
df1.info()

In [None]:
df2.info()

In [None]:
df3.info()

In [None]:
df1.head()

In [None]:
import re

# Drop the unnamed index column. errors='ignore' (instead of an in-place drop)
# keeps this cell re-runnable once the column is already gone.
df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')

# Fill missing time-related fields
for time_col in ['prep_time', 'cook_time', 'total_time']:
    df1[time_col] = df1[time_col].fillna("Unknown")

# Standardize 'yield'
df1['yield'] = df1['yield'].fillna("Varies")

# Replace NaNs in ingredients and directions with empty strings to avoid breaking parsing
df1['ingredients'] = df1['ingredients'].fillna("")
df1['directions'] = df1['directions'].fillna("")

# Parse ingredients as a list (split by comma). Cells that are no longer
# strings (already-parsed lists when the cell is re-run) pass through unchanged.
# NOTE(review): a comma inside one ingredient also splits it - confirm this is acceptable.
df1['ingredients'] = df1['ingredients'].apply(
    lambda x: [i.strip() for i in x.split(',') if i.strip()] if isinstance(x, str) else x
)

# Split directions into steps on '.', '?' or '!' followed by whitespace.
# The same string-only guard keeps a second run from failing on lists.
df1['directions'] = df1['directions'].apply(
    lambda x: [step.strip() for step in re.split(r'[.?!]\s+', x) if step.strip()]
    if isinstance(x, str) else x
)

# Clean nutrition (keep as-is or parse specific macros later)
df1['nutrition'] = df1['nutrition'].fillna("Not Available")

# Fill remaining columns with defaults if needed
df1['img_src'] = df1['img_src'].fillna("https://via.placeholder.com/150")
df1['cuisine_path'] = df1['cuisine_path'].fillna("Uncategorized")
df1['timing'] = df1['timing'].fillna("Not specified")

# Ensure rating is float and servings is int, using 0 where missing
df1['rating'] = df1['rating'].fillna(0.0).astype(float)
df1['servings'] = df1['servings'].fillna(0).astype(int)

In [None]:
def format_recipe_row(row):
    """Build the export record for one row of the first dataset.

    Args:
        row: Mapping-like row (pandas Series or dict) indexable by column name.

    Returns:
        dict: the row's values re-keyed to the export field names, in the
        order listed below.

    Raises:
        KeyError: if the row lacks one of the source columns.
    """
    # (export key, source column) pairs; order defines the output key order.
    field_sources = (
        ("name", "recipe_name"),
        ("ingredients", "ingredients"),
        ("steps", "directions"),
        ("prep_time", "prep_time"),
        ("cook_time", "cook_time"),
        ("total_time", "total_time"),
        ("servings", "servings"),
        ("yield", "yield"),
        ("rating", "rating"),
        ("url", "url"),
        ("img_src", "img_src"),
        ("cuisine", "cuisine_path"),
        ("nutrition", "nutrition"),
    )
    return {export_key: row[source_col] for export_key, source_col in field_sources}

# Convert every df1 row to an export record and collect them in a plain list.
recipes = df1.apply(format_recipe_row, axis=1).tolist()

In [None]:
import json

# Write the cleaned records into the Drive dataset folder: the later loading
# cell reads nomnom_clean_recipes.json from there, and the other two exports
# in this notebook are written to the same folder.
with open("/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_clean_recipes.json", "w") as f:
    json.dump(recipes, f, indent=2)

In [None]:
df3.head()

In [None]:
df3.head(1)

In [None]:
print(df3.shape)
print(df3.columns)

In [None]:
# Snapshot of df3 taken before the parsing and fillna cells below modify df3.
df3_cp = df3.copy()

In [None]:
import ast
# Define safe eval
def safe_eval(val):
    """Parse a Python-literal string such as "['a', 'b']" into its value.

    Args:
        val: Text to parse with ``ast.literal_eval``.

    Returns:
        The parsed literal, or an empty list when ``val`` cannot be parsed.
    """
    try:
        return ast.literal_eval(val)
    # literal_eval documents all of these for malformed input; TypeError
    # occurs e.g. for an unhashable dict key such as "{[1]: 2}".
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

# Apply to necessary columns
# Turn the stringified list columns of df3 into real Python objects.
for col in ('ingredients', 'directions', 'NER'):
    parsed_column = df3[col].apply(safe_eval)
    df3[col] = parsed_column


In [None]:
# Placeholder used for each df3 text column when the value is missing.
for column_name, placeholder in (
    ('title', "Unnamed Recipe"),
    ('link', "#"),
    ('source', "Unknown"),
    ('site', "Unknown"),
):
    df3[column_name] = df3[column_name].fillna(placeholder)

In [None]:
df3_cp.head()

In [None]:
def format_df3_recipe(row):
    """Build the export record for one row of the third dataset.

    Args:
        row: Mapping-like row (pandas Series or dict) indexable by column name.

    Returns:
        dict: the row's values re-keyed to the export field names.

    Raises:
        KeyError: if the row lacks one of the source columns.
    """
    # (export key, source column) pairs; order defines the output key order.
    field_sources = (
        ("name", 'title'),
        ("ingredients", 'ingredients'),
        ("steps", 'directions'),
        ("source", 'source'),
        ("link", 'link'),
        ("entities", 'NER'),
        ("site", 'site'),
    )
    return {export_key: row[source_col] for export_key, source_col in field_sources}

# Export from df3, which holds the parsed list columns and filled defaults.
# df3_cp is the snapshot taken before those cleaning cells ran, so exporting
# it would write the raw string columns and unfilled missing values.
recipes_df3 = df3.apply(format_df3_recipe, axis=1).tolist()

In [None]:
import json

# Write the df3 export records to the Drive dataset folder as indented JSON.
with open("/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_df3_cleaned.json", "w") as f:
    json.dump(recipes_df3, f, indent=2)

In [None]:
df2.head()

In [None]:
# Keep only rows that have at least 6 non-missing values.
df2 = df2.dropna(thresh=6)

In [None]:
df2.info()

In [None]:
# Second attempt at loading RAW_recipes.csv, replacing the earlier df2.
# quoting=3 is csv.QUOTE_NONE; dtype=str reads every column as text;
# rows the parser rejects are skipped (on_bad_lines='skip').
df2 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/dataset/RAW_recipes.csv', header=0,
    sep=',',
    quoting=3,
    escapechar='\\',
    on_bad_lines='skip',
    engine='python',dtype=str
)

In [None]:
df2.head(2)

In [None]:
# Replace the current index with a fresh 0..n-1 range, discarding the old one.
df2 = df2.reset_index(drop=True)

In [None]:
print(df2.shape)
print(df2.columns)

In [None]:
# Keep only rows of the reloaded df2 that have at least 6 non-missing values.
df2 = df2.dropna(thresh=6)

In [None]:
df2.info()

In [None]:
#if isinstance(df2.index, pd.MultiIndex):
   # df2.reset_index(inplace=True)

In [None]:
#print(df2.shape)
#print(df2.columns)
#print(type(df2.index))

In [None]:
import re

input_path = '/content/drive/MyDrive/Colab Notebooks/dataset/RAW_recipes.csv'
output_path = '/content/drive/MyDrive/Colab Notebooks/dataset/RAW_recipes_cleaned.csv'

expected_columns = 12  # Number of CSV fields/columns
cleaned_lines = []
buffer = ""

# Re-join records that were broken across physical lines: keep appending
# stripped lines to `buffer` until it holds at least expected_columns - 1
# commas, then emit it as one row. Every comma counts, including commas inside
# field values, and a partial record still left in `buffer` at end of file is
# not written out.
with open(input_path, 'r', encoding='utf-8', errors='ignore') as f:
    header = f.readline()
    cleaned_lines.append(header.strip())

    for line in f:
        buffer = buffer + line.strip()
        if buffer.count(",") < expected_columns - 1:
            continue  # record still incomplete, keep accumulating
        cleaned_lines.append(buffer)
        buffer = ""

# Write cleaned lines to a new file, one record per line
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(cleaned + "\n" for cleaned in cleaned_lines)

In [None]:
import csv

input_path = '/content/drive/MyDrive/Colab Notebooks/dataset/RAW_recipes.csv'
output_path = '/content/drive/MyDrive/Colab Notebooks/dataset/RAW_recipes_cleaned.csv'

# Re-parse the raw file with the csv module (quote-aware) and copy only the
# rows that have exactly 12 fields. This writes to the same output path as the
# comma-count cell above, so it replaces that cell's file.
with open(input_path, 'r', encoding='utf-8', errors='ignore') as infile:
    with open(output_path, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter=',', quotechar='"', escapechar='\\')
        writer = csv.writer(outfile)

        for row in reader:
            if len(row) != 12:  # not the expected number of columns
                continue
            writer.writerow(row)

In [None]:
# Reload df2 from the cleaned CSV, using pandas' default parsing options.
df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/RAW_recipes_cleaned.csv')

In [None]:
print(df2.shape)
df2.head(2).T

In [None]:
df2.info()

In [None]:
import ast

def safe_eval(val):
    """Parse a Python-literal string such as "['a', 'b']" into its value.

    Args:
        val: Text to parse with ``ast.literal_eval``.

    Returns:
        The parsed literal, or an empty list when ``val`` cannot be parsed.
    """
    try:
        return ast.literal_eval(val)
    # Catch only what literal_eval documents for malformed input; a bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []

# Turn the stringified list columns of df2 into real Python objects.
for col in ('tags', 'nutrition', 'steps', 'ingredients'):
    parsed_column = df2[col].apply(safe_eval)
    df2[col] = parsed_column

In [None]:
# Placeholder text for missing names and descriptions.
for column_name, placeholder in (
    ('name', "Unnamed Recipe"),
    ('description', "No description available."),
):
    df2[column_name] = df2[column_name].fillna(placeholder)

# Parse the submission date (unparseable values become NaT), then store it as
# text so it can be JSON-serialized.
df2['submitted'] = pd.to_datetime(df2['submitted'], errors='coerce').astype(str)

In [None]:
# One dict per df2 row, keyed by column name.
recipes_bot_ready = df2.to_dict(orient='records')

In [None]:
import json
# Write the df2 records to the Drive dataset folder as indented JSON.
with open("/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_df2_cleaned.json", "w") as f:
    json.dump(recipes_bot_ready, f, indent=2)

In [None]:
import pandas as pd

In [None]:
import json

# Load the cleaned df1 records back from JSON.
with open('/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_clean_recipes.json', 'r') as f:
    try:
        data = json.load(f)
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", e)
        # Re-raise so the notebook stops here instead of continuing with
        # `data` undefined (or left over from an earlier run).
        raise

In [None]:
# Build a DataFrame from the records loaded in the previous cell.
df1_clean = pd.DataFrame(data)

In [None]:
import json

# Load the cleaned df2 records back from JSON.
with open('/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_df2_cleaned.json', 'r') as f:
    try:
        data = json.load(f)
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", e)
        # Re-raise: otherwise `data` would still hold the records loaded by
        # the previous JSON cell and df2_clean would be built from them.
        raise

df2_clean = pd.DataFrame(data)

In [None]:
# Load the df3 export straight into a DataFrame with pandas' JSON reader.
df3_clean = pd.read_json('/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_df3_cleaned.json')

In [None]:
df3_clean.columns

In [None]:
df2_clean.columns

In [None]:
df1_clean.columns

In [None]:
df2_clean['name'].head(1)

In [None]:
# Rename columns for consistency
df1_clean = df1_clean.rename(columns={
    'recipe_name': 'name',
    'directions': 'steps',
    'cuisine_path': 'cuisine',
    'timing': 'prep_timing'
})

df2_clean = df2_clean.rename(columns={
    'description': 'desc'
})

df3_clean = df3_clean.rename(columns={
    'NER': 'entities',
    'title': 'name'
})

In [None]:
# Ordered union of the three frames' columns (first-seen order, no duplicates).
# A plain set of strings has no stable iteration order across interpreter
# sessions, which made the column order of the exported files vary per run.
all_columns = list(dict.fromkeys([*df1_clean.columns, *df2_clean.columns, *df3_clean.columns]))

In [None]:
# Give all three frames the same column set; columns a frame lacks are added
# and filled with NaN.
df1_ready, df2_ready, df3_ready = (
    frame.reindex(columns=all_columns)
    for frame in (df1_clean, df2_clean, df3_clean)
)

In [None]:
# Stack the three aligned frames into one, renumbering rows from 0.
combined_recipes_df = pd.concat([df1_ready, df2_ready, df3_ready], ignore_index=True)

In [None]:
# Export the combined frame as a JSON array with one object per row.
combined_recipes_df.to_json('/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_combined_recipes.json', orient='records', indent=2)

In [None]:
# Export the combined frame as CSV without the row index.
combined_recipes_df.to_csv('/content/drive/MyDrive/Colab Notebooks/dataset/nomnom_combined_recipes.csv', index=False)

In [None]:
combined_recipes_df.columns

In [None]:
!pip install -U sentence-transformers

In [None]:
def prepare_text(row):
    """Build the single text passage that gets embedded for one recipe row.

    Args:
        row: Mapping-like record (pandas Series or dict) supporting ``.get``.
            The fields read are 'name', 'desc', 'ingredients', 'steps',
            'tags' and 'cuisine'.

    Returns:
        str: the labelled passage. A missing value (absent key, None or float
        NaN) renders as empty text instead of the literal "None"/"nan", and
        list/tuple values are joined with ", " instead of being shown as
        Python repr.
    """
    def _as_text(value):
        # `value != value` is only true for NaN.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return str(value)

    return (
        f"{_as_text(row.get('name', ''))}. "
        f"Description: {_as_text(row.get('desc', ''))}. "
        f"Ingredients: {_as_text(row.get('ingredients', ''))}. "
        f"Steps: {_as_text(row.get('steps', ''))}. "
        f"Tags: {_as_text(row.get('tags', ''))}. "
        f"Cuisine: {_as_text(row.get('cuisine', ''))}."
    )

# Build the embedding passage for every row and store it in a new column.
combined_recipes_df['text_for_embedding'] = combined_recipes_df.apply(prepare_text, axis=1)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the 'all-MiniLM-L6-v2' sentence-transformers model used by the encode calls below.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
combined_recipes_df.count()

In [None]:
# Encode the embedding passage of every row in the combined frame, 64 texts
# per batch.
# NOTE(review): none of the visible cells saves `embeddings` to disk - confirm
# whether this full-dataset encode is still needed alongside the sampled one.
embeddings = model.encode(
    combined_recipes_df['text_for_embedding'].fillna("").tolist(),
    show_progress_bar=True,
    batch_size=64)

In [None]:
import pickle

# Draw a reproducible sample of up to 10000 recipes. The size is capped at the
# frame length because DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling without replacement).
sampled_df = combined_recipes_df.sample(
    n=min(10000, len(combined_recipes_df)),
    random_state=42,
).reset_index(drop=True)

# prepare_text is defined in an earlier cell; it is not redefined here so
# there is a single definition to maintain.
sampled_df['text_for_embedding'] = sampled_df.apply(prepare_text, axis=1)

# Pass a plain list of strings, as the full-dataset encode cell does.
sample_embeddings = model.encode(
    sampled_df['text_for_embedding'].fillna("").tolist(),
    show_progress_bar=True,
)

# Persist the embeddings together with the rows they belong to.
with open("sample_embeddings.pkl", "wb") as f:
    pickle.dump((sample_embeddings, sampled_df), f)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_similar_recipes(query, top_k=5):
    """Return the sampled recipes most similar to a free-text query.

    Args:
        query: Query text; encoded with the notebook-level ``model``.
        top_k: Number of results to return (default 5). 0 returns no rows.

    Returns:
        tuple: (copy of the matching ``sampled_df`` rows, best match first;
        array of their cosine-similarity scores in the same order).
    """
    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, sample_embeddings)[0]
    # Reverse first, then slice. For top_k >= 1 this is the same ranking as
    # [-top_k:][::-1], but top_k=0 now yields nothing; the old [-0:] slice
    # selected the whole array.
    top_indices = similarities.argsort()[::-1][:top_k]
    return sampled_df.iloc[top_indices].copy(), similarities[top_indices]

In [None]:
# Run one example query and print each hit with its similarity score.
results_df, scores = retrieve_similar_recipes("creamy mushroom pasta")
for rank, ((_, row), score) in enumerate(zip(results_df.iterrows(), scores), start=1):
    print(f"--- Recipe #{rank} (Score: {score:.4f}) ---")
    print("Name:", row["name"])
    print("Ingredients:", row["ingredients"])
    # Stringify before slicing so the preview is cut at 300 characters even
    # when steps is a list; slicing a list would take its first 300 elements.
    print("Steps:", str(row["steps"])[:300], "...\n")

In [None]:
# Reload the sample embeddings and their rows from the pickle written by the
# sampling cell above. No visible cell writes "recipe_embeddings.pkl", so
# loading that name fails on a fresh run (or overwrites `embeddings` and
# `combined_recipes_df` with whatever an older session left on disk).
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("sample_embeddings.pkl", "rb") as f:
    sample_embeddings, sampled_df = pickle.load(f)


In [None]:
!pip install faiss-cpu

In [None]:
import faiss
import numpy as np

# Build an exact L2-distance FAISS index over the sample embeddings and save it.
dim = sample_embeddings.shape[1]  # embedding width = size of the second axis
index = faiss.IndexFlatL2(dim)
# The vectors are cast to float32 before being added to the index.
index.add(np.array(sample_embeddings).astype('float32'))
faiss.write_index(index, "sample_recipe_index.faiss")

In [None]:
def retrieve_faiss(query, top_k=5):
    """Return the sampled recipes nearest to a free-text query via the FAISS index.

    Args:
        query: Query text; encoded with the notebook-level ``model``.
        top_k: Maximum number of neighbours to return (default 5).

    Returns:
        tuple: (copy of the matching ``sampled_df`` rows, nearest first;
        array of their distances as reported by ``index.search``).
    """
    query_vector = model.encode([query]).astype('float32')
    distances, indices = index.search(query_vector, top_k)
    # When the index holds fewer than top_k vectors FAISS pads the result with
    # label -1; drop those so iloc[-1] does not silently return the last row.
    found = indices[0] >= 0
    return sampled_df.iloc[indices[0][found]].copy(), distances[0][found]