# data generation

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, DateType
import datetime
import random


# --- Load your real meal dataset ---
# NOTE: `spark` is not created in this cell; it must already exist in the session.
food_df = spark.read.csv("./food_nutrition_dataset.csv", header=True, inferSchema=True)

# Columns this cell actually reads from the CSV:
#   "meal_type" - compared against the values in `meal_types`
#   "food_item" - the food label that is sampled for each generated meal
# If your column names differ, adjust the select/filter below.
meal_types = ["Breakfast", "Lunch", "Dinner", "Snack"]

# --- Define realistic water intake ranges (ml) per meal type ---
# (low, high) bounds passed to random.uniform when a meal row is generated.
water_ranges = {
    "Breakfast": (250, 500),
    "Lunch": (400, 700),
    "Dinner": (400, 700),
    "Snack": (150, 300),
}

# --- Parameters ---
# Both dates are inclusive: the generators iterate over (end - start).days + 1 days.
start_date = datetime.date(2025, 11, 1)
end_date = datetime.date(2025, 11, 30)
# NOTE(review): num_users is not read anywhere in the visible code; the user
# list used for generation is hard-coded separately. Confirm before relying on it.
num_users = 5  # adjustable

# --- Group food items by meal type for random sampling ---
# A single filtered collect replaces one Spark job per meal type. Every meal
# type gets a key up front so types with no matching rows map to an empty list.
food_by_type = {m: [] for m in meal_types}
meal_food_rows = (
    food_df.filter(food_df["meal_type"].isin(meal_types))
    .select("meal_type", "food_item")
    .collect()
)
for meal_food in meal_food_rows:
    food_by_type[meal_food["meal_type"]].append(meal_food["food_item"])

# --- Generate dummy data ---
# One row per (user, day, meal type). Height and base weight are drawn once per
# user; sport flag, weight and sleep are drawn once per day and repeated on
# each of that day's meal rows.
rows = []
users_ids = [5577150313, 5553957443, 4020332650]
total_days = (end_date - start_date).days + 1  # end_date is inclusive

for raw_user_id in users_ids:
    uid = str(raw_user_id)
    user_height = round(random.uniform(1.65, 1.85), 2)
    user_base_weight = round(random.uniform(60, 85), 1)

    for offset in range(total_days):
        current_day = start_date + datetime.timedelta(days=offset)
        did_sport = random.choice([True, False])
        day_weight = round(user_base_weight + random.uniform(-1, 1), 1)
        day_sleep = round(random.uniform(4.5, 9.0), 1)

        for meal in meal_types:
            candidates = food_by_type.get(meal, [])
            # Fall back to a placeholder label when the dataset has no food
            # for this meal type.
            if candidates:
                picked_food = random.choice(candidates)
            else:
                picked_food = f"Random {meal}"
            water_low, water_high = water_ranges[meal]
            water_amount = random.uniform(water_low, water_high)
            rows.append(
                (
                    uid,
                    current_day,
                    meal,
                    picked_food,
                    water_amount,
                    did_sport,
                    day_weight,
                    user_height,
                    day_sleep,
                )
            )


# Schema for the per-meal rows; field order must match the tuples appended to
# `rows`. Every column is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("user_id", StringType()),
        ("date", DateType()),
        ("meal_type", StringType()),
        ("food_name", StringType()),
        ("water_ml", FloatType()),
        ("sport_available", BooleanType()),
        ("weight_kg", FloatType()),
        ("height_m", FloatType()),
        ("sleep_hours", FloatType()),
    )
])
plan_rows = []

# (target name, sampler, bounds) for each user's baseline daily targets.
# Entries are sampled in this order, once per user.
base_target_specs = (
    ("Calories", random.randint, (2000, 2700)),
    ("Protein", random.randint, (90, 130)),
    ("Carbohydrates", random.randint, (250, 400)),
    ("Fat", random.randint, (60, 90)),
    ("Fiber", random.randint, (25, 35)),
    ("Sugars", random.randint, (40, 70)),
    ("Sodium", random.randint, (1500, 2500)),
    ("Cholesterol", random.randint, (200, 300)),
    ("Water_Intake", random.randint, (2000, 3000)),
    ("Sleep", random.uniform, (6.5, 8.0)),
    ("Steps", random.randint, (7000, 12000)),
    ("Calories_Burnt", random.randint, (1800, 2500)),
)

# (target name, sampler, bounds) for the small day-to-day variation added to
# each baseline. Order defines the column order of every plan row.
daily_jitter_specs = (
    ("Calories", random.randint, (-100, 100)),
    ("Protein", random.uniform, (-5, 5)),
    ("Carbohydrates", random.uniform, (-20, 20)),
    ("Fat", random.uniform, (-5, 5)),
    ("Fiber", random.uniform, (-2, 2)),
    ("Sugars", random.uniform, (-5, 5)),
    ("Sodium", random.uniform, (-100, 100)),
    ("Cholesterol", random.uniform, (-20, 20)),
    ("Water_Intake", random.uniform, (-150, 150)),
    ("Sleep", random.uniform, (-0.5, 0.5)),
    ("Steps", random.randint, (-1000, 1000)),
    ("Calories_Burnt", random.randint, (-150, 150)),
)

plan_day_count = (end_date - start_date).days + 1  # end_date is inclusive

for user_idx in users_ids:
    user_id = str(user_idx)

    # base daily targets (adjustable per user)
    base_targets = {
        target: float(sample(*bounds))
        for target, sample, bounds in base_target_specs
    }

    for day_offset in range(plan_day_count):
        day = start_date + datetime.timedelta(days=day_offset)

        # One row per user per day: the two key columns followed by the
        # jittered targets.
        daily_values = tuple(
            float(base_targets[target] + sample(*bounds))
            for target, sample, bounds in daily_jitter_specs
        )
        plan_rows.append((user_id, day) + daily_values)

# Schema for the per-day plan rows: two key columns followed by twelve float
# metrics, in the same order as the values in each `plan_rows` tuple.
# Every column is nullable.
plan_metric_columns = [
    "total_Calories (kcal)",
    "total_Protein (g)",
    "total_Carbohydrates (g)",
    "total_Fat (g)",
    "total_Fiber (g)",
    "total_Sugars (g)",
    "total_Sodium (mg)",
    "total_Cholesterol (mg)",
    "total_Water_Intake (ml)",
    "sleep_hours",
    "target_steps",
    "target_calories_burnt",
]

schema_plan = StructType(
    [
        StructField("user_id", StringType(), True),
        StructField("date", DateType(), True),
    ]
    + [StructField(metric, FloatType(), True) for metric in plan_metric_columns]
)

# --- Create DataFrames ---
# The two frames are built from separate row lists and schemas, so their
# creation order does not matter.
df_daily_meals = spark.createDataFrame(rows, schema=schema)
df_monthly_plan = spark.createDataFrame(plan_rows, schema=schema_plan)

# --- Show sample ---
# First 10 rows of each; the meals frame is printed without truncating values.
df_daily_meals.show(n=10, truncate=False)
df_monthly_plan.show(n=10)


                                                                                

+----------+----------+---------+------------+---------+---------------+---------+--------+-----------+
|user_id   |date      |meal_type|food_name   |water_ml |sport_available|weight_kg|height_m|sleep_hours|
+----------+----------+---------+------------+---------+---------------+---------+--------+-----------+
|5577150313|2025-11-01|Breakfast|Oats        |434.5987 |false          |65.9     |1.83    |8.0        |
|5577150313|2025-11-01|Lunch    |Nuts        |548.94495|false          |65.9     |1.83    |8.0        |
|5577150313|2025-11-01|Dinner   |Spinach     |652.39   |false          |65.9     |1.83    |8.0        |
|5577150313|2025-11-01|Snack    |Salmon      |254.29272|false          |65.9     |1.83    |8.0        |
|5577150313|2025-11-02|Breakfast|Apple       |334.9065 |false          |66.7     |1.83    |7.3        |
|5577150313|2025-11-02|Lunch    |Nuts        |533.6832 |false          |66.7     |1.83    |7.3        |
|5577150313|2025-11-02|Dinner   |Salmon      |441.9292 |false   

In [4]:
import logging
import time
import datetime
import decimal
import json
from google.cloud import firestore
from google.api_core.exceptions import GoogleAPICallError, RetryError
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, DateType
import random

# ----------------------------
# 1. Logging Setup
# ----------------------------
# Dedicated logger for the Firestore writer, emitting INFO and above.
fs_logger = logging.getLogger("firestore-writer")
fs_logger.setLevel(logging.INFO)

# Attach a stream handler only when no handler is already reachable, so
# re-running this cell does not stack duplicate handlers.
if not fs_logger.hasHandlers():
    log_format = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(log_format)
    fs_logger.addHandler(stream_handler)


# ----------------------------
# 2. Helper Functions
# ----------------------------
def _serialize_value(v):
    """Convert ``v`` into a value built only from JSON-compatible types.

    Conversion rules, applied in order:

    * ``None`` is returned as-is.
    * ``datetime.datetime`` / ``datetime.date`` become ISO-8601 strings.
    * ``decimal.Decimal`` becomes ``float``.
    * ``bytes`` / ``bytearray`` are decoded as UTF-8; undecodable data falls
      back to ``str(v)``.
    * Anything ``json.dumps`` already accepts is returned unchanged (the very
      same object).
    * ``dict`` / ``list`` / ``tuple`` values that ``json.dumps`` rejects are
      rebuilt element by element with these same rules, so a nested date or
      Decimal is converted instead of the whole container being stringified.
      Tuples come back as lists and non-string dict keys are stringified.
    * Everything else falls back to ``str(v)``.

    Args:
        v: Any Python value, typically one field of ``Row.asDict(recursive=True)``.

    Returns:
        The converted value as described above.
    """
    if v is None:
        return None
    if isinstance(v, (datetime.datetime, datetime.date)):
        return v.isoformat()
    if isinstance(v, decimal.Decimal):
        return float(v)
    if isinstance(v, (bytes, bytearray)):
        try:
            return v.decode()
        except UnicodeDecodeError:
            return str(v)

    # Probe: values json.dumps already handles are passed through untouched.
    try:
        json.dumps(v)
    except (TypeError, ValueError, OverflowError, RecursionError):
        pass
    else:
        return v

    # The probe failed. For containers, convert the offending members
    # individually rather than collapsing the whole structure into one string.
    try:
        if isinstance(v, dict):
            return {
                (key if isinstance(key, str) else str(key)): _serialize_value(item)
                for key, item in v.items()
            }
        if isinstance(v, (list, tuple)):
            return [_serialize_value(item) for item in v]
    except RecursionError:
        # Self-referential or extremely deep container: stay best-effort.
        return str(v)

    return str(v)


def _commit_with_retries(batch_obj, max_retries=3, base_backoff=0.5):
    """Commit ``batch_obj``, retrying with exponential backoff on failure.

    Args:
        batch_obj: Object exposing a ``commit()`` method.
        max_retries: Number of retries allowed after the first failed commit.
        base_backoff: Sleep (seconds) before the first retry; doubles on each
            subsequent retry.

    Raises:
        GoogleAPICallError, RetryError, IOError: the last error, re-raised
            once ``max_retries`` retries have been used up. Other exception
            types propagate immediately without a retry.
    """
    failures = 0
    while True:
        try:
            batch_obj.commit()
        except (GoogleAPICallError, RetryError, IOError) as err:
            failures += 1
            if failures > max_retries:
                fs_logger.exception("Firestore commit failed after %d attempts", failures - 1)
                raise
            # Delay doubles with every failure: base, 2*base, 4*base, ...
            delay = base_backoff * (2 ** (failures - 1))
            fs_logger.warning(
                "Transient error committing firestore batch (attempt %d). Backing off %.2fs. Error: %s",
                failures, delay, str(err)
            )
            time.sleep(delay)
        else:
            return


def make_firestore_writer(collection_name, firestore_client, batch_size=500, max_retries=3):
    """Build a function that writes a DataFrame's rows into Firestore.

    Each row is stored under ``users/<user_id>/<collection_name>/`` using a
    document reference created by ``.document()`` with no explicit ID.

    Args:
        collection_name: Name of the sub-collection under each user document.
        firestore_client: Client providing ``batch()`` and ``collection()``.
        batch_size: Number of ``set`` operations queued before a commit.
        max_retries: Passed through to ``_commit_with_retries`` for each commit.

    Returns:
        A callable ``write_batch_to_firestore(batch_df, epoch_id=None)``.
    """
    def write_batch_to_firestore(batch_df, epoch_id=None):
        """Write every row of ``batch_df`` that has a truthy ``user_id``.

        ``epoch_id`` is only used to label log messages.
        """
        epoch_label = str(epoch_id)

        total_rows = batch_df.count()
        if not total_rows:
            fs_logger.info("[epoch %s] empty, skipping collection=%s", epoch_label, collection_name)
            return

        fs_logger.info("[epoch %s] writing %s rows to Firestore collection '%s'",
                       epoch_label, total_rows, collection_name)

        committed_docs = 0
        pending_ops = 0
        write_batch = firestore_client.batch()

        for record in batch_df.toLocalIterator():
            fields = record.asDict(recursive=True)
            owner_id = fields.get("user_id")

            # The document path is keyed by user, so rows without one are dropped.
            if not owner_id:
                fs_logger.warning("[epoch %s] skipping row without user_id", epoch_label)
                continue

            payload = {key: _serialize_value(value) for key, value in fields.items()}

            user_doc = firestore_client.collection("users").document(str(owner_id))
            target_ref = user_doc.collection(collection_name).document()

            write_batch.set(target_ref, payload)
            pending_ops += 1

            if pending_ops < batch_size:
                continue

            # Batch is full: flush it and start a new one.
            _commit_with_retries(write_batch, max_retries=max_retries)
            committed_docs += pending_ops
            pending_ops = 0
            write_batch = firestore_client.batch()

        # Flush whatever is left over from the final, partially filled batch.
        if pending_ops > 0:
            _commit_with_retries(write_batch, max_retries=max_retries)
            committed_docs += pending_ops

        fs_logger.info("[epoch %s] wrote %d docs under users/*/%s/",
                       epoch_label, committed_docs, collection_name)

    return write_batch_to_firestore



In [5]:
# Run the batch writes: one writer per target sub-collection, applied to the
# matching DataFrame, in the order daily meals then monthly plan.
write_jobs = (
    ("daily_meals", df_daily_meals),
    ("monthly_plan", df_monthly_plan),
)
for target_collection, source_df in write_jobs:
    writer = make_firestore_writer(target_collection, db)
    writer(source_df, epoch_id="2025-11-07")

fs_logger.info("✅ Finished writing dummy data to Firestore.")

[INFO] 2025-11-10 13:43:06,499 - [epoch 2025-11-07] writing 360 rows to Firestore collection 'daily_meals'
[INFO] 2025-11-10 13:43:57,187 - [epoch 2025-11-07] wrote 360 docs under users/*/daily_meals/
[INFO] 2025-11-10 13:43:57,511 - [epoch 2025-11-07] writing 90 rows to Firestore collection 'monthly_plan'
[INFO] 2025-11-10 13:43:59,106 - [epoch 2025-11-07] wrote 90 docs under users/*/monthly_plan/
[INFO] 2025-11-10 13:43:59,106 - ✅ Finished writing dummy data to Firestore.
