In [10]:
!pip install --upgrade pip faker polars 

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m127.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:02[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.1.1
    Uninstalling pip-25.1.1:
      Successfully uninstalled pip-25.1.1
Successfully installed pip-25.2
[0m

In [12]:
# --- Import Libraries ---
import os
from datetime import datetime, timedelta
import time
import random
from faker import Faker
import polars as pl
from collections import defaultdict


FILE_PATH_STORES = "stores.csv"
FILE_PATH_CATEGORIES = "categories.csv"
FILE_PATH_PRODUCTS = "products.csv"
FILE_PATH_TRANSACTIONS = "transactions.csv"


# Initialize Faker with Indonesian locale
fake = Faker("id_ID")
Faker.seed(42) 


N_ROWS_TRANSACTIONS = 1_000_000 
N_CHUNKS = 10
CHUNK_SIZE = N_ROWS_TRANSACTIONS // N_CHUNKS

N_GENERATED_STORES = 50

# Date range for transactions (one month)
START_TRANSACTION_DATE = datetime(2023, 7, 1)
END_TRANSACTION_DATE = datetime(2023, 7, 30)

In [13]:
def generate_stores_data(num_stores: int) -> pl.DataFrame:
    print("Getting static products data...")
    store_data = [
          {"store_id": 1, "store_name": "Jue Coffee Kuningan City", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 2, "store_name": "Jue Coffee Grand Indonesia", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 3, "store_name": "Jue Coffee Senayan City", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 4, "store_name": "Jue Coffee Pondok Indah Mall", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 5, "store_name": "Jue Coffee Gandaria City", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 6, "store_name": "Jue Coffee Pacific Place", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 7, "store_name": "Jue Coffee Kota Kasablanka", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 8, "store_name": "Jue Coffee Lotte Avenue", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 9, "store_name": "Jue Coffee Plaza Senayan", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 10, "store_name": "Jue Coffee Sarinah Thamrin", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 11, "store_name": "Jue Coffee FX Sudirman", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 12, "store_name": "Jue Coffee Central Park", "city_name": "Kota Jakarta Barat"},
          {"store_id": 13, "store_name": "Jue Coffee Taman Anggrek", "city_name": "Kota Jakarta Barat"},
          {"store_id": 14, "store_name": "Jue Coffee Puri Indah Mall", "city_name": "Kota Jakarta Barat"},
          {"store_id": 15, "store_name": "Jue Coffee Mall Kelapa Gading", "city_name": "Kota Jakarta Utara"},
          {"store_id": 16, "store_name": "Jue Coffee Emporium Pluit", "city_name": "Kota Jakarta Utara"},
          {"store_id": 17, "store_name": "Jue Coffee PIK Avenue", "city_name": "Kota Jakarta Utara"},
          {"store_id": 18, "store_name": "Jue Coffee Mall Artha Gading", "city_name": "Kota Jakarta Utara"},
          {"store_id": 19, "store_name": "Jue Coffee One PM", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 20, "store_name": "Jue Coffee SCBD", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 21, "store_name": "Jue Coffee Kemang Village", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 22, "store_name": "Jue Coffee Cilandak Town Square", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 23, "store_name": "Jue Coffee Pejaten Village", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 24, "store_name": "Jue Coffee Bintaro XChange", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 25, "store_name": "Jue Coffee Transpark Bintaro", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 26, "store_name": "Jue Coffee The Breeze BSD", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 27, "store_name": "Jue Coffee Summarecon Mall Serpong", "city_name": "Kabupaten Tangerang"},
          {"store_id": 28, "store_name": "Jue Coffee Living World Alam Sutera", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 29, "store_name": "Jue Coffee Supermal Karawaci", "city_name": "Kabupaten Tangerang"},
          {"store_id": 30, "store_name": "Jue Coffee Tangcity Mall", "city_name": "Kota Tangerang"},
          {"store_id": 31, "store_name": "Jue Coffee AEON Mall BSD", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 32, "store_name": "Jue Coffee IKEA Alam Sutera", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 33, "store_name": "Jue Coffee Teras Kota BSD", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 34, "store_name": "Jue Coffee Ciputat Raya", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 35, "store_name": "Jue Coffee Pamulang", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 36, "store_name": "Jue Coffee ITC BSD", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 37, "store_name": "Jue Coffee Pasar Modern BSD", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 38, "store_name": "Jue Coffee Gading Serpong", "city_name": "Kabupaten Tangerang"},
          {"store_id": 39, "store_name": "Jue Coffee Mall @ Alam Sutera", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 40, "store_name": "Jue Coffee Cikokol", "city_name": "Kota Tangerang"},
          {"store_id": 41, "store_name": "Jue Coffee Graha Raya", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 42, "store_name": "Jue Coffee Modernland Tangerang", "city_name": "Kota Tangerang"},
          {"store_id": 43, "store_name": "Jue Coffee Alam Sutera Boulevard", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 44, "store_name": "Jue Coffee Stasiun Tangerang", "city_name": "Kota Tangerang"},
          {"store_id": 45, "store_name": "Jue Coffee Citra Raya", "city_name": "Kabupaten Tangerang"},
          {"store_id": 46, "store_name": "Jue Coffee Lippo Karawaci", "city_name": "Kabupaten Tangerang"},
          {"store_id": 47, "store_name": "Jue Coffee Daan Mogot Mall", "city_name": "Kota Jakarta Barat"},
          {"store_id": 48, "store_name": "Jue Coffee Puri Kembangan", "city_name": "Kota Jakarta Barat"},
          {"store_id": 49, "store_name": "Jue Coffee Green Lake City", "city_name": "Kota Jakarta Barat"},
          {"store_id": 50, "store_name": "Jue Coffee Kedoya", "city_name": "Kota Jakarta Barat"},
          {"store_id": 51, "store_name": "Jue Coffee Roxy Mas", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 52, "store_name": "Jue Coffee Cempaka Putih", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 53, "store_name": "Jue Coffee Senen", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 54, "store_name": "Jue Coffee Menteng", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 55, "store_name": "Jue Coffee Matraman", "city_name": "Kota Jakarta Timur"},
          {"store_id": 56, "store_name": "Jue Coffee Pramuka", "city_name": "Kota Jakarta Timur"},
          {"store_id": 57, "store_name": "Jue Coffee Arion Mall", "city_name": "Kota Jakarta Timur"},
          {"store_id": 58, "store_name": "Jue Coffee Buaran Plaza", "city_name": "Kota Jakarta Timur"},
          {"store_id": 59, "store_name": "Jue Coffee Klender", "city_name": "Kota Jakarta Timur"},
          {"store_id": 60, "store_name": "Jue Coffee Pondok Kopi", "city_name": "Kota Jakarta Timur"},
          {"store_id": 61, "store_name": "Jue Coffee Duren Sawit", "city_name": "Kota Jakarta Timur"},
          {"store_id": 62, "store_name": "Jue Coffee Kramat Jati", "city_name": "Kota Jakarta Timur"},
          {"store_id": 63, "store_name": "Jue Coffee Cawang", "city_name": "Kota Jakarta Timur"},
          {"store_id": 64, "store_name": "Jue Coffee Halim Perdanakusuma", "city_name": "Kota Jakarta Timur"},
          {"store_id": 65, "store_name": "Jue Coffee Kalimalang", "city_name": "Kota Jakarta Timur"},
          {"store_id": 66, "store_name": "Jue Coffee Jatiwaringin", "city_name": "Kota Bekasi"},
          {"store_id": 67, "store_name": "Jue Coffee Pondok Gede", "city_name": "Kota Bekasi"},
          {"store_id": 68, "store_name": "Jue Coffee Galaxy Bekasi", "city_name": "Kota Bekasi"},
          {"store_id": 69, "store_name": "Jue Coffee Grand Galaxy Park", "city_name": "Kota Bekasi"},
          {"store_id": 70, "store_name": "Jue Coffee Summarecon Mall Bekasi", "city_name": "Kota Bekasi"},
          {"store_id": 71, "store_name": "Jue Coffee Metropolitan Mall Bekasi", "city_name": "Kota Bekasi"},
          {"store_id": 72, "store_name": "Jue Coffee Mega Bekasi Hypermall", "city_name": "Kota Bekasi"},
          {"store_id": 73, "store_name": "Jue Coffee Trans Studio Mall Cibubur", "city_name": "Kota Depok"},
          {"store_id": 74, "store_name": "Jue Coffee Cibubur Junction", "city_name": "Kota Depok"},
          {"store_id": 75, "store_name": "Jue Coffee Margonda Raya", "city_name": "Kota Depok"},
          {"store_id": 76, "store_name": "Jue Coffee Margo City", "city_name": "Kota Depok"},
          {"store_id": 77, "store_name": "Jue Coffee Depok Town Square", "city_name": "Kota Depok"},
          {"store_id": 78, "store_name": "Jue Coffee Cinere Bellevue Mall", "city_name": "Kota Depok"},
          {"store_id": 79, "store_name": "Jue Coffee Sawangan", "city_name": "Kota Depok"},
          {"store_id": 80, "store_name": "Jue Coffee Bojongsari", "city_name": "Kota Depok"},
          {"store_id": 81, "store_name": "Jue Coffee Cimanggis", "city_name": "Kota Depok"},
          {"store_id": 82, "store_name": "Jue Coffee Stasiun Depok Baru", "city_name": "Kota Depok"},
          {"store_id": 83, "store_name": "Jue Coffee Sentul City", "city_name": "Kabupaten Bogor"},
          {"store_id": 84, "store_name": "Jue Coffee Botani Square", "city_name": "Kota Bogor"},
          {"store_id": 85, "store_name": "Jue Coffee Pajajaran Bogor", "city_name": "Kota Bogor"},
          {"store_id": 86, "store_name": "Jue Coffee Stasiun Bogor", "city_name": "Kota Bogor"},
          {"store_id": 87, "store_name": "Jue Coffee Yasmin Bogor", "city_name": "Kota Bogor"},
          {"store_id": 88, "store_name": "Jue Coffee Ciawi", "city_name": "Kabupaten Bogor"},
          {"store_id": 89, "store_name": "Jue Coffee Cibinong City Mall", "city_name": "Kabupaten Bogor"},
          {"store_id": 90, "store_name": "Jue Coffee Gunung Putri", "city_name": "Kabupaten Bogor"},
          {"store_id": 91, "store_name": "Jue Coffee BSD Green Office Park", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 92, "store_name": "Jue Coffee Foresta Business Loft", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 93, "store_name": "Jue Coffee Fatmawati", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 94, "store_name": "Jue Coffee Ampera", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 95, "store_name": "Jue Coffee Radio Dalam", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 96, "store_name": "Jue Coffee Cipete", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 97, "store_name": "Jue Coffee Kebon Jeruk", "city_name": "Kota Jakarta Barat"},
          {"store_id": 98, "store_name": "Jue Coffee Tanjung Duren", "city_name": "Kota Jakarta Barat"},
          {"store_id": 99, "store_name": "Jue Coffee Mangga Besar", "city_name": "Kota Jakarta Barat"},
          {"store_id": 100, "store_name": "Jue Coffee Gajah Mada", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 101, "store_name": "Jue Coffee Pasar Baru", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 102, "store_name": "Jue Coffee Glodok", "city_name": "Kota Jakarta Barat"},
          {"store_id": 103, "store_name": "Jue Coffee Sunter", "city_name": "Kota Jakarta Utara"},
          {"store_id": 104, "store_name": "Jue Coffee Danau Sunter", "city_name": "Kota Jakarta Utara"},
          {"store_id": 105, "store_name": "Jue Coffee Kelapa Gading Boulevard", "city_name": "Kota Jakarta Utara"},
          {"store_id": 106, "store_name": "Jue Coffee Harapan Indah", "city_name": "Kota Bekasi"},
          {"store_id": 107, "store_name": "Jue Coffee Kemayoran", "city_name": "Kota Jakarta Pusat"},
          {"store_id": 108, "store_name": "Jue Coffee PIK 2", "city_name": "Kabupaten Tangerang"},
          {"store_id": 109, "store_name": "Jue Coffee BSD City", "city_name": "Kota Tangerang Selatan"},
          {"store_id": 110, "store_name": "Jue Coffee Karang Tengah", "city_name": "Kota Tangerang"},
          {"store_id": 111, "store_name": "Jue Coffee Ciledug", "city_name": "Kota Tangerang"},
          {"store_id": 112, "store_name": "Jue Coffee Legok", "city_name": "Kabupaten Tangerang"},
          {"store_id": 113, "store_name": "Jue Coffee Tigaraksa", "city_name": "Kabupaten Tangerang"},
          {"store_id": 114, "store_name": "Jue Coffee Cibinong", "city_name": "Kabupaten Bogor"},
          {"store_id": 115, "store_name": "Jue Coffee Ciomas", "city_name": "Kabupaten Bogor"},
          {"store_id": 116, "store_name": "Jue Coffee Sukabumi Utara", "city_name": "Kota Jakarta Barat"},
          {"store_id": 117, "store_name": "Jue Coffee Kalibata City", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 118, "store_name": "Jue Coffee Tebet Raya", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 119, "store_name": "Jue Coffee Pondok Ranggon", "city_name": "Kota Jakarta Timur"},
          {"store_id": 120, "store_name": "Jue Coffee Cipayung", "city_name": "Kota Jakarta Timur"},
          {"store_id": 121, "store_name": "Jue Coffee Kebayoran Lama", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 122, "store_name": "Jue Coffee Pasar Minggu", "city_name": "Kota Jakarta Selatan"},
          {"store_id": 123, "store_name": "Jue Coffee Jatinegara", "city_name": "Kota Jakarta Timur"},
          {"store_id": 124, "store_name": "Jue Coffee Pulogadung", "city_name": "Kota Jakarta Timur"},
          {"store_id": 125, "store_name": "Jue Coffee Cakung", "city_name": "Kota Jakarta Timur"},
          {"store_id": 126, "store_name": "Jue Coffee Rawamangun", "city_name": "Kota Jakarta Timur"},
          {"store_id": 127, "store_name": "Jue Coffee Tanjung Priok", "city_name": "Kota Jakarta Utara"},
          {"store_id": 128, "store_name": "Jue Coffee Koja", "city_name": "Kota Jakarta Utara"},
          {"store_id": 129, "store_name": "Jue Coffee Semper", "city_name": "Kota Jakarta Utara"},
          {"store_id": 130, "store_name": "Jue Coffee Ancol", "city_name": "Kota Jakarta Utara"},
          {"store_id": 131, "store_name": "Jue Coffee Pademangan", "city_name": "Kota Jakarta Utara"}
    ]
    return pl.DataFrame(store_data)

In [14]:

def get_categories_data() -> pl.DataFrame:
    print("Getting static categories data...")
    categories_data = [
        {"category_id": 1, "category_name": "Coffee"},
        {"category_id": 2, "category_name": "Non-Coffee"},
        {"category_id": 3, "category_name": "Snacks"},
        {"category_id": 4, "category_name": "Pastries & Cakes"},
        {"category_id": 5, "category_name": "Breakfast Menu"},
        {"category_id": 6, "category_name": "Lunch & Dinner"},
        {"category_id": 7, "category_name": "Desserts"},
        {"category_id": 8, "category_name": "Merchandise"},
        {"category_id": 9, "category_name": "Brewing Equipment"},
        {"category_id": 10, "category_name": "Packaged Beans"}
    ]
    return pl.DataFrame(categories_data)

In [21]:
def get_products_data() -> pl.DataFrame:
    print("Getting static products data...")
    products_data = [
    {"product_id": 101, "product_name": "Kopi Telur Tradisional", "category_id": 1, "unit_price": 18000, "base_price": 13320},
    {"product_id": 102, "product_name": "Kopi Kelapa Khas Vietnam", "category_id": 1, "unit_price": 22000, "base_price": 14960},
    {"product_id": 103, "product_name": "Kopi Vietnam Drip Original", "category_id": 1, "unit_price": 20000, "base_price": 13600},
    {"product_id": 104, "product_name": "Kopi Butter Gurih", "category_id": 1, "unit_price": 19000, "base_price": 12350},
    {"product_id": 105, "product_name": "Kopi Susu Kampung Kental Manis", "category_id": 1, "unit_price": 15000, "base_price": 10800},
    {"product_id": 106, "product_name": "Kopi Coklat Spesial", "category_id": 1, "unit_price": 21000, "base_price": 15750},
    {"product_id": 107, "product_name": "Es Kopi Susu Aren", "category_id": 1, "unit_price": 23000, "base_price": 16100},
    {"product_id": 108, "product_name": "Es Kopi Hitam Dingin", "category_id": 1, "unit_price": 16000, "base_price": 11040},
    {"product_id": 109, "product_name": "Es Kopi Hitam Lemon Segar", "category_id": 1, "unit_price": 18000, "base_price": 12600},
    {"product_id": 110, "product_name": "Drip Bag Coffee Lokal Blend", "category_id": 1, "unit_price": 25000, "base_price": 17250},
    {"product_id": 111, "product_name": "Espresso Shot", "category_id": 1, "unit_price": 12000, "base_price": 8160},
    {"product_id": 112, "product_name": "Americano Panas", "category_id": 1, "unit_price": 18000, "base_price": 12960},
    {"product_id": 113, "product_name": "Latte Panas", "category_id": 1, "unit_price": 28000, "base_price": 18480},
    {"product_id": 114, "product_name": "Cappuccino Panas", "category_id": 1, "unit_price": 28000, "base_price": 20160},
    {"product_id": 115, "product_name": "Macchiato Panas", "category_id": 1, "unit_price": 28000, "base_price": 18200},
    {"product_id": 116, "product_name": "Mocha Panas", "category_id": 1, "unit_price": 32000, "base_price": 22400},
    {"product_id": 117, "product_name": "Kopi Susu Regal", "category_id": 1, "unit_price": 25000, "base_price": 16250},
    {"product_id": 118, "product_name": "Kopi Pandan Latte", "category_id": 1, "unit_price": 27000, "base_price": 17550},
    {"product_id": 119, "product_name": "Ice Shaken Espresso", "category_id": 1, "unit_price": 29000, "base_price": 21750},
    {"product_id": 120, "product_name": "Cold Brew Black", "category_id": 1, "unit_price": 28000, "base_price": 19040},
    {"product_id": 121, "product_name": "Cold Brew White", "category_id": 1, "unit_price": 32000, "base_price": 20800},
    {"product_id": 122, "product_name": "Affogato", "category_id": 1, "unit_price": 30000, "base_price": 19500},
    {"product_id": 123, "product_name": "Manual Brew V60", "category_id": 1, "unit_price": 35000, "base_price": 22750},
    {"product_id": 124, "product_name": "Manual Brew Aeropress", "category_id": 1, "unit_price": 35000, "base_price": 26250},
    {"product_id": 125, "product_name": "Filter Coffee Seasonal", "category_id": 1, "unit_price": 38000, "base_price": 24700},
    {"product_id": 126, "product_name": "Kopi Hitam Gula Aren", "category_id": 1, "unit_price": 17000, "base_price": 12750},
    {"product_id": 127, "product_name": "Kopi Susu Caramel", "category_id": 1, "unit_price": 26000, "base_price": 18200},
    {"product_id": 128, "product_name": "Kopi Hazelnut Latte", "category_id": 1, "unit_price": 26000, "base_price": 17680},
    {"product_id": 129, "product_name": "Kopi Vanila Latte", "category_id": 1, "unit_price": 26000, "base_price": 18460},
    {"product_id": 130, "product_name": "Jue Coffee Signature Latte", "category_id": 1, "unit_price": 30000, "base_price": 20400},

    {"product_id": 201, "product_name": "Matcha Latte Premium", "category_id": 2, "unit_price": 38000, "base_price": 26600},
    {"product_id": 202, "product_name": "Pure Chocolate Dingin", "category_id": 2, "unit_price": 40000, "base_price": 26000},
    {"product_id": 203, "product_name": "Lemon Tea Segar", "category_id": 2, "unit_price": 25000, "base_price": 17500},
    {"product_id": 204, "product_name": "Red Velvet Latte Creamy", "category_id": 2, "unit_price": 39000, "base_price": 25350},
    {"product_id": 205, "product_name": "Thai Tea Original", "category_id": 2, "unit_price": 28000, "base_price": 21000},
    {"product_id": 206, "product_name": "Green Tea Latte", "category_id": 2, "unit_price": 36000, "base_price": 23400},
    {"product_id": 207, "product_name": "Taro Latte", "category_id": 2, "unit_price": 37000, "base_price": 27380},
    {"product_id": 208, "product_name": "Strawberry Milkshake", "category_id": 2, "unit_price": 35000, "base_price": 24500},
    {"product_id": 209, "product_name": "Cookies & Cream Frappe", "category_id": 2, "unit_price": 42000, "base_price": 27300},
    {"product_id": 210, "product_name": "Virgin Mojito", "category_id": 2, "unit_price": 33000, "base_price": 22440},
    {"product_id": 211, "product_name": "Lychee Tea", "category_id": 2, "unit_price": 29000, "base_price": 20000},
    {"product_id": 212, "product_name": "Peach Tea", "category_id": 2, "unit_price": 29000, "base_price": 18850},
    {"product_id": 213, "product_name": "Hot Chocolate Marshmallow", "category_id": 2, "unit_price": 35000, "base_price": 22750},
    {"product_id": 214, "product_name": "Chai Latte", "category_id": 2, "unit_price": 34000, "base_price": 25160},
    {"product_id": 215, "product_name": "Susu Regal", "category_id": 2, "unit_price": 28000, "base_price": 19600},
    {"product_id": 216, "product_name": "Orange Juice Fresh", "category_id": 2, "unit_price": 30000, "base_price": 21000},
    {"product_id": 217, "product_name": "Jus Alpukat", "category_id": 2, "unit_price": 32000, "base_price": 20800},
    {"product_id": 218, "product_name": "Mineral Water", "category_id": 2, "unit_price": 10000, "base_price": 6800},
    {"product_id": 219, "product_name": "Sparkling Water", "category_id": 2, "unit_price": 15000, "base_price": 9900},
    {"product_id": 220, "product_name": "Ginger Ale", "category_id": 2, "unit_price": 20000, "base_price": 14000},

    {"product_id": 301, "product_name": "French Fries Original", "category_id": 3, "unit_price": 25000, "base_price": 18000},
    {"product_id": 302, "product_name": "Chicken Nugget", "category_id": 3, "unit_price": 28000, "base_price": 19600},
    {"product_id": 303, "product_name": "Mini Spring Rolls", "category_id": 3, "unit_price": 26000, "base_price": 17420},
    {"product_id": 304, "product_name": "Samosa Ayam", "category_id": 3, "unit_price": 27000, "base_price": 17550},
    {"product_id": 305, "product_name": "Edamame Rebus", "category_id": 3, "unit_price": 22000, "base_price": 14300},
    {"product_id": 306, "product_name": "Crispy Mushroom", "category_id": 3, "unit_price": 29000, "base_price": 18850},
    {"product_id": 307, "product_name": "Onion Rings", "category_id": 3, "unit_price": 27000, "base_price": 19440},
    {"product_id": 308, "product_name": "Fish & Chips Bites", "category_id": 3, "unit_price": 35000, "base_price": 22750},
    {"product_id": 309, "product_name": "Nachos Cheese", "category_id": 3, "unit_price": 38000, "base_price": 24700},
    {"product_id": 310, "product_name": "Sweet Potato Fries", "category_id": 3, "unit_price": 28000, "base_price": 18200},

    {"product_id": 401, "product_name": "Butter Croissant", "category_id": 4, "unit_price": 28000, "base_price": 18200},
    {"product_id": 402, "product_name": "Chocolate Croissant", "category_id": 4, "unit_price": 32000, "base_price": 20800},
    {"product_id": 403, "product_name": "Almond Croissant", "category_id": 4, "unit_price": 35000, "base_price": 23450},
    {"product_id": 404, "product_name": "Pain Au Chocolat", "category_id": 4, "unit_price": 30000, "base_price": 19500},
    {"product_id": 405, "product_name": "Cinnamon Roll", "category_id": 4, "unit_price": 29000, "base_price": 21750},
    {"product_id": 406, "product_name": "Banana Bread Slice", "category_id": 4, "unit_price": 25000, "base_price": 17000},
    {"product_id": 407, "product_name": "Red Velvet Cake Slice", "category_id": 4, "unit_price": 45000, "base_price": 29250},
    {"product_id": 408, "product_name": "Chocolate Fudge Cake Slice", "category_id": 4, "unit_price": 45000, "base_price": 31050},
    {"product_id": 409, "product_name": "Blueberry Cheesecake Slice", "category_id": 4, "unit_price": 48000, "base_price": 31200},
    {"product_id": 410, "product_name": "Marble Cake Slice", "category_id": 4, "unit_price": 38000, "base_price": 24700},
    {"product_id": 411, "product_name": "Scones with Jam & Cream", "category_id": 4, "unit_price": 33000, "base_price": 22440},
    {"product_id": 412, "product_name": "Muffin Coklat Chip", "category_id": 4, "unit_price": 27000, "base_price": 19170},
    {"product_id": 413, "product_name": "Muffin Blueberry", "category_id": 4, "unit_price": 27000, "base_price": 17820},
    {"product_id": 414, "product_name": "Donat Gula", "category_id": 4, "unit_price": 18000, "base_price": 12600},
    {"product_id": 415, "product_name": "Cookies Chocochips", "category_id": 4, "unit_price": 22000, "base_price": 15400},

    {"product_id": 501, "product_name": "Classic Omelette", "category_id": 5, "unit_price": 40000, "base_price": 26000},
    {"product_id": 502, "product_name": "Scrambled Eggs on Toast", "category_id": 5, "unit_price": 42000, "base_price": 27300},
    {"product_id": 503, "product_name": "Avocado Toast with Poached Egg", "category_id": 5, "unit_price": 55000, "base_price": 35750},
    {"product_id": 504, "product_name": "Pancakes with Maple Syrup", "category_id": 5, "unit_price": 48000, "base_price": 31200},
    {"product_id": 505, "product_name": "Waffles with Berries", "category_id": 5, "unit_price": 50000, "base_price": 32500},
    {"product_id": 506, "product_name": "Granola Bowl with Yogurt", "category_id": 5, "unit_price": 45000, "base_price": 31050},
    {"product_id": 507, "product_name": "Fruit Platter Fresh", "category_id": 5, "unit_price": 35000, "base_price": 22750},
    {"product_id": 508, "product_name": "Chicken Porridge", "category_id": 5, "unit_price": 38000, "base_price": 27740},

    {"product_id": 601, "product_name": "Spaghetti Aglio Olio", "category_id": 6, "unit_price": 65000, "base_price": 47450},
    {"product_id": 602, "product_name": "Chicken Carbonara Pasta", "category_id": 6, "unit_price": 70000, "base_price": 45500},
    {"product_id": 603, "product_name": "Nasi Goreng Kampung Jue", "category_id": 6, "unit_price": 58000, "base_price": 43500},
    {"product_id": 604, "product_name": "Mie Goreng Tek-Tek", "category_id": 6, "unit_price": 55000, "base_price": 35750},
    {"product_id": 605, "product_name": "Caesar Salad with Grilled Chicken", "category_id": 6, "unit_price": 60000, "base_price": 42000},
    {"product_id": 606, "product_name": "Chicken Katsu Curry Rice", "category_id": 6, "unit_price": 75000, "base_price": 50250},
    {"product_id": 607, "product_name": "Crispy Dory with Tartar Sauce", "category_id": 6, "unit_price": 72000, "base_price": 46800},
    {"product_id": 608, "product_name": "Club Sandwich Classic", "category_id": 6, "unit_price": 55000, "base_price": 37950},
    {"product_id": 609, "product_name": "Beef Burger with Fries", "category_id": 6, "unit_price": 80000, "base_price": 52800},
    {"product_id": 610, "product_name": "Grilled Salmon Steak", "category_id": 6, "unit_price": 95000, "base_price": 61750},

    {"product_id": 701, "product_name": "Ice Cream Scoop (Vanilla)", "category_id": 7, "unit_price": 20000, "base_price": 14000},
    {"product_id": 702, "product_name": "Ice Cream Scoop (Chocolate)", "category_id": 7, "unit_price": 20000, "base_price": 13000},
    {"product_id": 703, "product_name": "Ice Cream Scoop (Strawberry)", "category_id": 7, "unit_price": 20000, "base_price": 14000},
    {"product_id": 704, "product_name": "Jue Coffee Banana Split", "category_id": 7, "unit_price": 40000, "base_price": 26000},
    {"product_id": 705, "product_name": "Molten Lava Cake with Ice Cream", "category_id": 7, "unit_price": 50000, "base_price": 32500},
    {"product_id": 706, "product_name": "Panna Cotta Berries", "category_id": 7, "unit_price": 45000, "base_price": 29250},
    {"product_id": 707, "product_name": "Tiramisu Klasik", "category_id": 7, "unit_price": 48000, "base_price": 33600},
        
    {"product_id": 801, "product_name": "Jue Coffee Tumbler (Small)", "category_id": 8, "unit_price": 95000, "base_price": 61750},
    {"product_id": 802, "product_name": "Jue Coffee Tumbler (Large)", "category_id": 8, "unit_price": 120000, "base_price": 78000},
    {"product_id": 803, "product_name": "Jue Coffee T-Shirt (Size M)", "category_id": 8, "unit_price": 150000, "base_price": 97500},
    {"product_id": 804, "product_name": "Jue Coffee Tote Bag", "category_id": 8, "unit_price": 80000, "base_price": 52000},
    {"product_id": 805, "product_name": "Jue Coffee Mug", "category_id": 8, "unit_price": 75000, "base_price": 48750},

    {"product_id": 901, "product_name": "V60 Dripper (Size 01)", "category_id": 9, "unit_price": 120000, "base_price": 78000},
    {"product_id": 902, "product_name": "Aeropress Kit", "category_id": 9, "unit_price": 450000, "base_price": 315000},
    {"product_id": 903, "product_name": "French Press (350ml)", "category_id": 9, "unit_price": 180000, "base_price": 126000},
    {"product_id": 904, "product_name": "Pour Over Kettle", "category_id": 9, "unit_price": 300000, "base_price": 210000},
    {"product_id": 905, "product_name": "Manual Grinder", "category_id": 9, "unit_price": 250000, "base_price": 162500},

    {"product_id": 1001, "product_name": "Jue Coffee House Blend (250g)", "category_id": 10, "unit_price": 85000, "base_price": 58650},
    {"product_id": 1002, "product_name": "Single Origin Arabica Gayo (250g)", "category_id": 10, "unit_price": 120000, "base_price": 84000},
    {"product_id": 1003, "product_name": "Single Origin Robusta Lampung (250g)", "category_id": 10, "unit_price": 75000, "base_price": 48750},
    {"product_id": 1004, "product_name": "Decaf Blend (250g)", "category_id": 10, "unit_price": 90000, "base_price": 60300},
    {"product_id": 1005, "product_name": "Kopi Susu Blend (500g)", "category_id": 10, "unit_price": 150000, "base_price": 105000}

    ]
    return pl.DataFrame(products_data)

In [22]:
def generate_transactions_data(
    num_rows: int,
    stores_df: pl.DataFrame,
    products_df: pl.DataFrame,
    output_csv_path: str
):
    if os.path.exists(output_csv_path) and pl.read_csv(output_csv_path).shape[0] == num_rows:
        print(f"File '{output_csv_path}' already exists with {num_rows} records. Skipping transaction data generation.")
        return

    print(f"Generating {num_rows} transaction records with fluctuating sales...")

    store_ids = stores_df["store_id"].to_list()
    product_ids = products_df["product_id"].to_list()
    product_id_to_price = {row["product_id"]: row["unit_price"] for row in products_df.iter_rows(named=True)}
    all_transactions_data = []

    # --- Logika untuk fluktuasi data ---
    daily_demand_multiplier = defaultdict(lambda: 1.0)
    for day_of_week in [5, 6]:  # Sabtu (5) dan Minggu (6)
        daily_demand_multiplier[day_of_week] = 1.5

    # Simulasikan pola permintaan musiman (misalnya, hari libur)
    peak_sales_dates = {
        (START_TRANSACTION_DATE.date() + timedelta(days=d)): 2.0 for d in range(10, 15)
    }

    # Persiapan daftar tanggal dan bobot untuk sampling
    dates_in_range = [START_TRANSACTION_DATE + timedelta(days=d) for d in range((END_TRANSACTION_DATE - START_TRANSACTION_DATE).days + 1)]
    date_weights = []
    for date in dates_in_range:
        multiplier = daily_demand_multiplier[date.weekday()]
        if date.date() in peak_sales_dates:
            multiplier *= peak_sales_dates[date.date()]
        date_weights.append(multiplier)
    
    # --- Akhir logika fluktuasi ---

    for i in range(N_CHUNKS):
        chunk_data = []
        for _ in range(CHUNK_SIZE):
            transaction_id = fake.uuid4()
            
            # Pilih tanggal secara acak dengan bobot
            transaction_date = random.choices(dates_in_range, weights=date_weights, k=1)[0]

            chosen_store_id = random.choice(store_ids)
            customer_id = None
            chosen_product_id = random.choice(product_ids)
            unit_price = product_id_to_price[chosen_product_id]

            quantity = random.choices([1, 2, 3, 4, 5], weights=[0.6, 0.2, 0.1, 0.05, 0.05])[0]
            payment_method = random.choice(['Cash', 'QRIS', 'Ovo', 'Gopay', 'ShopeePay', 'DANA', 'Debit card', 'Credit card'])
            total_price = quantity * unit_price

            # Introduce price discrepancies
            if random.random() < 0.1:
                total_price = int(total_price * random.uniform(0.9, 1.1))

            chunk_data.append({
                "transaction_id": transaction_id,
                # Ganti 'datetime' menjadi 'date'
                "date": transaction_date.strftime("%Y-%m-%d"),
                "store_id": chosen_store_id,
                "customer_id": customer_id,
                "product_id": chosen_product_id,
                "quantity": quantity,
                "payment_method": payment_method,
                "price": total_price
            })
            
        # Introduce duplicate transactions in a chunk
        if i > 0 and random.random() < 0.1:
            num_duplicates = random.randint(1, 5)
            if len(all_transactions_data) > num_duplicates:
                chunk_data.extend(random.sample(all_transactions_data, num_duplicates))

        all_transactions_data.extend(chunk_data)

    df_transactions = pl.DataFrame(all_transactions_data)
    df_transactions.write_csv(output_csv_path)
    print(f"Transaction data saved to {output_csv_path}")

In [23]:

total_start_time = time.perf_counter()

if not os.path.exists(FILE_PATH_STORES):
    stores_df = generate_stores_data(N_GENERATED_STORES)
    stores_df.write_csv(FILE_PATH_STORES)
    print(f"Stores data saved to {FILE_PATH_STORES}")
else:
    print(f"File '{FILE_PATH_STORES}' already exists. Loading existing data.")
    stores_df = pl.read_csv(FILE_PATH_STORES)

if not os.path.exists(FILE_PATH_CATEGORIES):
    categories_df = get_categories_data()
    categories_df.write_csv(FILE_PATH_CATEGORIES)
    print(f"Categories data saved to {FILE_PATH_CATEGORIES}")
else:
    print(f"File '{FILE_PATH_CATEGORIES}' already exists. Loading existing data.")
    categories_df = pl.read_csv(FILE_PATH_CATEGORIES)

if not os.path.exists(FILE_PATH_PRODUCTS):
    products_df = get_products_data()
    products_df.write_csv(FILE_PATH_PRODUCTS)
    print(f"Products data saved to {FILE_PATH_PRODUCTS}")
else:
    print(f"File '{FILE_PATH_PRODUCTS}' already exists. Loading existing data.")
    products_df = pl.read_csv(FILE_PATH_PRODUCTS)

generate_transactions_data(N_ROWS_TRANSACTIONS, stores_df, products_df, FILE_PATH_TRANSACTIONS)
 
total_stop_time = time.perf_counter()
elapsed_time = total_stop_time - total_start_time
print(f"\nTotal data generation and saving took {elapsed_time:.2f} seconds for {N_ROWS_TRANSACTIONS} transactions.")

File 'stores.csv' already exists. Loading existing data.
File 'categories.csv' already exists. Loading existing data.
File 'products.csv' already exists. Loading existing data.
File 'transactions.csv' already exists with 1000000 records. Skipping transaction data generation.

Total data generation and saving took 0.77 seconds for 1000000 transactions.


In [7]:
%%sql

SHOW DATABASES

25/08/22 08:53:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


namespace
brz_coffeeshop_db
brz_hospital_db
coffeeshop
gld_coffeeshop_db
gld_hospital_db
slv_coffeeshop_db
slv_hospital_db
coffeeshop_medalion


In [3]:
%%sql

create database if not exists coffeeshop;

In [1]:
%%sql 

use coffeeshop

25/08/24 04:15:37 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
%%sql
    
show tables

namespace,tableName,isTemporary
coffeeshop,categories,False
coffeeshop,products,False
coffeeshop,stores,False
coffeeshop,transactions,False


In [9]:
%%sql

drop table stores

Py4JJavaError: An error occurred while calling o36.sql.
: org.apache.iceberg.exceptions.ServiceFailureException: Server error: UncheckedSQLException: Failed to execute: DELETE FROM iceberg_tables WHERE catalog_name = ? AND table_namespace  = ? AND table_name = ? AND (iceberg_type = 'TABLE' OR iceberg_type IS NULL)
	at org.apache.iceberg.rest.ErrorHandlers$DefaultErrorHandler.accept(ErrorHandlers.java:241)
	at org.apache.iceberg.rest.ErrorHandlers$TableErrorHandler.accept(ErrorHandlers.java:123)
	at org.apache.iceberg.rest.ErrorHandlers$TableErrorHandler.accept(ErrorHandlers.java:107)
	at org.apache.iceberg.rest.HTTPClient.throwFailure(HTTPClient.java:215)
	at org.apache.iceberg.rest.HTTPClient.execute(HTTPClient.java:299)
	at org.apache.iceberg.rest.BaseHTTPClient.delete(BaseHTTPClient.java:55)
	at org.apache.iceberg.rest.RESTSessionCatalog.dropTable(RESTSessionCatalog.java:304)
	at org.apache.iceberg.catalog.BaseSessionCatalog$AsCatalog.dropTable(BaseSessionCatalog.java:112)
	at org.apache.iceberg.rest.RESTCatalog.dropTable(RESTCatalog.java:212)
	at org.apache.iceberg.CachingCatalog.dropTable(CachingCatalog.java:174)
	at org.apache.iceberg.spark.SparkCatalog.dropTableWithoutPurging(SparkCatalog.java:389)
	at org.apache.iceberg.spark.SparkCatalog.dropTable(SparkCatalog.java:354)
	at org.apache.spark.sql.execution.datasources.v2.DropTableExec.run(DropTableExec.scala:38)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.Dataset.<init>(Dataset.scala:220)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


In [17]:
%%sql

drop table categories

In [18]:
%%sql

drop table products

In [13]:
%%sql

drop table transactions

In [20]:
%%sql

CREATE TABLE IF NOT EXISTS categories (
    category_id     STRING,         
    category_name   STRING          
)
USING iceberg;

In [21]:
%%sql

CREATE TABLE IF NOT EXISTS categories (
    category_id     STRING,         
    category_name   STRING          
)
USING iceberg;

In [22]:
%%sql

CREATE TABLE IF NOT EXISTS products (
    product_id      STRING,        
    product_name    STRING,
    category_id     STRING,        
    unit_price      STRING,        
    base_price      STRING         
)
USING iceberg;

In [14]:
%%sql

CREATE TABLE IF NOT EXISTS transactions (
    transaction_id  STRING,
    datetime        TIMESTAMP,     
    store_id        STRING,        
    customer_id     STRING,        
    product_id      STRING,        
    quantity        STRING,        
    payment_method  STRING,        
    price           STRING        
)
USING iceberg
PARTITIONED BY (days(datetime));

In [24]:
stores_csv_path = "stores.csv" 
stores_df = spark.read.csv(stores_csv_path, header=True, inferSchema=True)

print(f"Schema for {stores_csv_path}:")
stores_df.printSchema()
print(f"First 5 rows from {stores_csv_path}:")
stores_df.show(5)


stores_df.write.format("iceberg").mode("overwrite").saveAsTable("stores")

print(f"Data from {stores_csv_path} has been successfully loaded into the 'stores' Iceberg table.")

Schema for stores.csv:
root
 |-- store_id: integer (nullable = true)
 |-- store_name: string (nullable = true)
 |-- city_name: string (nullable = true)

First 5 rows from stores.csv:
+--------+--------------------+--------------------+
|store_id|          store_name|           city_name|
+--------+--------------------+--------------------+
|       1|Jue Coffee Kuning...|Kota Jakarta Selatan|
|       2|Jue Coffee Grand ...|  Kota Jakarta Pusat|
|       3|Jue Coffee Senaya...|  Kota Jakarta Pusat|
|       4|Jue Coffee Pondok...|Kota Jakarta Selatan|
|       5|Jue Coffee Gandar...|Kota Jakarta Selatan|
+--------+--------------------+--------------------+
only showing top 5 rows



                                                                                

Data from stores.csv has been successfully loaded into the 'stores' Iceberg table.


In [25]:
# Membaca data categories.csv
categories_csv_path = "categories.csv" 
categories_df = spark.read.csv(categories_csv_path, header=True, inferSchema=True)

print(f"Schema for {categories_csv_path}:")
categories_df.printSchema()
print(f"First 5 rows from {categories_csv_path}:")
categories_df.show(5)

# Menulis data ke tabel Iceberg 'categories'
categories_df.write.format("iceberg").mode("overwrite").saveAsTable("categories")

print(f"Data from {categories_csv_path} has been successfully loaded into the 'categories' Iceberg table.")

Schema for categories.csv:
root
 |-- category_id: integer (nullable = true)
 |-- category_name: string (nullable = true)

First 5 rows from categories.csv:
+-----------+----------------+
|category_id|   category_name|
+-----------+----------------+
|          1|          Coffee|
|          2|      Non-Coffee|
|          3|          Snacks|
|          4|Pastries & Cakes|
|          5|  Breakfast Menu|
+-----------+----------------+
only showing top 5 rows

Data from categories.csv has been successfully loaded into the 'categories' Iceberg table.


In [26]:
# Membaca data products.csv
products_csv_path = "products.csv" # Sesuaikan path jika berbeda
products_df = spark.read.csv(products_csv_path, header=True, inferSchema=True)

print(f"Schema for {products_csv_path}:")
products_df.printSchema()
print(f"First 5 rows from {products_csv_path}:")
products_df.show(5, truncate=False) # truncate=False untuk melihat nama produk yang panjang

# Menulis data ke tabel Iceberg 'products'
products_df.write.format("iceberg").mode("overwrite").saveAsTable("products")

print(f"Data from {products_csv_path} has been successfully loaded into the 'products' Iceberg table.")

Schema for products.csv:
root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- unit_price: integer (nullable = true)
 |-- base_price: integer (nullable = true)

First 5 rows from products.csv:
+----------+------------------------------+-----------+----------+----------+
|product_id|product_name                  |category_id|unit_price|base_price|
+----------+------------------------------+-----------+----------+----------+
|101       |Kopi Telur Tradisional        |1          |18000     |13320     |
|102       |Kopi Kelapa Khas Vietnam      |1          |22000     |14960     |
|103       |Kopi Vietnam Drip Original    |1          |20000     |13600     |
|104       |Kopi Butter Gurih             |1          |19000     |12350     |
|105       |Kopi Susu Kampung Kental Manis|1          |15000     |10800     |
+----------+------------------------------+-----------+----------+----------+
only showing top 5

In [17]:
# Membaca data transactions.csv
transactions_csv_path = "transactions.csv" # Sesuaikan path jika berbeda

# 'inferSchema=True' akan mengenali kolom 'date' sebagai tipe data String.
transactions_df = spark.read.csv(transactions_csv_path, header=True, inferSchema=True)

print(f"Original Schema for {transactions_csv_path}:")
transactions_df.printSchema()
print(f"First 5 rows from {transactions_csv_path}:")
transactions_df.show(5, truncate=False)

# Konversi kolom 'date' ke tipe data Date atau Timestamp jika diperlukan
from pyspark.sql.functions import col, to_date

# Ubah 'datetime' menjadi 'date' di baris ini
# Kita bisa langsung mengonversinya ke tipe Date, karena tidak ada komponen waktu
transactions_df = transactions_df.withColumn("date", to_date(col("date"), "yyyy-MM-dd"))

print(f"\nSchema after casting date for {transactions_csv_path}:")
transactions_df.printSchema()

# Menulis data ke tabel Iceberg 'transactions'
transactions_df.write.format("iceberg").mode("overwrite").saveAsTable("transactions")

print(f"Data from {transactions_csv_path} has been successfully loaded into the 'transactions' Iceberg table.")

Original Schema for transactions.csv:
root
 |-- transaction_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- price: integer (nullable = true)

First 5 rows from transactions.csv:
+------------------------------------+----------+--------+-----------+----------+--------+--------------+-----+
|transaction_id                      |date      |store_id|customer_id|product_id|quantity|payment_method|price|
+------------------------------------+----------+--------+-----------+----------+--------+--------------+-----+
|bdd640fb-0667-4ad1-9c80-317fa3b1799d|2023-07-01|71      |NULL       |202       |1       |Debit card    |40000|
|1a3d1fa7-bc89-40a9-a3b8-c1e9392456de|2023-07-01|23      |NULL       |501       |1       |DANA          |40000|
|17fc695a-07a0-4a6e-8822-e8f

25/08/22 09:00:27 ERROR Executor: Exception in task 5.0 in stage 17.0 (TID 127)
org.apache.iceberg.exceptions.ValidationException: Cannot find source column for partition field: 1000: datetime_day: day(2)
	at org.apache.iceberg.exceptions.ValidationException.check(ValidationException.java:49)
	at org.apache.iceberg.PartitionSpec.checkCompatibility(PartitionSpec.java:636)
	at org.apache.iceberg.PartitionSpec$Builder.build(PartitionSpec.java:617)
	at org.apache.iceberg.UnboundPartitionSpec.bind(UnboundPartitionSpec.java:46)
	at org.apache.iceberg.PartitionSpecParser.fromJson(PartitionSpecParser.java:71)
	at org.apache.iceberg.PartitionSpecParser.lambda$fromJson$1(PartitionSpecParser.java:88)
	at org.apache.iceberg.util.JsonUtil.parse(JsonUtil.java:104)
	at org.apache.iceberg.PartitionSpecParser.lambda$fromJson$2(PartitionSpecParser.java:88)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
	at java.

Py4JJavaError: An error occurred while calling o176.saveAsTable.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 17.0 failed 1 times, most recent failure: Lost task 5.0 in stage 17.0 (TID 127) (f8cc73c689d8 executor driver): org.apache.iceberg.exceptions.ValidationException: Cannot find source column for partition field: 1000: datetime_day: day(2)
	at org.apache.iceberg.exceptions.ValidationException.check(ValidationException.java:49)
	at org.apache.iceberg.PartitionSpec.checkCompatibility(PartitionSpec.java:636)
	at org.apache.iceberg.PartitionSpec$Builder.build(PartitionSpec.java:617)
	at org.apache.iceberg.UnboundPartitionSpec.bind(UnboundPartitionSpec.java:46)
	at org.apache.iceberg.PartitionSpecParser.fromJson(PartitionSpecParser.java:71)
	at org.apache.iceberg.PartitionSpecParser.lambda$fromJson$1(PartitionSpecParser.java:88)
	at org.apache.iceberg.util.JsonUtil.parse(JsonUtil.java:104)
	at org.apache.iceberg.PartitionSpecParser.lambda$fromJson$2(PartitionSpecParser.java:88)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
	at java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1916)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
	at org.apache.iceberg.PartitionSpecParser.fromJson(PartitionSpecParser.java:86)
	at org.apache.iceberg.SerializableTable.lambda$specs$1(SerializableTable.java:223)
	at java.base/java.util.HashMap.forEach(HashMap.java:1421)
	at org.apache.iceberg.SerializableTable.specs(SerializableTable.java:221)
	at org.apache.iceberg.spark.source.SparkWrite$WriterFactory.createWriter(SparkWrite.java:674)
	at org.apache.iceberg.spark.source.SparkWrite$WriterFactory.createWriter(SparkWrite.java:668)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:441)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:430)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:496)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:393)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:621)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:624)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2898)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2834)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2833)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2833)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1253)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3102)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3036)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3025)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:995)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:390)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:364)
	at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.writeWithV2(WriteToDataSourceV2Exec.scala:248)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run(WriteToDataSourceV2Exec.scala:342)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run$(WriteToDataSourceV2Exec.scala:341)
	at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.run(WriteToDataSourceV2Exec.scala:248)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.$anonfun$writeToTable$1(WriteToDataSourceV2Exec.scala:587)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable(WriteToDataSourceV2Exec.scala:579)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable$(WriteToDataSourceV2Exec.scala:572)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.writeToTable(WriteToDataSourceV2Exec.scala:186)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.run(WriteToDataSourceV2Exec.scala:221)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:645)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:575)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.iceberg.exceptions.ValidationException: Cannot find source column for partition field: 1000: datetime_day: day(2)
	at org.apache.iceberg.exceptions.ValidationException.check(ValidationException.java:49)
	at org.apache.iceberg.PartitionSpec.checkCompatibility(PartitionSpec.java:636)
	at org.apache.iceberg.PartitionSpec$Builder.build(PartitionSpec.java:617)
	at org.apache.iceberg.UnboundPartitionSpec.bind(UnboundPartitionSpec.java:46)
	at org.apache.iceberg.PartitionSpecParser.fromJson(PartitionSpecParser.java:71)
	at org.apache.iceberg.PartitionSpecParser.lambda$fromJson$1(PartitionSpecParser.java:88)
	at org.apache.iceberg.util.JsonUtil.parse(JsonUtil.java:104)
	at org.apache.iceberg.PartitionSpecParser.lambda$fromJson$2(PartitionSpecParser.java:88)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
	at java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1916)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
	at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
	at org.apache.iceberg.PartitionSpecParser.fromJson(PartitionSpecParser.java:86)
	at org.apache.iceberg.SerializableTable.lambda$specs$1(SerializableTable.java:223)
	at java.base/java.util.HashMap.forEach(HashMap.java:1421)
	at org.apache.iceberg.SerializableTable.specs(SerializableTable.java:221)
	at org.apache.iceberg.spark.source.SparkWrite$WriterFactory.createWriter(SparkWrite.java:674)
	at org.apache.iceberg.spark.source.SparkWrite$WriterFactory.createWriter(SparkWrite.java:668)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:441)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:430)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:496)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:393)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:621)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:624)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more


In [18]:
# Di Spark SQL, hapus tabel yang ada
spark.sql("DROP TABLE IF EXISTS transactions;")

# Kemudian, jalankan kembali kode PySpark Anda
transactions_df.write.format("iceberg").mode("overwrite").saveAsTable("transactions")

                                                                                

In [28]:
%%sql

select * from categories

category_id,category_name
1,Coffee
2,Non-Coffee
3,Snacks
4,Pastries & Cakes
5,Breakfast Menu
6,Lunch & Dinner
7,Desserts
8,Merchandise
9,Brewing Equipment
10,Packaged Beans


In [29]:
%%sql

select * from stores

store_id,store_name,city_name
1,Jue Coffee Kuningan City,Kota Jakarta Selatan
2,Jue Coffee Grand Indonesia,Kota Jakarta Pusat
3,Jue Coffee Senayan City,Kota Jakarta Pusat
4,Jue Coffee Pondok Indah Mall,Kota Jakarta Selatan
5,Jue Coffee Gandaria City,Kota Jakarta Selatan
6,Jue Coffee Pacific Place,Kota Jakarta Selatan
7,Jue Coffee Kota Kasablanka,Kota Jakarta Selatan
8,Jue Coffee Lotte Avenue,Kota Jakarta Selatan
9,Jue Coffee Plaza Senayan,Kota Jakarta Pusat
10,Jue Coffee Sarinah Thamrin,Kota Jakarta Pusat


In [30]:
%%sql

select * from products

product_id,product_name,category_id,unit_price,base_price
101,Kopi Telur Tradisional,1,18000,13320
102,Kopi Kelapa Khas Vietnam,1,22000,14960
103,Kopi Vietnam Drip Original,1,20000,13600
104,Kopi Butter Gurih,1,19000,12350
105,Kopi Susu Kampung Kental Manis,1,15000,10800
106,Kopi Coklat Spesial,1,21000,15750
107,Es Kopi Susu Aren,1,23000,16100
108,Es Kopi Hitam Dingin,1,16000,11040
109,Es Kopi Hitam Lemon Segar,1,18000,12600
110,Drip Bag Coffee Lokal Blend,1,25000,17250


In [2]:
%%sql

select * from transactions

                                                                                

transaction_id,date,store_id,customer_id,product_id,quantity,payment_method,price
bdd640fb-0667-4ad1-9c80-317fa3b1799d,2023-07-01,71,,202,1,Debit card,40000
1a3d1fa7-bc89-40a9-a3b8-c1e9392456de,2023-07-01,23,,501,1,DANA,40000
17fc695a-07a0-4a6e-8822-e8f36c031199,2023-07-01,56,,130,1,QRIS,30000
8fadc1a6-06cb-4fb3-9a1d-e644815ef6d1,2023-07-01,51,,609,1,DANA,80000
6b65a6a4-8b81-48f6-b38a-088ca65ed389,2023-07-01,57,,308,1,Ovo,35000
de8a774b-cf36-458b-8737-819096da1dac,2023-07-01,2,,705,1,Gopay,50000
6c307511-b2b9-437a-a8df-6ec4ce4a2bbd,2023-07-01,88,,206,1,DANA,36000
c37459ee-f50b-4a63-b71e-cd7b27cd8130,2023-07-01,87,,114,1,DANA,28000
5be6128e-18c2-4797-a142-ea7d17be3111,2023-07-01,89,,503,1,Debit card,55000
bacfb3d0-0b1f-4163-8e9f-f57f43b7a3a6,2023-07-01,118,,409,2,Gopay,96000
