In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
import random
import time
from faker import Faker
from pyspark.sql.functions import *
from datetime import datetime, timedelta

In [14]:
# Create the SparkSession shared by every cell below. getOrCreate() returns
# the session that is already running, if there is one, instead of starting
# a second one.
spark = (
    SparkSession.builder
    .appName("Module2Exercise")
    .config("spark.driver.memory", "512m")
    .config("spark.executor.memory", "512m")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

26/02/08 06:45:46 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [19]:
# Location of the CSV dataset, relative to this notebook.
path_data = "../module-3/data_ecommerce_indonesia_1jt"

# Explicit schema for the CSV columns, in file order; every column is nullable.
# NOTE(review): FloatType is single precision -- confirm that is precise
# enough for the price values stored in this dataset.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("city", StringType(), True)
    .add("category", StringType(), True)
    .add("product_name", StringType(), True)
    .add("price", FloatType(), True)
)

# Read the CSV with the schema above (no inference) and mark it for caching.
df = spark.read.csv(path_data, schema=schema).cache()

26/02/08 06:47:06 WARN CacheManager: Asked to cache already cached data.


In [21]:
# Report how many partitions df is currently split into.
df.rdd.getNumPartitions()

4

In [22]:
# Repartition to 64 partitions and cache the repartitioned DataFrame.
# `df` may already be marked for caching by an earlier cell. Rebinding the
# name without releasing that entry would leave two cached copies of the same
# data competing for memory, so keep a handle on the old frame and unpersist it.
# NOTE(review): relies on unpersist() not cascading to dependent caches
# (Spark >= 2.4) -- confirm for the Spark version in use.
_previous_df = df
df = _previous_df.repartition(64).cache()
_previous_df.unpersist()

In [23]:
# Report how many partitions df is currently split into.
df.rdd.getNumPartitions()



64

In [24]:
# 1. Realistic check: confirm that Jakarta & Surabaya are the busiest cities
print("--- Top 10 Kota dengan Transaksi Tertinggi ---")
(
    df.groupBy("city")
    .count()
    .sort("count", ascending=False)
    .show(10)
)

# 2. See which product categories sell the most
print("--- Distribusi Kategori Produk ---")
(
    df.groupBy("category")
    .count()
    .sort("count", ascending=False)
    .show()
)

# 3. Preview five rows (show() prints the first rows it gets, not the last)
print("--- Sample 5 Data ---")
df.show(n=5)

--- Top 10 Kota dengan Transaksi Tertinggi ---


                                                                                

+---------------+-----+
|           city|count|
+---------------+-----+
|Jakarta Selatan|61488|
|  Jakarta Barat|61404|
|  Jakarta Timur|61339|
|  Jakarta Utara|61278|
|  Jakarta Pusat|61177|
|        Bandung|46114|
|          Medan|46075|
|       Surabaya|45861|
|          Batam|37099|
|         Bekasi|37050|
+---------------+-----+
only showing top 10 rows
--- Distribusi Kategori Produk ---


                                                                                

+-------------+------+
|     category| count|
+-------------+------+
|    Groceries|357315|
|   Elektronik|213505|
|      Fashion|171564|
|       Health|142400|
|Home & Living|115181|
+-------------+------+

--- Sample 5 Data ---
+------+--------------------+-------------+---------+-----------------+---------+
|    id|                name|         city| category|     product_name|    price|
+------+--------------------+-------------+---------+-----------------+---------+
|948446|     Samsul Palastri|    Tangerang|Groceries| Minyak Goreng 2L|2908490.0|
|733191|        Salwa Namaga|Jakarta Utara|Groceries|Beras Premium 5kg|1059434.8|
|231361|Gantar Megantara,...|    Tangerang|   Health|     Minyak Telon|2424948.0|
|348137|Sutan Vero Narpat...|      Soreang|Groceries|      Susu Steril|2974120.0|
|331727|       Teddy Santoso|      Bandung|Groceries|   Mie Instan Dus| 992308.4|
+------+--------------------+-------------+---------+-----------------+---------+
only showing top 5 rows


In [27]:
# Coalesce to 4 partitions and cache the coalesced DataFrame.
# The frame `df` pointed to before this cell may itself be cached. Rebinding
# the name without releasing it would keep both copies in memory.
# NOTE(review): relies on unpersist() not cascading to dependent caches
# (Spark >= 2.4) -- confirm for the Spark version in use.
_wide_df = df
df = _wide_df.coalesce(4).cache()
# Materialize the new cache while the old one can still feed it ...
df.count()
# ... then drop the superseded cached copy.
_wide_df.unpersist()
df.rdd.getNumPartitions()



4

In [28]:
# 1. Realistic check: confirm that Jakarta & Surabaya are the busiest cities
print("--- Top 10 Kota dengan Transaksi Tertinggi ---")
(
    df.groupBy("city")
    .count()
    .sort("count", ascending=False)
    .show(10)
)

# 2. See which product categories sell the most
print("--- Distribusi Kategori Produk ---")
(
    df.groupBy("category")
    .count()
    .sort("count", ascending=False)
    .show()
)

# 3. Preview five rows (show() prints the first rows it gets, not the last)
print("--- Sample 5 Data ---")
df.show(n=5)

--- Top 10 Kota dengan Transaksi Tertinggi ---


                                                                                

+---------------+-----+
|           city|count|
+---------------+-----+
|Jakarta Selatan|61488|
|  Jakarta Barat|61404|
|  Jakarta Timur|61339|
|  Jakarta Utara|61278|
|  Jakarta Pusat|61177|
|        Bandung|46114|
|          Medan|46075|
|       Surabaya|45861|
|          Batam|37099|
|         Bekasi|37050|
+---------------+-----+
only showing top 10 rows
--- Distribusi Kategori Produk ---


                                                                                

+-------------+------+
|     category| count|
+-------------+------+
|    Groceries|357315|
|   Elektronik|213505|
|      Fashion|171564|
|       Health|142400|
|Home & Living|115181|
+-------------+------+

--- Sample 5 Data ---
+------+--------------------+-------------+---------+-----------------+---------+
|    id|                name|         city| category|     product_name|    price|
+------+--------------------+-------------+---------+-----------------+---------+
|948446|     Samsul Palastri|    Tangerang|Groceries| Minyak Goreng 2L|2908490.0|
|733191|        Salwa Namaga|Jakarta Utara|Groceries|Beras Premium 5kg|1059434.8|
|231361|Gantar Megantara,...|    Tangerang|   Health|     Minyak Telon|2424948.0|
|348137|Sutan Vero Narpat...|      Soreang|Groceries|      Susu Steril|2974120.0|
|331727|       Teddy Santoso|      Bandung|Groceries|   Mie Instan Dus| 992308.4|
+------+--------------------+-------------+---------+-----------------+---------+
only showing top 5 rows


In [None]:
# Save df as Parquet under "data_ecommerce_indonesia_1jt-parquet", partitioned
# by the city column; existing output at that path is overwritten.
df.write.parquet(
    "data_ecommerce_indonesia_1jt-parquet",
    mode="overwrite",
    partitionBy="city",
)

26/02/08 06:57:01 WARN MemoryManager: Total allocation exceeds 95.00% (510,027,360 bytes) of heap memory
Scaling row group sizes to 95.00% for 4 writers
26/02/08 06:57:01 WARN MemoryManager: Total allocation exceeds 95.00% (510,027,360 bytes) of heap memory
Scaling row group sizes to 95.00% for 4 writers
26/02/08 06:57:01 WARN MemoryManager: Total allocation exceeds 95.00% (510,027,360 bytes) of heap memory
Scaling row group sizes to 95.00% for 4 writers
26/02/08 06:57:02 WARN MemoryManager: Total allocation exceeds 95.00% (510,027,360 bytes) of heap memory
Scaling row group sizes to 95.00% for 4 writers
26/02/08 06:57:03 WARN MemoryManager: Total allocation exceeds 95.00% (510,027,360 bytes) of heap memory
Scaling row group sizes to 95.00% for 4 writers
26/02/08 06:57:04 WARN MemoryManager: Total allocation exceeds 95.00% (510,027,360 bytes) of heap memory
Scaling row group sizes to 95.00% for 4 writers
26/02/08 06:57:08 WARN MemoryManager: Total allocation exceeds 95.00% (510,027,360

In [None]:
# # 1. Setup Spark - Khusus RAM 8GB
# spark = SparkSession.builder \
#     .appName("IndoRealisticGenerator") \
#     .config("spark.driver.memory", "2g") \
#     .config("spark.sql.shuffle.partitions", "1") \
#     .getOrCreate()

# fake = Faker('id_ID')

# # --- CONFIGURATION: REALISTIC CITY DISTRIBUTION ---
# # Meniru pusat ekonomi Indonesia: Jabodetabek, Surabaya, Medan, Makassar, Batam.
# kota_indo = [
#     "Jakarta Selatan", "Jakarta Pusat", "Jakarta Barat", "Jakarta Timur", "Jakarta Utara",
#     "Bandung", "Surabaya", "Medan", "Bekasi", "Tangerang", "Tangerang Selatan", "Depok",
#     "Semarang", "Palembang", "Makassar", "Bogor", "Batam", "Pekanbaru", "Bandar Lampung",
#     "Malang", "Padang", "Denpasar", "Samarinda", "Tasikmalaya", "Serang", "Banjarmasin", "Pontianak",
#     "Cimahi", "Balikpapan", "Jambi", "Surakarta", "Mataram", "Manado", "Yogyakarta", "Cilegon",
#     "Palu", "Kupang", "Ambon", "Tarakan", "Cirebon", "Bengkulu", "Pekalongan", "Kediri", "Tegal",
#     "Binjai", "Pematangsiantar", "Jayapura", "Banda Aceh", "Probolinggo", "Banjarbaru", "Lubuklinggau",
#     "Tanjungpinang", "Batu", "Bitung", "Singkawang", "Pasuruan", "Ternate", "Banjar", "Pangkalpinang",
#     "Lhokseumawe", "Madiun", "Salatiga", "Blitar", "Tanjungbalai", "Langsa", "Palopo", "Metro", 
#     "Tebing Tinggi", "Bontang", "Bima", "Gorontalo", "Pagar Alam", "Bukittinggi", "Parepare",
#     "Bau-Bau", "Gunungsitoli", "Sawahlunto", "Tomohon", "Sabang", "Jatinangor", "Sumedang", "Garut",
#     "Purwakarta", "Subang", "Karawang", "Indramayu", "Majalengka", "Kuningan", "Soreang", "Ngamprah",
#     "Ciamis", "Pangandaran", "Sukabumi", "Cianjur", "Purwokerto", "Kebumen", "Cilacap", "Magelang"
# ]

# # Fungsi pembobotan realistis
# def get_city_weight(city):
#     if "Jakarta" in city: return 20  # Pusat segalanya
#     if city in ["Surabaya", "Medan", "Bandung"]: return 15 # Tier 1
#     if city in ["Bekasi", "Tangerang", "Makassar", "Batam", "Semarang"]: return 12 # Industri & Hub
#     if city in ["Jatinangor", "Palembang", "Depok", "Yogyakarta"]: return 10 # Pendidikan & Regional
#     return 1 # Kota lainnya

# kota_weights = [get_city_weight(k) for k in kota_indo]

# # --- CONFIGURATION: CATEGORIES & PRODUCTS ---
# cat_list = ["Elektronik", "Fashion", "Groceries", "Home & Living", "Health"]
# cat_weights = [15, 12, 25, 8, 10] # Groceries & Elektronik paling tinggi

# product_map = {
#     "Elektronik": ["Smartphone", "Laptop", "Powerbank", "TWS Earbuds", "Smart TV", "Mouse Wireless", "Tablet"],
#     "Fashion": ["Kaos Oversize", "Celana Cargo", "Hoodie", "Sneakers Lokal", "Tas Selempang", "Kemeja Rayon"],
#     "Groceries": ["Beras Premium 5kg", "Minyak Goreng 2L", "Mie Instan Dus", "Susu Steril", "Kopi Kapal Api", "Sabun Cuci Piring"],
#     "Home & Living": ["Air Purifier", "Lampu Pintar", "Sprei Aesthetic", "Tumblr 1L", "Wadah Vacuum", "Rak Sepatu"],
#     "Health": ["Masker KF94", "Vitamin C 1000mg", "Hand Sanitizer", "Minyak Telon", "Sabun Antiseptik", "Madu Murni"]
# }

# schema = StructType([
#     StructField("order_id", IntegerType(), False),
#     StructField("customer_name", StringType(), True),
#     StructField("city", StringType(), True),
#     StructField("category", StringType(), True),
#     StructField("product_name", StringType(), True),
#     StructField("amount", FloatType(), True)
# ])

# # --- EXECUTION: BATCH GENERATION ---
# total_rows = 1000000
# batch_size = 100000
# temp_path = "temp_realistic_ecommerce"

# print(f"🚀 Memulai Generate 1 Juta Baris Data Realistis Indonesia...")
# start_time = time.time()

# for i in range(1, (total_rows // batch_size) + 1):
#     batch_start = (i - 1) * batch_size
#     data_batch = []
    
#     for j in range(batch_size):
#         city = random.choices(kota_indo, weights=kota_weights, k=1)[0]
#         category = random.choices(cat_list, weights=cat_weights, k=1)[0]
#         product = random.choice(product_map[category])
        
#         data_batch.append((
#             batch_start + j,
#             fake.name(),
#             city,
#             category,
#             product,
#             __builtins__.round(random.uniform(10000, 3000000), 2)
#         ))
    
#     df_batch = spark.createDataFrame(data_batch, schema=schema)
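#     # NOTE(review): only batch 1 is written with a header row, but the merge
#     # step below reads this directory with header=True, so the first line of
#     # every later part file is treated as a header (see the CSVHeaderChecker
#     # warnings in this cell's output). Write the header for every batch, or
#     # for none and read the directory back with an explicit schema.
#     df_batch.write.mode("append").csv(temp_path, header=(i==1))
#     print(f"✅ Batch {i} selesai. RAM aman.")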

# # --- FINAL MERGE TO SINGLE CSV ---
# print("\n📦 Menggabungkan menjadi 1 file CSV...")
# df_final = spark.read.csv(temp_path, header=True, inferSchema=True)
# df_final.coalesce(1).write.mode("overwrite").option("header", "true").csv("data_ecommerce_indonesia_1jt")

# print(f"\n✨ SELESAI dalam {time.time() - start_time:.2f} detik")

ðŸš€ Memulai Generate 1 Juta Baris Data Realistis Indonesia...


26/02/08 06:33:34 WARN TaskSetManager: Stage 0 contains a task of very large size (1044 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 1 selesai. RAM aman.


26/02/08 06:33:49 WARN TaskSetManager: Stage 1 contains a task of very large size (1093 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 2 selesai. RAM aman.


26/02/08 06:33:59 WARN TaskSetManager: Stage 2 contains a task of very large size (1094 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 3 selesai. RAM aman.


26/02/08 06:34:16 WARN TaskSetManager: Stage 3 contains a task of very large size (1093 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 4 selesai. RAM aman.


26/02/08 06:34:26 WARN TaskSetManager: Stage 4 contains a task of very large size (1095 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 5 selesai. RAM aman.


26/02/08 06:34:37 WARN TaskSetManager: Stage 5 contains a task of very large size (1092 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 6 selesai. RAM aman.


26/02/08 06:34:45 WARN TaskSetManager: Stage 6 contains a task of very large size (1095 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 7 selesai. RAM aman.


26/02/08 06:34:54 WARN TaskSetManager: Stage 7 contains a task of very large size (1092 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 8 selesai. RAM aman.


26/02/08 06:35:10 WARN TaskSetManager: Stage 8 contains a task of very large size (1094 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 9 selesai. RAM aman.


26/02/08 06:35:29 WARN TaskSetManager: Stage 9 contains a task of very large size (1094 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

âœ… Batch 10 selesai. RAM aman.

ðŸ“¦ Menggabungkan menjadi 1 file CSV...


26/02/08 06:35:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 224576, Dalima Pratama, Semarang, Groceries, Kopi Kapal Api, 305210.53
 Schema: 824576, Wasis Narpati, Jayapura, Elektronik, TWS Earbuds, 1352703.9
Expected: 824576 but found: 224576
CSV file: file:///home/sanju3291/DATA-ENGINEER/03-tools-and-setup/technology-stack/06-spark-processing/notebooks/module-3/temp_realistic_ecommerce/part-00001-db6bc768-0754-4789-a6d1-52c3a3ee7d1a-c000.csv
26/02/08 06:35:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 524576, Kenes Wasita, Tasikmalaya, Groceries, Kopi Kapal Api, 213802.64
 Schema: 824576, Wasis Narpati, Jayapura, Elektronik, TWS Earbuds, 1352703.9
Expected: 824576 but found: 524576
CSV file: file:///home/sanju3291/DATA-ENGINEER/03-tools-and-setup/technology-stack/06-spark-processing/notebooks/module-3/temp_realistic_ecommerce/part-00001-e489b829-733a-4fc5-ab13-c35e9e4c470d-c000.csv
26/02/08 06:35:40 WARN CSVHeaderChecker


âœ¨ SELESAI dalam 160.54 detik
