In [14]:
import kagglehub
import os
import glob
import logging

from spark import spark

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType

# Extract

In [22]:
# Download the Looker e-commerce dataset through kagglehub and collect the CSV
# files found directly inside the returned directory.
path = kagglehub.dataset_download("mustafakeser4/looker-ecommerce-bigquery-dataset")
# glob gives no ordering guarantee (it depends on the file system), so sort the
# matches to keep the processing order and the printed previews stable across runs.
csv_files = sorted(glob.glob(os.path.join(path, '*.csv')))

def _nullable_schema(*columns):
    """Build a StructType from (column_name, data_type) pairs, every field nullable."""
    return StructType([
        StructField(col_name, col_type, True) for col_name, col_type in columns
    ])


# Explicit schema for each CSV table, keyed by the CSV file's base name.
# Column order follows the order in which the fields are listed here.
schemas = {
    "products": _nullable_schema(
        ("id", StringType()),
        ("cost", DoubleType()),
        ("category", StringType()),
        ("name", StringType()),
        ("brand", StringType()),
        ("retail_price", DoubleType()),
        ("department", StringType()),
        ("sku", StringType()),
        ("distribution_center_id", IntegerType()),
    ),

    "orders": _nullable_schema(
        ("order_id", StringType()),
        ("user_id", StringType()),
        ("status", StringType()),
        ("gender", StringType()),
        ("created_at", TimestampType()),
        ("returned_at", TimestampType()),
        ("shipped_at", TimestampType()),
        ("delivered_at", TimestampType()),
        ("num_of_item", IntegerType()),
    ),

    "inventory_items": _nullable_schema(
        ("id", StringType()),
        ("product_id", StringType()),
        ("created_at", TimestampType()),
        ("sold_at", TimestampType()),
        ("cost", DoubleType()),
        ("product_category", StringType()),
        ("product_name", StringType()),
        ("product_brand", StringType()),
        ("product_retail_price", DoubleType()),
        ("product_department", StringType()),
        ("product_sku", StringType()),
        ("product_distribution_center_id", IntegerType()),
    ),

    "users": _nullable_schema(
        ("id", StringType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("email", StringType()),
        ("age", IntegerType()),
        ("gender", StringType()),
        ("state", StringType()),
        ("street_address", StringType()),
        ("postal_code", StringType()),
        ("city", StringType()),
        ("country", StringType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("traffic_source", StringType()),
        ("created_at", TimestampType()),
    ),

    "distribution_centers": _nullable_schema(
        ("id", StringType()),
        ("name", StringType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
    ),

    "events": _nullable_schema(
        ("id", StringType()),
        ("user_id", StringType()),
        ("sequence_number", IntegerType()),
        ("session_id", StringType()),
        ("created_at", TimestampType()),
        ("ip_address", StringType()),
        ("city", StringType()),
        ("state", StringType()),
        ("postal_code", StringType()),
        ("browser", StringType()),
        ("traffic_source", StringType()),
        ("uri", StringType()),
        ("event_type", StringType()),
    ),

    "order_items": _nullable_schema(
        ("id", StringType()),
        ("order_id", StringType()),
        ("user_id", StringType()),
        ("product_id", StringType()),
        ("inventory_item_id", StringType()),
        ("status", StringType()),
        ("created_at", TimestampType()),
        ("shipped_at", TimestampType()),
        ("delivered_at", TimestampType()),
        ("returned_at", TimestampType()),
        ("sale_price", DoubleType()),
    ),
}




# Convert each downloaded CSV into a Parquet dataset under ../data/, parsing it
# with the explicit schema registered for that table in `schemas`.
for file in csv_files:
    # Table name = file name up to its first dot (e.g. "orders.csv" -> "orders").
    file_name = os.path.basename(file).split('.')[0]
    if file_name not in schemas:
        # Without this guard an unexpected CSV aborts the loop with a bare
        # KeyError after some tables were already written; report and move on.
        logging.warning("No schema defined for '%s'; skipping %s", file_name, file)
        continue

    df = spark.read.option("header", "true").csv(file, schema=schemas[file_name])

    file_parquet = f"{file_name}.parquet"
    # "overwrite" replaces any previous output, so the cell can be re-run safely.
    df.write.mode("overwrite").parquet(f"../data/{file_parquet}")

                                                                                

# Transform (aggregation)

In [27]:
# Preview each Parquet table: print its name, the first two rows and its schema.
for file in csv_files:
    # Table name = file name up to its first dot (e.g. "orders.csv" -> "orders").
    file_name = os.path.basename(file).split('.')[0]
    if file_name not in schemas:
        # Only tables with a declared schema are previewed; warn rather than
        # fail with a KeyError on an unexpected CSV.
        logging.warning("No schema defined for '%s'; skipping preview", file_name)
        continue

    file_parquet = f"{file_name}.parquet"
    # Parquet files store their own schema. The former `header` option only
    # applies to CSV, and `schema=` is not an option that parquet() reads, so
    # both were silent no-ops and are dropped here.
    df = spark.read.parquet(f"../data/{file_parquet}")
    print(file_name)
    df.show(2)
    df.printSchema()
    print()

products
+-----+------------------+-----------+--------------------+-----+-----------------+----------+--------------------+----------------------+
|   id|              cost|   category|                name|brand|     retail_price|department|                 sku|distribution_center_id|
+-----+------------------+-----------+--------------------+-----+-----------------+----------+--------------------+----------------------+
|13842| 2.518749990849756|Accessories|Low Profile Dyed ...|   MG|             6.25|     Women|EBD58B8A3F1D72F42...|                     1|
|13928|2.3383499148894105|Accessories|Low Profile Dyed ...|   MG|5.949999809265137|     Women|2EAC42424D12436BD...|                     1|
+-----+------------------+-----------+--------------------+-----+-----------------+----------+--------------------+----------------------+
only showing top 2 rows

root
 |-- id: string (nullable = true)
 |-- cost: double (nullable = true)
 |-- category: string (nullable = true)
 |-- name: string