In [0]:
# Spark SQL type classes used to declare explicit schemas for the CSV reads,
# and the functions module (aliased F) used to add metadata columns.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F
# Catalog name interpolated into every target table name
# (`<catalog_name>.bronze.<table>`) by the ingest cells of this notebook.
catalog_name = 'ecommerce'

## Brands

In [0]:
# Brands bronze ingest: read the raw CSV with an explicit schema, attach
# ingestion metadata, and overwrite the bronze Delta table.
brand_schema = StructType(
    [
        StructField('brand_code', StringType(), False),
        StructField('brand_name', StringType(), True),
        StructField('category_code', StringType(), True),
    ]
)

raw_path = '/Volumes/ecommerce/source_data/raw/ecomm-raw-data/brands/brands.csv'

# Read + metadata in one parenthesised chain (no backslash continuations):
#   ingested_at  -> timestamp evaluated when the query runs
#   _source_file -> file path taken from the reader's `_metadata` column
# NOTE(review): this audit column is named `ingested_at`, while the Category,
# Products and Date cells use `_ingested_at` -- confirm which name downstream
# consumers expect before aligning them.
df = (
    spark.read.csv(raw_path, header=True, schema=brand_schema)
    .withColumn('ingested_at', F.current_timestamp())
    .withColumn('_source_file', F.col("_metadata.file_path"))
)

# Full refresh: replace both the data and the schema of the target table.
(
    df.write.format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable(f'{catalog_name}.bronze.brz_brands')
)

# Small preview of what was ingested.
display(df.limit(5))


## Customers 

In [0]:
# Customers bronze ingest: read the raw CSV with an explicit schema, attach
# ingestion metadata, and overwrite the bronze Delta table.

# This cell's schema no longer uses FloatType, but the import is kept so that
# any later cell relying on the name being in scope keeps working.
from pyspark.sql.types import FloatType

customers_schema = StructType(
    [
        StructField('customer_id', StringType(), False),
        # Phone numbers are identifiers, not quantities. The previous FloatType
        # (single-precision) cannot represent typical phone numbers exactly and
        # discards leading zeros / '+' prefixes, so the raw value is kept as a
        # string and any normalisation is left to downstream layers.
        StructField('phone', StringType(), True),
        StructField('country_code', StringType(), True),
        StructField('country', StringType(), True),
        StructField('state', StringType(), True),
    ]
)

raw_path = '/Volumes/ecommerce/source_data/raw/ecomm-raw-data/customers/customers.csv'

# Read + metadata:
#   ingested_at  -> timestamp evaluated when the query runs
#   _source_file -> file path taken from the reader's `_metadata` column
# NOTE(review): the audit column is `ingested_at` here but `_ingested_at` in the
# Category, Products and Date cells -- confirm the intended name before aligning.
df = (
    spark.read.csv(raw_path, header=True, schema=customers_schema)
    .withColumn('ingested_at', F.current_timestamp())
    .withColumn('_source_file', F.col("_metadata.file_path"))
)

# Full refresh: replace both the data and the schema of the target table.
(
    df.write.format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable(f'{catalog_name}.bronze.brz_customers')
)

# Small preview of what was ingested.
display(df.limit(5))

## Category

In [0]:
# Category bronze ingest: read the raw CSV with an explicit schema, attach
# ingestion metadata, and overwrite the bronze Delta table.

# Both columns are declared as non-nullable strings.
category_schema = (
    StructType()
    .add('category_code', StringType(), False)
    .add('category_name', StringType(), False)
)

raw_path = '/Volumes/ecommerce/source_data/raw/ecomm-raw-data/category/category.csv'

# Read + metadata:
#   _ingested_at -> timestamp evaluated when the query runs
#   _source_file -> file path taken from the reader's `_metadata` column
df = (
    spark.read.csv(raw_path, header=True, schema=category_schema)
    .withColumn("_ingested_at", F.current_timestamp())
    .withColumn("_source_file", F.col("_metadata.file_path"))
)

# Full refresh: replace both the data and the schema of the target table.
(
    df.write.format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable(f'{catalog_name}.bronze.brz_category')
)

# Small preview of what was ingested.
display(df.limit(5))

## Products

In [0]:
# Products bronze ingest: read the raw CSV with an explicit schema, attach
# ingestion metadata, and overwrite the bronze Delta table.

# FloatType is imported here as well: the schema below uses it, and this cell
# previously only worked if the Customers cell had already been run.
from pyspark.sql.types import FloatType, LongType

products_schema = StructType(
    [
        StructField('product_id', LongType(), False),
        StructField('sku', StringType(), True),
        StructField('category_code', StringType(), True),
        StructField('brand_code', StringType(), True),
        StructField('color', StringType(), True),
        StructField('size', StringType(), True),
        StructField('material', StringType(), True),
        # NOTE(review): weight_grams and length_cm are read as strings while
        # width_cm / height_cm are floats -- presumably the raw values are not
        # cleanly numeric; confirm against the source file.
        StructField('weight_grams', StringType(), True),
        StructField('length_cm', StringType(), True),
        StructField('width_cm', FloatType(), True),
        StructField('height_cm', FloatType(), True),
        # NOTE(review): a count stored as float -- confirm this is intentional.
        StructField('rating_count', FloatType(), True),
    ]
)

raw_path = '/Volumes/ecommerce/source_data/raw/ecomm-raw-data/products/products.csv'

# Read + metadata:
#   _ingested_at -> timestamp evaluated when the query runs
#   _source_file -> file path taken from the reader's `_metadata` column
df = (
    spark.read.csv(raw_path, header=True, schema=products_schema)
    .withColumn("_ingested_at", F.current_timestamp())
    .withColumn("_source_file", F.col("_metadata.file_path"))
)

# Full refresh: replace both the data and the schema of the target table.
(
    df.write.format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable(f'{catalog_name}.bronze.brz_products')
)

# Small preview of what was ingested.
display(df.limit(5))

In [0]:
# Date bronze ingest: read the raw CSV with an explicit schema, attach
# ingestion metadata, and overwrite the bronze Delta table.

# NOTE(review): DateType is not referenced in this cell; the import is kept in
# case a later cell relies on it being in scope.
from pyspark.sql.types import DateType

# NOTE(review): `date` is read as a string rather than DateType -- presumably
# it is parsed in a downstream layer; confirm.
date_schema = (
    StructType()
    .add('date', StringType(), False)
    .add('year', IntegerType(), True)
    .add('day_name', StringType(), True)
    .add('quarter', IntegerType(), True)
    .add('week_of_year', IntegerType(), True)
)

raw_path = '/Volumes/ecommerce/source_data/raw/ecomm-raw-data/date/date.csv'

# Read + metadata:
#   _ingested_at -> timestamp evaluated when the query runs
#   _source_file -> file path taken from the reader's `_metadata` column
df = (
    spark.read.csv(raw_path, header=True, schema=date_schema)
    .withColumn("_ingested_at", F.current_timestamp())
    .withColumn("_source_file", F.col("_metadata.file_path"))
)

# Full refresh: replace both the data and the schema of the target table.
(
    df.write.format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable(f'{catalog_name}.bronze.brz_date')
)

# Small preview of what was ingested.
display(df.limit(5))