In [0]:
pip install openpyxl

In [0]:
%restart_python

# Bronze tables
Here we load the data as is, and we identify the variables that hold a single unique value so that they can be set as index.

https://docs.databricks.com/aws/en/tables/managed

We choose fully managed, Databricks-native tables, i.e. managed Delta tables.

In [0]:
# Databricks notebook source
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
import pandas as pd

# Get the active Spark session, or create one if none exists; the helper
# functions and cells below all use this `spark` handle.
spark = SparkSession.builder.getOrCreate()


### Create the schema if it does not exist

In [0]:
# Target catalog and schema for the tables written by the cells below.
CATALOG_NAME = "databricks_hackathon"
SCHEMA_NAME = "bronze"
# IF NOT EXISTS makes this statement safe to re-run.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SCHEMA_NAME}")

### Helper functions to load the different types of data

In [0]:
def load_lucillus_data(batch_name,pdf,CATALOG_NAME,SCHEMA_NAME):
    """Build the Spark DataFrame for the 'Lucullus Data' sheet of one workbook.

    Each column header is combined with the value found in the first sheet row
    (the unit of the column) so the unit is kept in the column name. Rows at
    positions 0-4 are then dropped, the column names are sanitised for Spark,
    and the ``Timestamp`` column is parsed as day-first datetimes.

    Relies on the notebook-level globals ``spark`` and ``raw_excel_path``.

    Args:
        batch_name: Batch identifier, stored in the ``batch_id`` column.
        pdf: Mapping of sheet name to pandas DataFrame; must contain the
            'Lucullus Data' sheet.
        CATALOG_NAME: Not used by this function; kept for call compatibility.
        SCHEMA_NAME: Not used by this function; kept for call compatibility.

    Returns:
        A Spark DataFrame holding the sheet data plus the metadata columns
        ``ingestion_time``, ``project``, ``source_file`` and ``batch_id``.

    Raises:
        KeyError: If the sheet is missing or has no ``Timestamp`` column after
            the columns have been renamed.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already imported these names.
    from pyspark.sql.functions import current_timestamp, lit

    print("START load_lucillus_data")
    sheet = pdf['Lucullus Data']

    # Keep the unit in the column name: "<header>_<unit>", or just the header
    # when the unit cell is empty. pd.isna covers NaN/None/NaT, and str()
    # keeps the .str accessor below working for non-string headers.
    new_columns = [
        str(name) if pd.isna(unit) else f"{name}_{unit}"
        for name, unit in zip(sheet.columns, sheet.iloc[0, :])
    ]

    # Slice first, then copy: only the data rows are duplicated and the result
    # is an independent frame that is safe to modify.
    # NOTE(review): rows 1-4 are presumably further header/metadata rows - confirm.
    df = sheet.iloc[5:, :].copy()
    df.columns = new_columns
    # Replace special characters that cause problems in Spark column names.
    df.columns = df.columns.str.replace(r'[ ,;{}\(\)\n\t\s=]', '_', regex=True)

    if "Timestamp" not in df.columns:
        raise KeyError("'Lucullus Data' sheet has no 'Timestamp' column after renaming")
    # Bracket assignment: attribute-style assignment is not a reliable way to
    # set a DataFrame column.
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], dayfirst=True)

    # Convert to Spark DataFrame
    sdf = spark.createDataFrame(df)

    # Add metadata columns. raw_excel_path is the notebook-level source
    # directory, so source_file records the directory, not the single file.
    sdf = (sdf
        .withColumn("ingestion_time", current_timestamp())
        .withColumn("project", lit("hackathon"))
        .withColumn("source_file", lit(raw_excel_path))
        .withColumn("batch_id", lit(batch_name))
    )

    return sdf

In [0]:
def load_capacitance_data(batch_name,pdf,CATALOG_NAME,SCHEMA_NAME):
    """Build the Spark DataFrame for the 'Capacitance' sheet of one workbook.

    Columns that hold exactly one value on every row are removed before the
    pandas-to-Spark conversion and re-added afterwards as literal columns.

    Relies on the notebook-level globals ``spark`` and ``raw_excel_path``.

    Args:
        batch_name: Batch identifier, stored in the ``batch_id`` column.
        pdf: Mapping of sheet name to pandas DataFrame; must contain the
            'Capacitance' sheet.
        CATALOG_NAME: Not used by this function; kept for call compatibility.
        SCHEMA_NAME: Not used by this function; kept for call compatibility.

    Returns:
        A Spark DataFrame holding the sheet data plus the metadata columns
        ``ingestion_time``, ``project``, ``source_file`` and ``batch_id``.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already imported these names.
    from pyspark.sql.functions import current_timestamp, lit

    print("START load_capacitance_data")
    pdf_cap = pdf['Capacitance'].copy()
    # Replace special characters that cause problems in Spark column names.
    pdf_cap.columns = pdf_cap.columns.str.replace(r'[ ,;{}\(\)\n\t\s=]', '_', regex=True)

    # A column is constant only if it has one distinct value AND no missing
    # cells. nunique() ignores NaN, so without the notna() check a column
    # with one value plus gaps would be treated as constant and its missing
    # cells would be overwritten by the literal below.
    cols_to_keep_as_index = [
        c for c in pdf_cap.columns
        if pdf_cap[c].notna().all() and pdf_cap[c].nunique() == 1
    ]
    pdf_ok = pdf_cap.loc[:, ~pdf_cap.columns.isin(cols_to_keep_as_index)]

    cap_sdf = spark.createDataFrame(pdf_ok)
    # Add ingestion metadata. raw_excel_path is the notebook-level source
    # directory, so source_file records the directory, not the single file.
    cap_sdf = (cap_sdf
        .withColumn("ingestion_time", current_timestamp())
        .withColumn("project", lit("hackathon"))
        .withColumn("source_file", lit(raw_excel_path))
        .withColumn("batch_id", lit(batch_name))
    )

    # Re-add the constant columns as literals; the first cell is the single
    # value of the column.
    for c in cols_to_keep_as_index:
        cap_sdf = cap_sdf.withColumn(c, lit(pdf_cap[c].iloc[0]))

    return cap_sdf


In [0]:
def create_or_merge_table(sdf,table_name,merge_keys=("Timestamp", "batch_id", "source_file")):
    """Create ``table_name`` from ``sdf``, or insert only its new rows if the table exists.

    When the table already exists, a Delta MERGE inserts every row of ``sdf``
    that has no match in the table on all ``merge_keys``; matched rows are
    left unchanged. Otherwise ``sdf`` is written as a new Delta table.

    Relies on the notebook-level global ``spark``.

    Args:
        sdf: Spark DataFrame to persist.
        table_name: Dot-separated table name, e.g. ``catalog.schema.table``.
        merge_keys: Column names that together identify a row. A single
            string is accepted as one key. The default reproduces the
            original hard-coded condition.

    Raises:
        ValueError: If ``merge_keys`` is empty.
    """
    from delta.tables import DeltaTable

    if isinstance(merge_keys, str):
        merge_keys = (merge_keys,)
    if not merge_keys:
        raise ValueError("merge_keys must contain at least one column name")

    # Back-quote each part of the name so it is safe to use as an SQL identifier.
    table_fqn = ".".join(f"`{part}`" for part in table_name.split("."))
    # Rows are considered identical when every merge key is equal.
    merge_condition = " AND ".join(
        f"target.`{key}` = updates.`{key}`" for key in merge_keys
    )

    if spark.catalog.tableExists(table_name):
        # The table already exists - MERGE so existing rows are not duplicated.
        (
            DeltaTable.forName(spark, table_fqn)
            .alias("target")
            .merge(source=sdf.alias("updates"), condition=merge_condition)
            .whenNotMatchedInsertAll()
            .execute()
        )
        print(f"Data MERGED (upserted) into existing table: {table_name}. Duplicates avoided.")
    else:
        # First load for this table: create it from the DataFrame.
        sdf.write.format("delta").mode("overwrite").saveAsTable(table_name)
        print(f"New table created: {table_name}")

    # This helper serves every table, so the closing log names the helper and
    # the table instead of one specific loader.
    print(f"End create_or_merge_table: {table_name}")

### Iterate over the files and load the data into Delta Lake

In [0]:
import os
import pandas as pd
from pyspark.sql.functions import current_timestamp, lit

# Directory holding the raw Excel exports; also read as a global by the loaders.
raw_excel_path = "/Volumes/databricks_hackathon/lucullus_data/lucullus_data_raw/"
CATALOG_NAME = "databricks_hackathon"
SCHEMA_NAME = "bronze"

# Sorted so the files are processed in a deterministic order
# (os.listdir returns entries in arbitrary order).
excel_files = sorted(f for f in os.listdir(raw_excel_path) if f.endswith(".xlsx") and "Lucullus" in f)
print("excel",excel_files)
for f in excel_files:
    # The batch name is the part of the file name before the first underscore.
    batch_name = f.split("_")[0]
    print(f"Processing {batch_name}...")

    # Reading is best-effort like the load steps below: an unreadable workbook
    # is reported and skipped instead of aborting the remaining files.
    try:
        # sheet_name=None loads every sheet into a dict keyed by sheet name.
        pdf = pd.read_excel(os.path.join(raw_excel_path, f), sheet_name=None)
    except Exception as e:
        print(f"There was an error reading {f}: ",e)
        continue

    # Each sheet is loaded independently so a failure on one sheet does not
    # prevent the other sheet of the same workbook from being ingested.
    for loader, table_prefix in (
        (load_lucillus_data, "Timeseries_bronze"),
        (load_capacitance_data, "Capacitance_bronze"),
    ):
        try:
            sdf = loader(batch_name,pdf,CATALOG_NAME,SCHEMA_NAME)
            table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.{table_prefix}_{batch_name}"
            create_or_merge_table(sdf,table_name)

        except Exception as e:
            print(f"There was an error to {loader.__name__}: ",e)
    



### Verify that the tables were created

In [0]:
def check_table_exist(table_name):
    """Read ``table_name`` through Spark and print its row count and schema.

    No exception is caught here: any error raised while reading or counting
    the table propagates to the caller.

    Args:
        table_name: Name of the table to read.
    """
    table_df = spark.read.table(table_name)
    n_rows = table_df.count()
    for message in (
        f"Verification successful: The table '{table_name}' is readable.",
        f"Total rows found: {n_rows}",
    ):
        print(message)
    table_df.printSchema()


# Verify both tables of every batch. Each table is checked in its own
# try/except so a failure on one table does not skip the check of the other.
for f in excel_files:
    batch_name = f.split("_")[0]

    for table_prefix in ("Timeseries_bronze", "Capacitance_bronze"):
        table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.{table_prefix}_{batch_name}"
        try:
            check_table_exist(table_name)
        except Exception as e :
            print("table not exists: ", e)

In [0]:
%sql
USE CATALOG databricks_hackathon;
USE SCHEMA bronze;
-- Preview a few rows of one time-series table created by the ingestion loop.
-- NOTE(review): assumes a batch named "adv01" was ingested; adjust the suffix
-- to an existing batch if needed.
SELECT * FROM timeseries_bronze_adv01 LIMIT 10;