In [0]:
import re
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, DateType, LongType
import pprint as pp
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

In [0]:
%sql

-- Make sure the schema that holds the silver tables exists before anything is written to it.
create schema if not exists google_fit.silver

In [0]:
# Static per-feed configuration: each feed maps to a bronze (source) table and a
# silver (target) table that share the feed's name in their respective schemas.
base_config_dict = {
    feed_name: {
        "bronze_table_name": f"google_fit.bronze.{feed_name}",
        "silver_table_name": f"google_fit.silver.{feed_name}",
    }
    for feed_name in ("activities", "all_sessions", "daily_activity_metrics")
}

In [0]:
def build_silver_ingestion_config_dict():
    """
    Build the silver ingestion config from ``base_config_dict``.

    Reads the ``bronze_max_ingestion_timestamps_dict`` task value published by the
    ``bronze_ingestion`` task. The value is a JSON object mapping a bronze table
    name to a timestamp, e.g.
    ``{"google_fit.bronze.activities": "2025-09-12 11:03:56.009982", ...}``.
    Each timestamp is attached to the feed whose ``bronze_table_name`` matches,
    under the key ``max_bronze_ingestion_timestamp``.

    Returns:
        dict: feed name -> a copy of that feed's base config, plus
        ``max_bronze_ingestion_timestamp`` for every feed that has a published
        timestamp. Feeds without one are returned without that key, and
        ``base_config_dict`` itself is never modified.
    """
    bronze_max_ingestion_timestamps_dict = json.loads(
        dbutils.jobs.taskValues.get(taskKey="bronze_ingestion", key="bronze_max_ingestion_timestamps_dict")
    )

    # Copy the per-feed dicts as well: a copy of only the outer dict would share them
    # with base_config_dict, so adding the timestamp below would leak into it.
    silver_ingestion_config_dict = {feed: dict(map_tables) for feed, map_tables in base_config_dict.items()}

    for map_tables in silver_ingestion_config_dict.values():
        bronze_table_name = map_tables['bronze_table_name']
        if bronze_table_name in bronze_max_ingestion_timestamps_dict:
            map_tables['max_bronze_ingestion_timestamp'] = bronze_max_ingestion_timestamps_dict[bronze_table_name]

    return silver_ingestion_config_dict

In [0]:
# Resolve the ingestion config once at notebook level; later cells read this variable.
silver_ingestion_config_dict = build_silver_ingestion_config_dict()

In [0]:
# Pretty-print the resolved config so it is visible in the cell output.
pp.pprint(silver_ingestion_config_dict)

In [0]:
def null_check(df):
    """
    Return the names of the columns of ``df`` that contain at least one null.

    The null counts for all columns are computed in one aggregation, so the
    DataFrame is scanned once instead of once per column.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        list[str]: column names, in ``df.columns`` order, that hold one or more
        nulls. Empty when there are no such columns or ``df`` has no columns.
    """
    column_names = df.columns
    if not column_names:
        return []

    # count() ignores nulls and when() without otherwise() yields null for
    # non-matching rows, so each expression counts exactly the null rows of one column.
    # Aliases are positional so they stay valid whatever the column names contain.
    null_counts = df.select(
        *[
            F.count(F.when(F.col(name).isNull(), 1)).alias(f"nulls_{position}")
            for position, name in enumerate(column_names)
        ]
    ).first()

    return [name for name, nulls in zip(column_names, null_counts) if nulls > 0]

def common_cleanup(df):
    """
    Apply the cleanup steps shared by every feed and return the resulting DataFrame.

    Steps, in order:
      1. Rename every column: lower-case it, remove the text "exploded", trim
         surrounding whitespace and underscores, and collapse runs of underscores
         into a single one.
      2. Init-cap the ``entity`` column and move it to the first position
         (the DataFrame must have an ``entity`` column after renaming).
      3. Round every ``DoubleType`` column to 5 decimal places.
    """
    def _normalise(raw_name):
        trimmed = raw_name.lower().replace("exploded", "").strip().strip('_')
        return re.sub(r"_{2,}", "_", trimmed)

    renamed_df = df.toDF(*[_normalise(raw_name) for raw_name in df.columns])

    remaining_cols = [name for name in renamed_df.columns if name != 'entity']
    cleaned_df = (
        renamed_df
        .withColumn('entity', F.initcap(F.col('entity')))
        .select('entity', *remaining_cols)
    )

    # Collect the double columns up front, then round them one by one.
    double_cols = [field.name for field in cleaned_df.schema if isinstance(field.dataType, DoubleType)]
    for name in double_cols:
        cleaned_df = cleaned_df.withColumn(name, F.round(name, 5))
    return cleaned_df


In [0]:
def activities_silver_ingestion(silver_ingestion_config_dict):
    """
    Load the ``activities`` feed from its bronze table into its silver table.

    The bronze table is read and passed through ``common_cleanup``. If the silver
    table does not exist yet, the whole cleaned DataFrame is written with
    ``saveAsTable`` (no explicit write mode). Otherwise only the rows whose
    ``etl_timestamp`` is greater than the feed's ``max_bronze_ingestion_timestamp``
    are appended.

    Args:
        silver_ingestion_config_dict: config with an ``'activities'`` entry holding
            ``bronze_table_name``, ``silver_table_name`` and, when the silver table
            already exists, ``max_bronze_ingestion_timestamp``.
    """
    feed_config = silver_ingestion_config_dict['activities']
    bronze_table = feed_config['bronze_table_name']
    silver_table = feed_config['silver_table_name']

    cleaned_df = common_cleanup(spark.table(bronze_table))

    if spark.catalog.tableExists(silver_table):
        # Incremental load: keep only rows newer than the recorded bronze timestamp.
        watermark = feed_config['max_bronze_ingestion_timestamp']
        new_rows_df = cleaned_df.filter(F.col('etl_timestamp') > F.lit(watermark))
        print(f"Starting append operation for the table: {silver_table}")
        new_rows_df.write.mode("append").saveAsTable(silver_table)
        print(f"Append operation was successful on the table :{silver_table}")
        return

    # First load: the silver table is created from the full cleaned DataFrame.
    print(f"Starting overwrite operation for the table: {silver_table}")
    cleaned_df.write.saveAsTable(silver_table)
    print(f"Overwrite operation was successful on the table :{silver_table}")


In [0]:
def all_sessions_silver_ingestion(silver_ingestion_config_dict):
    """
    Load the ``all_sessions`` feed from its bronze table into its silver table.

    The bronze table is read, passed through ``common_cleanup`` and then:
      * ``duration`` is replaced by ``duration_secs`` with the trailing ``s`` unit
        removed (the value is still a string; cast downstream if a number is needed);
      * ``endtime``, ``starttime``, ``segment_endtime`` and ``segment_starttime``
        are passed through ``to_utc_timestamp`` with zone ``"Z"``;
      * nulls in ``aggregate_floatvalue`` / ``aggregate_intvalue`` become 0;
      * ``aggregate_metricname`` loses its leading ``com.google.`` and has its
        remaining dots replaced by underscores;
      * ``segment_fitnessactivity`` is init-capped.

    If the silver table does not exist yet, the whole DataFrame is written with
    ``saveAsTable`` (no explicit write mode). Otherwise only rows whose
    ``etl_timestamp`` is greater than the feed's ``max_bronze_ingestion_timestamp``
    are appended.

    Args:
        silver_ingestion_config_dict: config with an ``'all_sessions'`` entry holding
            ``bronze_table_name``, ``silver_table_name`` and, when the silver table
            already exists, ``max_bronze_ingestion_timestamp``.
    """
    bronze_table = silver_ingestion_config_dict['all_sessions']['bronze_table_name']
    silver_table = silver_ingestion_config_dict['all_sessions']['silver_table_name']

    all_sessions_df = spark.table(bronze_table)
    all_sessions_df = common_cleanup(all_sessions_df)

    all_sessions_df = (
        # The replacement must be a string. "$1" puts back the digit captured before
        # the unit, so only the trailing "s" is dropped (e.g. "1800s" -> "1800").
        all_sessions_df.withColumn('duration_secs', F.regexp_replace(F.col('duration'), "(\\d)s$", "$1"))
                        .drop('duration')
                        .withColumn('endtime', F.to_utc_timestamp(F.col('endtime'), "Z"))
                        .withColumn('starttime', F.to_utc_timestamp(F.col('starttime'), "Z"))
                        .withColumn('segment_endtime', F.to_utc_timestamp(F.col('segment_endtime'), "Z"))
                        .withColumn('segment_starttime', F.to_utc_timestamp(F.col('segment_starttime'), "Z"))
                        .fillna({"aggregate_floatvalue": 0, "aggregate_intvalue": 0})
                        # Inner call strips the "com.google." prefix, outer call turns the remaining dots into "_".
                        .withColumn('aggregate_metricname', F.regexp_replace(F.regexp_replace(F.col('aggregate_metricname'), "^com\\.google\\.", ""), "\\.", "_"))
                        .withColumn('segment_fitnessactivity', F.initcap(F.col('segment_fitnessactivity')))
    )

    if not spark.catalog.tableExists(silver_table):
        # First load: the silver table is created from the full DataFrame.
        print(f"Starting overwrite operation for the table: {silver_table}")
        all_sessions_df.write.saveAsTable(silver_table)
        print(f"Overwrite operation was successful on the table :{silver_table}")
    else:
        # Incremental load: keep only rows newer than the recorded bronze timestamp.
        max_bronze_ingestion_timestamp = silver_ingestion_config_dict['all_sessions']['max_bronze_ingestion_timestamp']
        all_sessions_df = all_sessions_df.filter(F.col('etl_timestamp') > F.lit(max_bronze_ingestion_timestamp))
        print(f"Starting append operation for the table: {silver_table}")
        all_sessions_df.write.mode("append").saveAsTable(silver_table)
        print(f"Append operation was successful on the table :{silver_table}")

In [0]:
def daily_activity_metrics_silver_ingestion(silver_ingestion_config_dict):
    """
    Load the ``daily_activity_metrics`` feed from its bronze table into its silver table.

    Bronze columns are cast by name (case-insensitive, first matching rule wins):
    ``date`` -> date, ``count``/``minutes`` -> integer,
    ``speed``/``distance``/``kcal``/``weight``/``points`` -> double,
    ``duration`` -> long. Other columns keep their type. Afterwards ``fillna(0)``
    is applied, the frame goes through ``common_cleanup`` and ``_m_s`` in column
    names is shortened to ``_ms``.

    If the silver table does not exist yet, the whole DataFrame is written with
    ``saveAsTable`` (no explicit write mode). Otherwise only rows whose
    ``etl_timestamp`` is greater than the feed's ``max_bronze_ingestion_timestamp``
    are appended.

    Args:
        silver_ingestion_config_dict: config with a ``'daily_activity_metrics'`` entry
            holding ``bronze_table_name``, ``silver_table_name`` and, when the silver
            table already exists, ``max_bronze_ingestion_timestamp``.
    """
    feed_config = silver_ingestion_config_dict['daily_activity_metrics']
    bronze_table = feed_config['bronze_table_name']
    silver_table = feed_config['silver_table_name']

    # Ordered (pattern, target type) rules; the first pattern found in the name decides.
    cast_rules = (
        (r'date', DateType),
        (r'count|minutes', IntegerType),
        (r'speed|distance|kcal|weight|points', DoubleType),
        (r'duration', LongType),
    )

    typed_df = spark.table(bronze_table)
    for column_name in typed_df.columns:
        lowered_name = column_name.lower()
        target_type = next(
            (spark_type for pattern, spark_type in cast_rules if re.search(pattern, lowered_name)),
            None,
        )
        if target_type is None:
            continue
        typed_df = typed_df.withColumn(
            column_name,
            F.when(F.col(column_name).isNotNull(), F.col(column_name).cast(target_type())).otherwise(None),
        )

    cleaned_df = common_cleanup(typed_df.fillna(0))
    renamed_cols = [name.replace("_m_s", "_ms") for name in cleaned_df.columns]
    daily_activity_metrics_df = cleaned_df.toDF(*renamed_cols)

    if spark.catalog.tableExists(silver_table):
        # Incremental load: keep only rows newer than the recorded bronze timestamp.
        watermark = feed_config['max_bronze_ingestion_timestamp']
        new_rows_df = daily_activity_metrics_df.filter(F.col('etl_timestamp') > F.lit(watermark))
        print(f"Starting append operation for the table: {silver_table}")
        new_rows_df.write.mode("append").saveAsTable(silver_table)
        print(f"Append operation was successful on the table :{silver_table}")
        return

    # First load: the silver table is created from the full DataFrame.
    print(f"Starting overwrite operation for the table: {silver_table}")
    daily_activity_metrics_df.write.saveAsTable(silver_table)
    print(f"Overwrite operation was successful on the table :{silver_table}")

In [0]:
def silver_ingestion(silver_ingestion_config_dict_dict):
    """
    Run the three feed ingestions concurrently and surface any failure.

    A thread pool sized to the number of entries in the config runs
    ``activities_silver_ingestion``, ``all_sessions_silver_ingestion`` and
    ``daily_activity_metrics_silver_ingestion``, each with the full config.
    Once the pool has shut down, every future's result is requested so that an
    exception raised inside a worker is re-raised to the caller.

    Args:
        silver_ingestion_config_dict_dict: the silver ingestion config, passed
            unchanged to each ingestion function.
    """
    with ThreadPoolExecutor(len(silver_ingestion_config_dict_dict)) as pool:
        ingestion_jobs = (
            activities_silver_ingestion,
            all_sessions_silver_ingestion,
            daily_activity_metrics_silver_ingestion,
        )
        pending = [pool.submit(job, silver_ingestion_config_dict_dict) for job in ingestion_jobs]

    # Leaving the with-block waits for the workers; result() re-raises their errors.
    for finished in as_completed(pending):
        finished.result()
    

In [0]:
# Run the silver ingestion for all feeds using the config resolved earlier in the notebook.
silver_ingestion(silver_ingestion_config_dict)

In [0]:
# End the notebook run here, handing "Success" back as the notebook's exit value.
dbutils.notebook.exit("Success")

In [0]:
# Manual sanity check: do the activities silver and bronze tables hold the same number of rows?
# Uses silver_ingestion_config_dict, the config variable this notebook actually defines.
spark.table(silver_ingestion_config_dict['activities']['silver_table_name']).count() == spark.table(silver_ingestion_config_dict['activities']['bronze_table_name']).count()
