In [1]:
import pyspark.sql.functions as F
import json
from cdk.services.api.energi_data_service import EnergiDataService
from cdk.common_modules.access.secrets import Secrets
from cdk.common_modules.spark.spark_config import SparkConfig
from cdk.common_modules.spark.spark_session_builder import SparkSessionBuilder

storage_account_name = "adlsthuehomelakehousedev"

# Set Spark configurations
spark_config = SparkConfig(Secrets())

# Add jars to install
spark_config.add_jars_to_install(['hudi', 'azure_storage', 'delta'])

# Add storage account access
spark_config.add_storage_account_access(storage_account_name, method='access_key')

# Build SparkSession
spark = SparkSessionBuilder("ViewAzureData", spark_config).build()


INFO:root:Loading secret ADLS_adlsthuehomelakehousedev_access_key


:: loading settings :: url = jar:file:/opt/bitnami/python/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hudi#hudi-spark3.3-bundle_2.12 added as a dependency
org.apache.hadoop#hadoop-azure added as a dependency
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a7877af2-ce8d-43e0-9a1d-87eb5a4252b9;1.0
	confs: [default]
	found org.apache.hudi#hudi-spark3.3-bundle_2.12;0.13.1 in central
	found org.apache.hadoop#hadoop-azure;3.3.3 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found com.microsoft.azure#azure-storage;7.0.1 in central
	found com.fasterxml.jackson.core#jackson-core;2.13.2 in central
	found org.slf4j#slf4j-api;1.7.36 in central
	found com.microsoft.azure#azure-keyvault-core;1.0.0 in central
	found com.google.guava#guava;

23/12/10 21:17:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
dataset_name = "stock_transactions"
dataset_path = f"nordea"#/20231210/{dataset_name}"
container_name = "landing"
destination_container_name = "bronze"

# landing_path = f"hdfs://namenode:9000/data/landing/energi_data_service/{dataset_name}"
landing_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{dataset_path}"
bronze_path = f"abfss://{destination_container_name}@{storage_account_name}.dfs.core.windows.net/{dataset_path}/{dataset_name}"



In [5]:
import datetime
df = (
    spark.read.csv(landing_path, header=True, sep=";")
    .withColumn("date", F.to_date(F.concat_ws('-','year','month','day'), "yyyy-MM-dd"))
    .filter((F.col("date")>datetime.date(2023,12,9)))
)

df.show()

                                                                                

+------+--------------------+-------------+----------------+-----+------------+----+----+-----+---+----------+
|Status|     Instrument navn|Opdateret den|Transaktionstype|Antal|       Total| _c6|year|month|day|      date|
+------+--------------------+-------------+----------------+-----+------------+----+----+-----+---+----------+
|Udført|      A/s Jyske Bank|   06-12-2023|         Udbytte|  0.0|  101,14 DKK|null|2023|   12| 10|2023-12-10|
|Udført|    TORM plc A Aktie|   05-12-2023|         Udbytte|  0.0|  279,72 DKK|null|2023|   12| 10|2023-12-10|
|Udført|Meta Platforms Inc A|   05-12-2023|            Sælg|  2.0|4.373,11 DKK|null|2023|   12| 10|2023-12-10|
|Udført|      Microsoft Corp|   04-12-2023|            Sælg|  2.0|5.142,21 DKK|null|2023|   12| 10|2023-12-10|
|Udført|          D/S Norden|   28-11-2023|             Køb| 13.0|4.272,20 DKK|null|2023|   12| 10|2023-12-10|
|Udført|       Royal UNIBREW|   27-11-2023|             Køb| 10.0|4.633,00 DKK|null|2023|   12| 10|2023-12-10|
|

                                                                                

In [3]:
df = (
    spark.read.format('delta').load(bronze_path)
)

df.show()

                                                                                

23/12/10 21:18:12 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 9:>                                                          (0 + 1) / 1]

+------+--------------------+-------------+----------------+-----+------------+----+-----+---+----------+--------------------+
|Status|     Instrument_navn|Opdateret_den|Transaktionstype|Antal|       Total|year|month|day|      date|       dwh_updatedAt|
+------+--------------------+-------------+----------------+-----+------------+----+-----+---+----------+--------------------+
|Udført|      A/s Jyske Bank|   06-12-2023|         Udbytte|  0.0|  101,14 DKK|2023|   12| 10|2023-12-10|2023-12-10 21:16:...|
|Udført|    TORM plc A Aktie|   05-12-2023|         Udbytte|  0.0|  279,72 DKK|2023|   12| 10|2023-12-10|2023-12-10 21:16:...|
|Udført|Meta Platforms Inc A|   05-12-2023|            Sælg|  2.0|4.373,11 DKK|2023|   12| 10|2023-12-10|2023-12-10 21:16:...|
|Udført|      Microsoft Corp|   04-12-2023|            Sælg|  2.0|5.142,21 DKK|2023|   12| 10|2023-12-10|2023-12-10 21:16:...|
|Udført|          D/S Norden|   28-11-2023|             Køb| 13.0|4.272,20 DKK|2023|   12| 10|2023-12-10|2023-1

                                                                                