In [0]:
# Read the source table into `df`; the remaining cells validate and transform it.
bronze_table_name = "plstocks.bronze_dividend_history"
df = spark.table(bronze_table_name)

In [0]:
# Render the loaded frame for a quick visual inspection.
display(df)

In [0]:
# Register the frame as a temp view so it can be queried with SQL, then show
# every row in which at least one of the listed columns is NULL.
df.createOrReplaceTempView("TempDividendHistory")

required_columns = ["ticker", "dividend_year", "dividend_per_share", "dividend_value", "payment_date"]
null_predicate = " OR ".join(f"{name} IS NULL" for name in required_columns)
rows_with_nulls = spark.sql(f"SELECT * FROM TempDividendHistory WHERE {null_predicate}")
rows_with_nulls.display()

In [0]:
%sql

-- List rows dated on or after year 3000 (presumably placeholder or corrupt
-- dates -- confirm with the data owner).
-- The column is cast to DATE and compared with a typed DATE literal so the
-- comparison is chronological even when payment_date is stored as a string:
-- a plain string comparison against '3000-1-1' is lexical and would miss
-- '3000-01-01' through '3000-09-30'.
SELECT * FROM TempDividendHistory WHERE CAST(payment_date AS DATE) >= DATE'3000-01-01';

In [0]:
# Compare the number of distinct rows with the total number of rows in
# TempDividendHistory; a difference means the view contains exact duplicates.
# `SELECT DISTINCT COUNT(*)` must not be used here: DISTINCT would apply to the
# single aggregated result row, not to the input rows, so it always equals the
# plain COUNT(*). The distinct rows are therefore counted via a subquery.
count_with_distinct = spark.sql(
    "SELECT COUNT(*) FROM (SELECT DISTINCT * FROM TempDividendHistory) AS distinct_rows"
).collect()
count_without_distinct = spark.sql("SELECT COUNT(*) FROM TempDividendHistory").collect()
print(count_with_distinct, count_without_distinct)


In [0]:
from pyspark.sql.functions import col, format_number
from pyspark.sql.types import FloatType, DecimalType

# Build the silver frame: cast each column to its target type first, and only
# then drop incomplete and duplicate rows.
# The filters run after the casts on purpose:
#   * with ANSI mode off, a value that cannot be cast becomes NULL, so filtering
#     before the casts would let those NULLs through (a NULL dividend_year also
#     never matches an equality merge key);
#   * rows that differ only in their raw representation become identical after
#     casting, so de-duplicating earlier would leave duplicates behind.
silver_df = (
    df
    .withColumn("dividend_year", col("dividend_year").cast("int"))
    .withColumn("dividend_per_share", col("dividend_per_share").cast(DecimalType(12, 2)))
    # NOTE(review): the value is cast to a scale-0 decimal before being scaled
    # by 1000, so any fractional part of the source value is lost before the
    # multiplication -- confirm the source values are whole numbers.
    .withColumn("dividend_value", col("dividend_value").cast(DecimalType(12, 0)))
    .withColumn("dividend_value", col("dividend_value") * 1000)
    .withColumn("payment_date", col("payment_date").cast("date"))
    .na.drop(how="any")
    .dropDuplicates()
)
silver_df.display()

In [0]:
from delta.tables import DeltaTable

# Persist `silver_df` into the silver Delta table: create the table on the
# first run, otherwise upsert into it.
table_name = 'plstocks.silver_dividend_history'

# Rows are matched on ticker + dividend_year.
# NOTE(review): this assumes at most one source row per (ticker, dividend_year);
# confirm against the data, since a Delta merge cannot update one target row
# from several matching source rows.
merge_condition = (
    "existing.ticker = updates.ticker AND existing.dividend_year = updates.dividend_year"
)

if not spark.catalog.tableExists(table_name):
    # Table does not exist yet: write the whole frame as a new Delta table.
    silver_df.write.format("delta").mode("overwrite").saveAsTable(table_name)
else:
    # Table exists: update matched rows with all columns, insert the rest.
    ExistingDididendTable = DeltaTable.forName(spark, table_name)
    (
        ExistingDididendTable.alias("existing")
        .merge(silver_df.alias("updates"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )