In [0]:
from pyspark.sql import functions as F, Window as W

# Notebook parameter: which catalog to run against (widget default is "ptd_dev").
dbutils.widgets.text("catalog", "ptd_dev")
catalog       = dbutils.widgets.get("catalog")
# NOTE(review): the widget value is interpolated into SQL verbatim; it is assumed
# to be a trusted job parameter -- confirm before exposing it to end users.
spark.sql(f"USE CATALOG {catalog}")

# Source tables for this run.
prices = spark.table("silver.prices_daily")
news = spark.table("silver.ticker_news_tone").select("ticker", "date", "weighted_gdelt_tone_avg", "news_count")

# The gold table does not exist before the first successful run. Loading it
# unconditionally makes the notebook fail here and never reach the
# "table does not exist" bootstrap branch further down, so guard the load.
# Downstream cells only touch `gold` after their own tableExists() check.
gold = (
    spark.table("gold.ticker_predictor_training_dataset")
    if spark.catalog.tableExists("gold.ticker_predictor_training_dataset")
    else None
)



In [0]:
def _rows_from_last_valid(df, last_valid):
    """Keep the rows of `df` that still need (re)processing.

    A row is kept when its ticker has no entry in `last_valid` (ticker not yet
    in gold with complete data) or when its date is on or after that ticker's
    `last_valid_date`.

    The boundary date itself is included on purpose: when that row was first
    written, the following day's close may not have been available yet, so its
    `adj_close_next` / `target_action` must be recomputed once newer prices exist.
    """
    return (
        df.join(last_valid, on="ticker", how="left")
        .filter(
            F.col("last_valid_date").isNull()
            | (F.col("date") >= F.col("last_valid_date"))
        )
        .drop("last_valid_date")
    )


if not spark.catalog.tableExists("gold.ticker_predictor_training_dataset"):
    # First run: nothing has been materialised yet, process everything.
    prices_to_process = prices
    news_to_process = news
else:
    # Latest date per ticker for which gold already holds both a price and a tone.
    last_valid = (gold
        .filter(F.col("adj_close").isNotNull() & F.col("weighted_gdelt_tone_avg").isNotNull())
        .groupBy("ticker")
        .agg(F.max("date").alias("last_valid_date"))
    )
    # Prices and news use the same cut-off so the boundary row keeps its tone
    # when the two are joined back together later.
    prices_to_process = _rows_from_last_valid(prices, last_valid)
    news_to_process = _rows_from_last_valid(news, last_valid)

In [0]:
# Render the news rows selected for processing in this run.
display(news_to_process)

In [0]:
from pyspark.sql import functions as F, Window as W

# Per-ticker window in chronological order, used to look one row ahead.
w = W.partitionBy("ticker").orderBy("date")

# Label each price row by comparing it with the next available close of the
# same ticker:
#   next close  > current close -> "buy"
#   next close  < current close -> "sell"
#   next close == current close -> "hold"
# When the next close is unknown (latest row of a ticker) or a price is NULL,
# none of the comparisons is true and the label is left NULL. Previously such
# rows fell into .otherwise("hold"), which mislabelled "unknown" as "hold".
prices_gold = (prices_to_process
    .withColumn("adj_close_next", F.lead("adj_close", 1).over(w))
    .withColumn("target_action",
        F.when(F.col("adj_close_next") > F.col("adj_close"), F.lit("buy"))
         .when(F.col("adj_close_next") < F.col("adj_close"), F.lit("sell"))
         .when(F.col("adj_close_next") == F.col("adj_close"), F.lit("hold"))
    )
)

In [0]:
# Render the labelled price rows (adj_close_next / target_action) built above.
display(prices_gold)

In [0]:
# Attach news tone to every labelled price row; rows without news keep NULL tone.
dataset = prices_gold.join(news_to_process, on=["ticker", "date"], how="left")
dataset = dataset.filter(F.col("ticker").isNotNull() & F.col("date").isNotNull())

key_cols = ["ticker", "date"]

if spark.catalog.tableExists("gold.ticker_predictor_training_dataset"):
    existing = gold.select(
        *key_cols,
        F.col("adj_close").alias("adj_close_existing"),
        F.col("weighted_gdelt_tone_avg").alias("weighted_gdelt_tone_avg_existing"),
    )

    # New = key not present in gold at all. An anti join on the keys is used
    # instead of "adj_close_existing IS NULL", which would also match existing
    # rows whose stored adj_close happens to be NULL.
    new_rows = dataset.join(existing.select(*key_cols), key_cols, "left_anti")

    # Updated = key present in gold but price or tone differs. eqNullSafe is
    # required: a plain != evaluates to NULL when either side is NULL, so a row
    # whose tone went from NULL to a real value would not be counted.
    modified_rows = dataset.join(existing, key_cols, "inner").filter(
        ~F.col("adj_close").eqNullSafe(F.col("adj_close_existing"))
        | ~F.col("weighted_gdelt_tone_avg").eqNullSafe(F.col("weighted_gdelt_tone_avg_existing"))
    )
    print(f"New records: {new_rows.count()}")
    print(f"Updated records: {modified_rows.count()}")

    # `dataset` only holds the incremental slice. Overwriting the table with it
    # alone would discard every historical row, so carry over the existing rows
    # that are not being replaced and write the union.
    # NOTE(review): this reads and overwrites the same table in one job, which
    # the original lineage already did via `last_valid`; assumed to be a Delta
    # table -- confirm.
    untouched = gold.join(dataset.select(*key_cols), key_cols, "left_anti")
    dataset_to_write = untouched.unionByName(dataset, allowMissingColumns=True)
else:
    print(f"New records: {dataset.count()}")
    print("Updated records: 0")
    dataset_to_write = dataset

dataset_to_write.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("gold.ticker_predictor_training_dataset")