In [0]:
from pyspark.sql import functions as F, Window as W
import datetime as dt

# --- Configuration ---------------------------------------------------------
# The catalog to work in is supplied through a notebook widget so the same
# notebook can be pointed at different catalogs; "ptd_dev" is the default.
dbutils.widgets.text("catalog", "ptd_dev")
catalog = dbutils.widgets.get("catalog")

# The widget value is external input that ends up inside a SQL statement.
# Quote it as an identifier (doubling any embedded backtick) so it can only
# ever be interpreted as a single catalog name.
quoted_catalog = "`" + catalog.replace("`", "``") + "`"
spark.sql(f"USE CATALOG {quoted_catalog}")

# --- Inputs ----------------------------------------------------------------
# Daily prices without the ingestion bookkeeping columns.
prices = spark.table("silver.prices_daily").drop("source", "ingestion_ts")

# Per-ticker, per-day news tone features.
news = spark.table("silver.ticker_news_tone").select(
    "ticker", "date", "weighted_gdelt_tone_avg", "news_count"
)

# Current contents of the output table, or None when it has not been created
# yet. spark.table() can raise for a missing table, which would make the very
# first run fail before the table is ever written, so the load is guarded.
gold = (
    spark.table("gold.ticker_predictor_training_dataset")
    if spark.catalog.tableExists("gold.ticker_predictor_training_dataset")
    else None
)



In [0]:
# Per-ticker window ordered by date; used to look one row back in time.
w = W.partitionBy("ticker").orderBy("date")

close_today = F.col("adj_close")
close_prev = F.col("prev_close")

# Label each row by comparing its adjusted close with the previous row's:
#   higher -> "buy", lower -> "sell", anything else -> "hold".
# "Anything else" covers an unchanged close and also the first row of each
# ticker, where prev_close is NULL and neither comparison evaluates to true.
target_action = (
    F.when(close_today > close_prev, F.lit("buy"))
     .when(close_today < close_prev, F.lit("sell"))
     .otherwise(F.lit("hold"))
)

dataset = (
    prices
    # Left join: every price row is kept even when no news row matches.
    .join(news, on=["ticker", "date"], how="left")
    .withColumn("prev_close", F.lag("adj_close", 1).over(w))
    .withColumn("target_action", target_action)
)

In [0]:
# Report how the freshly built `dataset` differs from what is currently stored
# in the gold table: rows whose (ticker, date) key is not stored yet, and rows
# whose adj_close or weighted_gdelt_tone_avg differs from the stored value.
if spark.catalog.tableExists("gold.ticker_predictor_training_dataset"):
    # Read the stored table inside the existence guard so a missing table can
    # never be touched. `_in_gold` is an explicit "this key is already stored"
    # marker: after the left join it is NULL only for keys absent from gold,
    # independent of whether the stored adj_close itself happens to be NULL.
    existing = spark.table("gold.ticker_predictor_training_dataset").select(
        "ticker",
        "date",
        F.col("adj_close").alias("adj_close_existing"),
        F.col("weighted_gdelt_tone_avg").alias("weighted_gdelt_tone_avg_existing"),
        F.lit(True).alias("_in_gold"),
    )

    joined = dataset.join(existing, ["ticker", "date"], "left")

    in_gold = F.col("_in_gold").isNotNull()

    # Null-safe comparisons: a plain `!=` evaluates to NULL when either side
    # is NULL, so a value appearing or disappearing (common for the tone
    # column, which is NULL on days without news) would be filtered out and
    # never counted as a change.
    adj_close_changed = ~F.col("adj_close").eqNullSafe(F.col("adj_close_existing"))
    tone_changed = ~F.col("weighted_gdelt_tone_avg").eqNullSafe(
        F.col("weighted_gdelt_tone_avg_existing")
    )

    # The marker is a helper only; drop it so the result frames keep the same
    # columns as the joined data without it.
    new_rows = joined.filter(~in_gold).drop("_in_gold")
    modified_rows = joined.filter(
        in_gold & (adj_close_changed | tone_changed)
    ).drop("_in_gold")

    print(f"New records: {new_rows.count()}")
    print(f"Updated records: {modified_rows.count()}")
else:
    # No stored table yet: every row of the dataset is new.
    print(f"New records: {dataset.count()}")
    print("Updated records: 0")


In [0]:
# prev_close was only needed to derive target_action; it is not persisted.
dataset = dataset.drop("prev_close")

# Replace the stored table with the rebuilt dataset. overwriteSchema lets the
# stored schema be replaced by the dataset's schema as part of the overwrite.
(
    dataset.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gold.ticker_predictor_training_dataset")
)