In [0]:
from pyspark.sql import functions as F, Window as W

# --- Configuration -----------------------------------------------------------
# The catalog to work in is a notebook widget (default "ptd_dev"), so the same
# notebook can be pointed at a different catalog without editing the code.
# NOTE(review): the widget value is interpolated into the SQL statement
# unquoted; confirm it is only ever set from trusted job configuration.
dbutils.widgets.text("catalog", "ptd_dev")
catalog = dbutils.widgets.get("catalog")
spark.sql(f"USE CATALOG {catalog}")

# --- Source tables -----------------------------------------------------------
# Only the columns used by the later cells are selected.
PRICE_COLUMNS = ["ticker", "date", "adj_close"]
NEWS_COLUMNS = ["ticker", "date", "weighted_gdelt_tone_avg", "news_count"]

prices = spark.table("silver.prices_daily").select(*PRICE_COLUMNS)
news = spark.table("silver.ticker_news_tone").select(*NEWS_COLUMNS)

In [0]:
# Most recent date with a price row, per ticker.
last_price_date = prices.groupBy("ticker").agg(F.max("date").alias("last_date"))

# Keep only news for tickers that appear in `prices`, dated no later than that
# ticker's last price date. The helper column is dropped again afterwards.
news = (news
    .join(last_price_date, on="ticker", how="inner")
    .filter(F.col("date") <= F.col("last_date"))
    .drop("last_date")
)

# Per-ticker window in date order, used to look one row ahead.
w = W.partitionBy("ticker").orderBy("date")

# Label each row by comparing its price with the next row's price.
#   next > current  -> "buy"
#   next < current  -> "sell"
#   next == current -> "hold"
# The last row of every ticker has no following row, so `adj_close_next` is
# null there. "hold" is matched explicitly and there is deliberately no
# `otherwise(...)`: when either price is null none of the comparisons is true
# and `target_action` stays null, instead of being silently labelled "hold".
# Consumers that train on this column should drop rows where it is null.
prices = (prices
    .withColumn("adj_close_next", F.lead("adj_close", 1).over(w))
    .withColumn("target_action",
        F.when(F.col("adj_close_next") > F.col("adj_close"), F.lit("buy"))
         .when(F.col("adj_close_next") < F.col("adj_close"), F.lit("sell"))
         .when(F.col("adj_close_next") == F.col("adj_close"), F.lit("hold"))
    )
)

In [0]:
# Attach the news columns to the price rows. The join is a left join from
# `prices`, so every price row is kept and the news columns are null where no
# news row shares the same (ticker, date).
join_keys = ["ticker", "date"]
has_both_keys = F.col("ticker").isNotNull() & F.col("date").isNotNull()

dataset = (
    prices
    .join(news, on=join_keys, how="left")
    .filter(has_both_keys)  # discard rows missing either join key
)

# Replace the gold table in full, allowing its schema to change with this write.
(
    dataset.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gold.ticker_predictor_training_dataset")
)