# In [0]:
from pyspark.sql import functions as F, Window as W

# Load daily prices and per-ticker news tone, keeping only the columns used below.
price_columns = ["ticker", "date", "adj_close"]
news_columns = ["ticker", "date", "weighted_gdelt_tone_avg", "news_count"]

prices = spark.table("silver.prices_daily").select(*price_columns)
news = spark.table("silver.ticker_news_tone").select(*news_columns)

# Build the target variable: compare each row's adjusted close with the
# adjusted close of the next row (ordered by date) of the same ticker.
#   next > current  -> "comprar"  (buy)
#   next < current  -> "vender"   (sell)
#   next == current -> "mantener" (hold)
w = W.partitionBy("ticker").orderBy("date")
prices = (prices
    .withColumn("adj_close_next", F.lead("adj_close", 1).over(w))
    # lead() yields NULL on the last row of every ticker, and a comparison
    # against NULL is NULL, so such rows would fall through to "mantener".
    # Rows where either price is missing have no defined label: drop them
    # instead of mislabelling them.
    .filter(F.col("adj_close").isNotNull() & F.col("adj_close_next").isNotNull())
    .withColumn("target_action",
        F.when(F.col("adj_close_next") > F.col("adj_close"), F.lit("comprar"))
         .when(F.col("adj_close_next") < F.col("adj_close"), F.lit("vender"))
         .otherwise(F.lit("mantener"))
    )
    # The next-day price fully determines the label; keeping it in the
    # training dataset would leak the target into the features.
    .drop("adj_close_next")
)

# Attach the news features to each price row. The left join keeps price rows
# that have no matching news; their news columns are NULL at this point.
join_keys = ["ticker", "date"]
dataset = prices.join(news, join_keys, "left")

# Impute per-ticker means for days without news.
# Both means come from a single aggregation so that `news` is grouped once
# and the result is joined once, instead of two groupBy/join passes on the
# same key.
ticker_means = news.groupBy("ticker").agg(
    F.mean("weighted_gdelt_tone_avg").alias("mean_tone"),
    F.mean("news_count").alias("mean_count"),
)

# NOTE: a ticker with no rows at all in `news` has no mean to fall back on,
# so its news columns remain NULL after this step.
dataset = (dataset
    .join(ticker_means, on="ticker", how="left")
    .withColumn("weighted_gdelt_tone_avg", F.coalesce("weighted_gdelt_tone_avg", "mean_tone"))
    .withColumn("news_count", F.coalesce("news_count", "mean_count"))
    .drop("mean_tone", "mean_count")
)

# Persist the training dataset, replacing both the data and the schema of any
# existing table.
# Intended output columns: ticker, date, adj_close, target_action,
# weighted_gdelt_tone_avg, news_count
(dataset.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gold.ticker_predictor_training_dataset"))