## Read in Data

This step can probably be removed once the data is pulled directly from the source.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    window, first, last, max as Fmax, min as Fmin,
    sum as Fsum, avg as Favg, stddev, lag, col,
    split, explode, count as Fcount
)
from pyspark.sql.window import Window as W
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.stat import Summarizer

# 1) Spark session: get or create one named "WindowedAggregation"
spark = (
    SparkSession.builder
    .appName("WindowedAggregation")
    .getOrCreate()
)

# 2) Raw SPY snapshot (already batched pull); schema is inferred and the
#    timestamp column is then explicitly cast to a timestamp type.
stockDF = (
    spark.read
    .csv("/mnt/project/spy_snapshot.csv", header=True, inferSchema=True)
    .withColumn("timestamp", col("timestamp").cast("timestamp"))
)

# 3) Raw GDELT news (already batched pull); derive `timestamp` from V2_DATE
#    and keep only the columns used by the later cells.
# NOTE(review): V2_DATE is cast directly to timestamp. If inferSchema reads it
# as a numeric yyyyMMddHHmmss value, the cast would interpret it as epoch
# seconds — confirm the date format of the CSV.
newsDF = (
    spark.read
    .csv("/mnt/project/gdelt_news.csv", header=True, inferSchema=True)
    .withColumn("timestamp", col("V2_DATE").cast("timestamp"))
    .select(
        "timestamp",
        col("V1_5_TONE").alias("Tone"),
        "Positive",
        "Negative",
        "Polarity",
        "ActivityRefDensity",
        "SelfGroupDensity",
        "WordCount",
        "GKGRECORDID",
        "V2_ENHANCED_THEMES",
    )
)

## Create Features

In [None]:

# 4) Aggregate SPY into 15-minute windows
# `struct` is needed to select open/close by timestamp; it is imported here so
# this cell does not depend on an edit to the import cell.
from pyspark.sql.functions import struct

stockAgg = stockDF.groupBy(window("timestamp","15 minutes").alias("w")) \
    .agg(
      # first()/last() return whatever row happens to come first/last, and row
      # order is not guaranteed after the groupBy shuffle. min/max over a
      # (timestamp, value) struct compares by timestamp first, so this picks
      # the chronologically earliest open and the latest close deterministically.
      Fmin(struct(col("timestamp"), col("open"))).getField("open").alias("open"),
      Fmax("high").alias("high"),
      Fmin("low").alias("low"),
      Fmax(struct(col("timestamp"), col("close"))).getField("close").alias("close"),
      Fsum("volume").alias("volume"),
      Fsum("trade_count").alias("trade_count"),
      # NOTE(review): plain mean of the per-row vwap values, not re-weighted by
      # volume — confirm this is the intended window-level vwap.
      Favg("vwap").alias("vwap"),
      # Sample standard deviation of the per-row (high + low) / 2 mid price.
      stddev(((col("high")+col("low"))/2)).alias("volatility")
    )

# 5) Compute log-return vs previous window
# Column has no .log() method: attribute access on a Column is treated as a
# field lookup, so `(...).log()` raises "'Column' object is not callable".
# Use the SQL log function instead, aliased like the other F* helpers.
from pyspark.sql.functions import log as Flog

# Un-partitioned window ordered by window start: lag() sees the previous
# 15-minute bar. (Spark pulls all rows into one partition for this.)
winSpec = W.orderBy("w.start")
stockFeat = stockAgg \
    .withColumn("prev_close", lag("close").over(winSpec)) \
    .withColumn("log_return", Flog(col("close") / col("prev_close"))) \
    .na.fill({"log_return": 0.0}) \
    .drop("prev_close")
# The first window has no previous close, so its log_return is null and is
# replaced by 0.0 by the na.fill above.

# 6) Numeric news features per 15-minute window:
#    article count, mean tone, summed positive/negative scores, mean polarity.
newsFeat = (
    newsDF
    .groupBy(window("timestamp", "15 minutes").alias("w"))
    .agg(
        Fcount(col("GKGRECORDID")).alias("article_count"),
        Favg(col("Tone")).alias("avg_tone"),
        Fsum(col("Positive")).alias("sum_pos"),
        Fsum(col("Negative")).alias("sum_neg"),
        Favg(col("Polarity")).alias("avg_pol"),
    )
)

# 6b) News: numeric aggregates per window
# This was a verbatim copy of the newsFeat aggregation defined just above
# (same grouping, same aggregates, same aliases). DataFrames are immutable, so
# aliasing the existing one is equivalent and keeps the two names from drifting
# apart. The name is kept in case later cells reference it.
numericNews = newsFeat

Additional Themes feature

In [None]:
# from pyspark.sql.window import Window as W
# from pyspark.ml.feature import HashingTF, IDF

# # 6) Prepare themes for TF–IDF
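# # explode the semicolon-delimited themes column into tokens
# # NOTE: newsDF only keeps V2_ENHANCED_THEMES, so the V1_THEMES reference below must be
# # updated (or V1_THEMES added to the select) and collect_list imported before enabling this cell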
# themesTokens = newsDF \
#   .withColumn("theme", explode(split(col("V1_THEMES"), ";"))) \
#   .withColumn("theme", col("theme")) \
#   .groupBy("GKGRECORDID","timestamp") \
#   .agg(collect_list("theme").alias("themes") )

# # 7) TF–IDF on themes
# htf = HashingTF(inputCol="themes", outputCol="tf", numFeatures=256)
# idf = IDF(inputCol="tf", outputCol="tfidf")
# tf = htf.transform(themesTokens)
# themesVec = idf.fit(tf).transform(tf)

# # 8) Aggregate theme vectors per window (mean TF–IDF)
# # NOTE: avg() only accepts numeric columns, not ML Vector columns; before enabling this cell,
# # replace the Favg call below with Summarizer.mean(col("tfidf")) from pyspark.ml.stat
# themeNews = themesVec.groupBy(window("timestamp","15 minutes").alias("w")) \
#     .agg( Favg("tfidf").alias("avg_tfidf") )

First Features

In [None]:
# 7) Left-join the news features onto the stock features by window, so every
#    stock window is kept; windows without news get zero-valued news features.
joinedFeat = (
    stockFeat
    .join(newsFeat, on="w", how="left")
    .fillna({
        "article_count": 0,
        "avg_tone": 0.0,
        "sum_pos": 0.0,
        "sum_neg": 0.0,
        "avg_pol": 0.0,
        # "avg_tfidf": Vectors.sparse(256, [])   # only if the TF-IDF feature is enabled
    })
)

# 8) Preview the final feature table (window bounds first, then all features)
joinedFeat.select(
    "w.start", "w.end",
    "open", "high", "low", "close",
    "volume", "trade_count",
    "vwap", "volatility", "log_return",
    "article_count", "avg_tone", "sum_pos", "sum_neg", "avg_pol",
    # "avg_tfidf",   # only if the TF-IDF feature is enabled
).show(truncate=False)

## Prepare for pipeline

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# 1) Names of every column that goes into the feature vector:
#    stock-derived columns first, then the news-derived ones.
all_feature_cols = [
    "open", "high", "low", "close",
    "volume", "trade_count", "vwap",
    "volatility", "log_return",
] + [
    "article_count", "avg_tone", "sum_pos", "sum_neg", "avg_pol",
]

# Enable when the TF-IDF theme feature is used:
# all_feature_cols = all_feature_cols + ["avg_tfidf"]

# 2) Assemble the individual columns into a single "raw_features" vector
assembler = VectorAssembler(
    outputCol="raw_features",
    inputCols=all_feature_cols,
)

# 3) Standardize "raw_features" into "features" (centering and unit-std scaling both on)
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="raw_features",
    outputCol="features",
)

## Next Steps:
-  Decide on model
    - Binary classification: up or down
    - Regression

- Create graphs and metrics for evaluation and visualization

Optional:
- Create feature selection


