In [None]:
# Only if using Jupyter + PySpark: pass the S3A connector (hadoop-aws) and the
# AWS SDK bundle to spark-submit as --packages through an environment variable.
import os

_s3a_packages = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.3.6",
        "com.amazonaws:aws-java-sdk-bundle:1.12.374",
    ]
)
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages " + _s3a_packages + " pyspark-shell"


In [1]:
import findspark

# Point findspark at the Spark installation located under /opt/spark.
findspark.init(spark_home="/opt/spark")



In [2]:
import os

from pyspark.sql import SparkSession

# Stop any old session first so the builder below does not hand back a session
# that still carries a previous configuration.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been defined yet, nothing to stop.
    pass
except Exception as exc:
    # Best effort only: report the problem instead of silently hiding it, but
    # still go on to create a new session.
    print(f"Ignoring error while stopping previous Spark session: {exc!r}")

# MinIO credentials come from the environment when available. The fallbacks
# are the values this notebook previously hard-coded, so existing local setups
# keep working. Do not commit real credentials here.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio123")

spark = (
    SparkSession.builder
    .appName("MinioTest")
    .config("spark.master", "spark://spark-master:7077")

    # S3A core config
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")  # <-- important for HTTP MinIO

    # Credentials providers (no AWS v2 class)
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,"
        "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider"
    )

    # Multipart cleanup: override "24h" etc with numeric ms
    .config("spark.hadoop.fs.s3a.multipart.purge.age", "86400000")      # 24h
    .config("spark.hadoop.fs.s3a.multipart.purge.interval", "3600000")  # 1h

    # --- S3A timeouts (ms) ---
    .config("spark.hadoop.fs.s3a.connection.timeout", "60000")
    .config("spark.hadoop.fs.s3a.connection.request.timeout", "60000")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "60000")
    .config("spark.hadoop.fs.s3a.connection.acquisition.timeout", "60000")
    .config("spark.hadoop.fs.s3a.connection.idle.time", "60000")
    .config("spark.hadoop.fs.s3a.connection.ttl", "300000")

    # --- S3A thread pool / connection pool (ALL numeric) ---
    .config("spark.hadoop.fs.s3a.threads.max", "96")
    .config("spark.hadoop.fs.s3a.threads.keepalivetime", "60000")  # <--- key that defaulted to "60s"
    .config("spark.hadoop.fs.s3a.connection.maximum", "200")
    .config("spark.hadoop.fs.s3a.max.total.tasks", "1000")
    .config("spark.hadoop.fs.s3a.max.metadata.tasks", "100")
    .config("spark.hadoop.fs.s3a.max.total.connections", "200")

    # --- Spark-level timeouts as numbers ---
    .config("spark.files.fetchTimeout", "60000")
    .config("spark.network.timeout", "600000")
    .config("spark.rpc.askTimeout", "600000")

    .getOrCreate()
)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/12 02:30:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql import functions as F

# Load everything stored under the COPN.SW prefix.
df = spark.read.parquet("s3a://market-raw/COPN.SW/")

# Show the distinct (year, week) pairs, sorted by year and then week.
year_week_pairs = (
    df.select("year", "week")
    .distinct()
    .orderBy("year", "week")
)
year_week_pairs.show(100, truncate=False)


25/12/12 02:30:15 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties

+----+----+
|year|week|
+----+----+
|2020|51  |
|2020|52  |
|2020|53  |
|2021|1   |
|2021|2   |
|2021|3   |
|2021|4   |
|2021|5   |
|2021|6   |
|2021|7   |
|2021|8   |
|2021|9   |
|2021|10  |
|2021|11  |
|2021|12  |
|2021|13  |
|2021|14  |
|2021|15  |
|2021|16  |
|2021|17  |
|2021|18  |
|2021|19  |
|2021|20  |
|2021|21  |
|2021|22  |
|2021|23  |
|2021|24  |
|2021|25  |
|2021|26  |
|2021|27  |
|2021|28  |
|2021|29  |
|2021|30  |
|2021|31  |
|2021|32  |
|2021|33  |
|2021|34  |
|2021|35  |
|2021|36  |
|2021|37  |
|2021|38  |
|2021|39  |
|2021|40  |
|2021|41  |
|2021|42  |
|2021|43  |
|2021|44  |
|2021|45  |
|2021|46  |
|2021|47  |
|2021|48  |
|2021|49  |
|2021|50  |
|2021|51  |
|2021|52  |
|2022|1   |
|2022|2   |
|2022|3   |
|2022|4   |
|2022|5   |
|2022|6   |
|2022|7   |
|2022|8   |
|2022|9   |
|2022|10  |
|2022|11  |
|2022|12  |
|2022|13  |
|2022|14  |
|2022|15  |
|2022|16  |
|2022|17  |
|2022|18  |
|2022|19  |
|2022|20  |
|2022|21  |
|2022|22  |
|2022|23  |
|2022|24  |
|2022|25  |
|202

                                                                                

In [4]:
# 1) Load every Parquet file under COPN.SW (all years, all weeks) into ONE DataFrame.
#    The `year` and `week` columns are discovered by Spark from the folder
#    names (year=YYYY/week=WW).
prices_all = spark.read.parquet("s3a://market-raw/COPN.SW/")

print("Schema:")
prices_all.printSchema()

total_rows = prices_all.count()
print("Row count:", total_rows)
prices_all.show(20, truncate=False)

# 2) (Optional) year filter inside the same DataFrame.
#    NOTE: despite the variable name `prices_24_25`, the list below keeps the
#    years 2020 through 2025, not only 2024-2025.
kept_years = [2020, 2021, 2022, 2023, 2024, 2025]
prices_24_25 = prices_all.filter(F.col("year").isin(kept_years))

Schema:
root
 |-- ('COPN.SW', 'Open'): double (nullable = true)
 |-- ('COPN.SW', 'High'): double (nullable = true)
 |-- ('COPN.SW', 'Low'): double (nullable = true)
 |-- ('COPN.SW', 'Close'): double (nullable = true)
 |-- ('COPN.SW', 'Adj Close'): double (nullable = true)
 |-- ('COPN.SW', 'Volume'): long (nullable = true)
 |-- Date: timestamp_ntz (nullable = true)
 |-- year: integer (nullable = true)
 |-- week: integer (nullable = true)



                                                                                

Row count: 1257
+-------------------+-------------------+------------------+--------------------+------------------------+---------------------+-------------------+----+----+
|('COPN.SW', 'Open')|('COPN.SW', 'High')|('COPN.SW', 'Low')|('COPN.SW', 'Close')|('COPN.SW', 'Adj Close')|('COPN.SW', 'Volume')|Date               |year|week|
+-------------------+-------------------+------------------+--------------------+------------------------+---------------------+-------------------+----+----+
|48.5               |49.29999923706055  |47.150001525878906|47.20000076293945   |45.45343017578125       |55446                |2025-04-07 00:00:00|2025|15  |
|51.900001525878906 |51.900001525878906 |48.25             |48.70000076293945   |46.89792251586914       |24853                |2025-04-08 00:00:00|2025|15  |
|47.45000076293945  |47.45000076293945  |44.95000076293945 |45.150001525878906  |43.479286193847656      |21373                |2025-04-09 00:00:00|2025|15  |
|50.5               |50.5     

In [5]:
# Shape of the year-filtered frame: row count first, then column count.
n_rows, n_cols = prices_24_25.count(), len(prices_24_25.columns)

print(f"rows = {n_rows}, columns = {n_cols}")




rows = 1257, columns = 9


                                                                                

In [6]:
# Read a single weekly partition directly from its leaf folder and report its
# shape (row count, column count).
sample_path = "s3a://market-raw/COPN.SW/year=2025/week=15/"
prices_sample = spark.read.parquet(sample_path)

n_rows, n_cols = prices_sample.count(), len(prices_sample.columns)

print(f"rows = {n_rows}, columns = {n_cols}")

rows = 5, columns = 7


In [7]:
from pyspark.sql import Window
import pyspark.sql.functions as F

# Start from the year-filtered prices DataFrame
df = prices_24_25

# 1) Flatten the yfinance-style column names, e.g. "('COPN.SW', 'Open')" -> "Open"
rename_map = {
    "('COPN.SW', 'Open')"     : "Open",
    "('COPN.SW', 'High')"     : "High",
    "('COPN.SW', 'Low')"      : "Low",
    "('COPN.SW', 'Close')"    : "Close",
    "('COPN.SW', 'Adj Close')": "AdjClose",
    "('COPN.SW', 'Volume')"   : "Volume",
}

for old, new in rename_map.items():
    df = df.withColumnRenamed(old, new)

df.printSchema()
# now we have: Open, High, Low, Close, AdjClose, Volume, Date, year, week

# 2) Define windows. All of them are ordered by Date with no partitioning, so
#    Spark moves every row to a single partition to evaluate them.
w5  = Window.orderBy("Date").rowsBetween(-4, 0)    # last 5 rows (including current)
w10 = Window.orderBy("Date").rowsBetween(-9, 0)    # last 10 rows
w20 = Window.orderBy("Date").rowsBetween(-19, 0)   # last 20 rows
w   = Window.orderBy("Date")                       # for next-day label (lead)

# 3) Build all rolling features + label on a single DataFrame (df_feats).
#    Notes:
#    - The earliest rows see fewer rows than the nominal window length, so
#      their rolling values are computed on a partial window.
#    - next_close / future_ret / label are null on the last row, because
#      lead() has no following row there.
df_feats = (
    df
    # moving averages
    .withColumn("ma5_close",  F.avg("Close").over(w5))
    .withColumn("ma10_close", F.avg("Close").over(w10))
    .withColumn("ma20_close", F.avg("Close").over(w20))
    # rolling volatility (std dev over 5 days)
    .withColumn("vol5_close", F.stddev("Close").over(w5))
    # rolling min/max over 5 days
    .withColumn("min5_close", F.min("Close").over(w5))
    .withColumn("max5_close", F.max("Close").over(w5))
    # next day's close and future return
    .withColumn("next_close", F.lead("Close", 1).over(w))
    .withColumn(
        "future_ret",
        (F.col("next_close") - F.col("Close")) / F.col("Close")
    )
    # binary label: 1 if next-day return > 0, else 0
    .withColumn("label", (F.col("future_ret") > 0).cast("int"))
    # Persist the result: df_feats feeds many later actions (counts, summary,
    # correlations, model fit/evaluation). Without caching, each of them
    # re-reads the Parquet files and re-runs every window above.
    .cache()
)

# 4) Inspect features + label
df_feats.select(
    "Date", "Close",
    "ma5_close", "ma10_close", "ma20_close",
    "vol5_close", "min5_close", "max5_close",
    "future_ret", "label"
).show(10, truncate=False)


root
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- AdjClose: double (nullable = true)
 |-- Volume: long (nullable = true)
 |-- Date: timestamp_ntz (nullable = true)
 |-- year: integer (nullable = true)
 |-- week: integer (nullable = true)



25/12/12 02:30:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+-----------------+-----------------+-----------------+-----------------+------------------+----------------+-----------------+---------------------+-----+
|Date               |Close            |ma5_close        |ma10_close       |ma20_close       |vol5_close        |min5_close      |max5_close       |future_ret           |label|
+-------------------+-----------------+-----------------+-----------------+-----------------+------------------+----------------+-----------------+---------------------+-----+
|2020-12-14 00:00:00|81.0999984741211 |81.0999984741211 |81.0999984741211 |81.0999984741211 |NULL              |81.0999984741211|81.0999984741211 |0.017262657857208777 |1    |
|2020-12-15 00:00:00|82.5             |81.79999923706055|81.79999923706055|81.79999923706055|0.9899505726204885|81.0999984741211|82.5             |-0.03636363636363636 |0    |
|2020-12-16 00:00:00|79.5             |81.03333282470703|81.03333282470703|81.03333282470703|1.5011106660099545|79.5    

                                                                                

In [8]:
# Shape of the feature frame: row count first, then column count.
n_rows, n_cols = df_feats.count(), len(df_feats.columns)

print(f"rows = {n_rows}, columns = {n_cols}")

rows = 1257, columns = 18


                                                                                

In [9]:
from pyspark.sql.functions import col

# Chronological hold-out: rows dated before 2025-01-01 go to train, the rest
# go to test.
# NOTE(review): `label` is derived from the next row's close, so the last
# training row's label depends on the first test-period close. Confirm this
# is acceptable.
df = df_feats.orderBy("Date")

in_train_period = col("Date") < "2025-01-01"
in_test_period = col("Date") >= "2025-01-01"

train_df = df.filter(in_train_period)
test_df = df.filter(in_test_period)

train_rows = train_df.count()
test_rows = test_df.count()
print(f"train = {train_rows}, test = {test_rows}")


                                                                                

train = 1018, test = 239


In [10]:
# Model input columns for the 1-day-ahead classifier.
feature_cols = [
    "Close",
    "ma5_close",
    "ma10_close",
    "ma20_close",
    "vol5_close",
    "min5_close",
    "max5_close",
]

# One output column per checked input column, holding its number of nulls.
checked_cols = feature_cols + ["label"]
null_count_exprs = []
for c in checked_cols:
    is_missing = F.when(F.col(c).isNull(), 1)
    null_count_exprs.append(F.count(is_missing).alias(c))

df_feats.select(null_count_exprs).show(truncate=False)


25/12/12 02:30:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+---------+----------+----------+----------+----------+----------+-----+
|Close|ma5_close|ma10_close|ma20_close|vol5_close|min5_close|max5_close|label|
+-----+---------+----------+----------+----------+----------+----------+-----+
|0    |0        |0         |0         |1         |0         |0         |1    |
+-----+---------+----------+----------+----------+----------+----------+-----+



25/12/12 02:30:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [16]:
# Columns and types of the feature frame
df_feats.printSchema()

# Total number of rows
total_feature_rows = df_feats.count()
print("Number of rows:", total_feature_rows)

# Numeric columns to summarise (features plus the 1-day future return)
num_cols = [
    "Close",
    "ma5_close",
    "ma10_close",
    "ma20_close",
    "vol5_close",
    "min5_close",
    "max5_close",
    "future_ret",
]

# count, mean, stddev, min, quartiles, max
numeric_summary = df_feats.select(num_cols).summary()
numeric_summary.show(truncate=False)


root
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- AdjClose: double (nullable = true)
 |-- Volume: long (nullable = true)
 |-- Date: timestamp_ntz (nullable = true)
 |-- year: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- ma5_close: double (nullable = true)
 |-- ma10_close: double (nullable = true)
 |-- ma20_close: double (nullable = true)
 |-- vol5_close: double (nullable = true)
 |-- min5_close: double (nullable = true)
 |-- max5_close: double (nullable = true)
 |-- next_close: double (nullable = true)
 |-- future_ret: double (nullable = true)
 |-- label: integer (nullable = true)



                                                                                

Number of rows: 1257


25/12/12 02:34:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:34:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:34:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:34:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+----------------------+
|summary|Close             |ma5_close         |ma10_close        |ma20_close        |vol5_close         |min5_close        |max5_close        |future_ret            |
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+----------------------+
|count  |1257              |1257              |1257              |1257              |1256               |1257              |1257              |1256                  |
|mean   |63.3947891805887  |63.36897241930788 |63.35255082429038 |63.393793962978485|1.045956400429677  |62.09351631109925 |64.63814647252728 |3.577211119074127E-4  |
|stddev |13.66947663469863 |13.562062882507979|13.463244685934255|13.399101580100554|0.9328328131486354 |13.458749286685036|13.745919477201925|0.021407539032915646  

                                                                                

In [17]:
# Pearson correlation of each numeric feature with future_ret.
# All correlations are computed in ONE aggregation rather than one Spark job
# per column, so the feature pipeline behind df_feats runs once instead of
# once per feature. F.corr skips rows where either input is null.
corr_inputs = [c for c in num_cols if c != "future_ret"]

if corr_inputs:
    corr_row = df_feats.select(
        [F.corr(c, "future_ret").alias(c) for c in corr_inputs]
    ).first()

    for c in corr_inputs:
        corr_val = corr_row[c]
        print(f"corr({c}, future_ret) = {corr_val}")


25/12/12 02:48:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


corr(Close, future_ret) = -0.037515649731432545


25/12/12 02:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


corr(ma5_close, future_ret) = -0.0466288124408748


25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


corr(ma10_close, future_ret) = -0.048768288354962476


25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


corr(ma20_close, future_ret) = -0.050457572347377834


25/12/12 02:48:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


corr(vol5_close, future_ret) = 0.06961873797547372


25/12/12 02:48:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


corr(min5_close, future_ret) = -0.05016241417789007
corr(max5_close, future_ret) = -0.0393716294638853


25/12/12 02:48:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:48:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Columns whose pairwise Pearson correlations we want.
corr_cols = [
    "Close",
    "ma5_close",
    "ma10_close",
    "ma20_close",
    "vol5_close",
    "min5_close",
    "max5_close",
]

# Drop rows with a null in any of these columns, then pack them into one
# vector column, which is the input format Correlation.corr takes.
corr_input = df_feats.select(corr_cols).na.drop()
vec_assembler = VectorAssembler(inputCols=corr_cols, outputCol="corr_features")
df_vec = vec_assembler.transform(corr_input)

# The result is a one-row, one-column DataFrame holding the matrix.
corr_result_row = Correlation.corr(df_vec, "corr_features", method="pearson").collect()[0]
corr_mat = corr_result_row[0]

print("Correlation matrix (flattened):")
print(corr_mat)


25/12/12 02:49:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:49:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:49:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:49:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:49:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 318:>                                                        (0 + 1) / 1]

Correlation matrix (flattened):
DenseMatrix([[1.        , 0.99337246, 0.9831542 , 0.96651084, 0.16719246,
              0.98711688, 0.99386885],
             [0.99337246, 1.        , 0.99500644, 0.98072625, 0.12561678,
              0.99641533, 0.99606784],
             [0.9831542 , 0.99500644, 1.        , 0.9925543 , 0.09777869,
              0.99399923, 0.98879533],
             [0.96651084, 0.98072625, 0.9925543 , 1.        , 0.08352771,
              0.98082775, 0.97345554],
             [0.16719246, 0.12561678, 0.09777869, 0.08352771, 1.        ,
              0.04578403, 0.20912743],
             [0.98711688, 0.99641533, 0.99399923, 0.98082775, 0.04578403,
              1.        , 0.98621874],
             [0.99386885, 0.99606784, 0.98879533, 0.97345554, 0.20912743,
              0.98621874, 1.        ]])


                                                                                

In [11]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Inputs for the 1-day-ahead logistic regression.
feature_cols = [
    "Close",
    "ma5_close",
    "ma10_close",
    "ma20_close",
    "vol5_close",
    "min5_close",
    "max5_close",
]

# Pack the feature columns into a single vector column; rows with nulls in
# any feature column are skipped.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")

# Binary classifier on `label`.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    probabilityCol="probability",
    predictionCol="prediction",
)

lr_stages = [assembler, lr]
pipeline = Pipeline(stages=lr_stages)

# Fit on the training period, score the held-out period.
model = pipeline.fit(train_df)
pred_test = model.transform(test_df)


25/12/12 02:30:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/12/12 02:30:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:30:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance

In [13]:
# Count null labels and null raw predictions in the scored test set.
null_checks = [
    F.count(F.when(col(source_col).isNull(), 1)).alias(out_name)
    for source_col, out_name in (
        ("label", "null_labels"),
        ("rawPrediction", "null_rawPrediction"),
    )
]
pred_test.select(*null_checks).show()


25/12/12 02:32:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----------+------------------+
|null_labels|null_rawPrediction|
+-----------+------------------+
|          1|                 0|
+-----------+------------------+



                                                                                

In [14]:
# Keep only scored rows that have both a label and a raw prediction, so the
# evaluators never see nulls.
has_label_and_score = col("label").isNotNull() & col("rawPrediction").isNotNull()
pred_eval = pred_test.filter(has_label_and_score)

rows_before = pred_test.count()
print("Rows in pred_test :", rows_before)
rows_after = pred_eval.count()
print("Rows in pred_eval:", rows_after)


25/12/12 02:32:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Rows in pred_test : 239




Rows in pred_eval: 238


25/12/12 02:32:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Both binary evaluators read the same label / score columns and differ only
# in the metric. "probability" could be used instead of "rawPrediction".
binary_eval_cols = {"labelCol": "label", "rawPredictionCol": "rawPrediction"}

binary_eval = BinaryClassificationEvaluator(metricName="areaUnderROC", **binary_eval_cols)
binary_eval_pr = BinaryClassificationEvaluator(metricName="areaUnderPR", **binary_eval_cols)
multi_eval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Evaluate in the same order as the metrics are reported below.
auc_roc = binary_eval.evaluate(pred_eval)
auc_pr = binary_eval_pr.evaluate(pred_eval)
accuracy = multi_eval.evaluate(pred_eval)

for metric_line in (
    f"Test AUC-ROC: {auc_roc:.4f}",
    f"Test AUC-PR : {auc_pr:.4f}",
    f"Test Accuracy: {accuracy:.4f}",
):
    print(metric_line)



25/12/12 02:32:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 0

Test AUC-ROC: 0.4798
Test AUC-PR : 0.4758
Test Accuracy: 0.4916


25/12/12 02:32:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:32:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Trying a more complex, non-linear model.

In [19]:
# df_feats already carries Date, Close and the rolling features; add a
# 5-row-ahead target on top of it.
w_ord = Window.orderBy("Date")

close_in_5 = F.lead("Close", 5).over(w_ord)
ret_over_5 = (F.col("close_t_plus_5") - F.col("Close")) / F.col("Close")
went_up_5 = (F.col("future_ret_5d") > 0).cast("int")

df_h5 = (
    df_feats
    .withColumn("close_t_plus_5", close_in_5)
    .withColumn("future_ret_5d", ret_over_5)
    .withColumn("label_5d", went_up_5)
    # keep only rows where the horizon is defined (no null close_t_plus_5)
    .na.drop(subset=["future_ret_5d", "label_5d"])
)


In [20]:
# Backward-looking return and momentum features, all ordered by Date.
w_ret = Window.orderBy("Date")

close_1_ago = F.lag("Close", 1).over(w_ret)
close_5_ago = F.lag("Close", 5).over(w_ret)

df_h5 = (
    df_h5
    # return over the previous row / previous 5 rows
    .withColumn("ret_1d", (F.col("Close") / close_1_ago - 1))
    .withColumn("ret_5d_past", (F.col("Close") / close_5_ago - 1))
    # ratio of the current close to its 5- and 20-row moving averages
    .withColumn("price_above_ma5", F.col("Close") / F.col("ma5_close"))
    .withColumn("price_above_ma20", F.col("Close") / F.col("ma20_close"))
)

feature_cols = [
    "ret_1d",
    "ret_5d_past",
    "price_above_ma5",
    "price_above_ma20",
    "vol5_close",
]

# Modeling DataFrame: Date + features + label, without any row that has a
# null in a feature or in label_5d.
required_cols = feature_cols + ["label_5d"]
df_model_5d = (
    df_h5
    .select(["Date"] + required_cols)
    .na.drop(subset=required_cols)
)


In [21]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Time-based split: rows dated before split_date are used for fitting, the rest
# are held out for evaluation.
# NOTE(review): label_5d is built from the close 5 rows ahead, so the last few
# training rows before split_date take their label from prices inside the test
# period. Consider leaving an embargo gap if this matters for the comparison.
split_date = "2024-01-01"

# cache() both splits: every fit/transform/evaluate action on them would
# otherwise recompute the upstream un-partitioned window operations from scratch.
train_df = df_model_5d.filter(F.col("Date") < split_date).cache()
test_df = df_model_5d.filter(F.col("Date") >= split_date).cache()

# Packs the feature columns into a single vector column; rows containing
# invalid values (null/NaN) are skipped instead of raising.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)

# Random forest on the 5-day label; seed fixed for reproducibility.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label_5d",
    predictionCol="prediction",
    probabilityCol="probability",
    numTrees=200,
    maxDepth=6,
    seed=42
)

pipeline_rf = Pipeline(stages=[assembler, rf])

model_rf = pipeline_rf.fit(train_df)
pred_test_rf = model_rf.transform(test_df)

# Drop rows lacking a label or probability, and cache the result: each
# evaluate() call below is a separate Spark action over this DataFrame, so
# without caching the predictions would be recomputed three times.
pred_eval_rf = pred_test_rf.filter(
    F.col("label_5d").isNotNull() & F.col("probability").isNotNull()
).cache()

# Evaluators (kept as separate named objects so they can be reused on other
# prediction DataFrames).
bin_eval = BinaryClassificationEvaluator(
    labelCol="label_5d",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)
auc_roc_rf = bin_eval.evaluate(pred_eval_rf)

bin_eval_pr = BinaryClassificationEvaluator(
    labelCol="label_5d",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR"
)
auc_pr_rf = bin_eval_pr.evaluate(pred_eval_rf)

multi_eval = MulticlassClassificationEvaluator(
    labelCol="label_5d",
    predictionCol="prediction",
    metricName="accuracy"
)
acc_rf = multi_eval.evaluate(pred_eval_rf)

print(f"RF 5d AUC-ROC: {auc_roc_rf:.4f}")
print(f"RF 5d AUC-PR : {auc_pr_rf:.4f}")
print(f"RF 5d Accuracy: {acc_rf:.4f}")


25/12/12 02:53:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:53:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:53:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:53:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:53:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:53:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 0

RF 5d AUC-ROC: 0.5173
RF 5d AUC-PR : 0.5453
RF 5d Accuracy: 0.5187


                                                                                

In [22]:
# Gradient-boosted trees on the same 5-day label, reusing the assembler,
# train/test split and evaluators defined for the random-forest run.
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label_5d",
    predictionCol="prediction",
    maxDepth=5,
    maxIter=100,
    stepSize=0.1,
    seed=42
)

pipeline_gbt = Pipeline(stages=[assembler, gbt])

model_gbt = pipeline_gbt.fit(train_df)
pred_test_gbt = model_gbt.transform(test_df)

# Drop rows lacking a label or probability, and cache the result: the three
# evaluate() calls below are separate Spark actions, so without caching the
# predictions (and everything upstream of them) would be recomputed each time.
pred_eval_gbt = pred_test_gbt.filter(
    F.col("label_5d").isNotNull() & F.col("probability").isNotNull()
).cache()

auc_roc_gbt = bin_eval.evaluate(pred_eval_gbt)
auc_pr_gbt  = bin_eval_pr.evaluate(pred_eval_gbt)
acc_gbt     = multi_eval.evaluate(pred_eval_gbt)

print(f"GBT 5d AUC-ROC: {auc_roc_gbt:.4f}")
print(f"GBT 5d AUC-PR : {auc_pr_gbt:.4f}")
print(f"GBT 5d Accuracy: {acc_gbt:.4f}")


25/12/12 02:54:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:54:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:54:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:54:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:54:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 02:54:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/12 0

GBT 5d AUC-ROC: 0.4810
GBT 5d AUC-PR : 0.5025
GBT 5d Accuracy: 0.4793


                                                                                