In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, regexp_extract

# Fallback JVM / Spark locations. setdefault() leaves any value that is
# already present in the environment untouched.
os.environ.setdefault("JAVA_HOME", "/usr/lib/jvm/java-11-openjdk-amd64")
os.environ.setdefault("SPARK_HOME", "/opt/spark")

# 1) Start Spark
# NOTE(review): with master("local[*]") the work presumably runs inside the
# driver JVM, in which case spark.executor.memory has no effect -- confirm.
spark = (
    SparkSession.builder
        .appName("CryptoPreprocess")
        .master("local[*]")
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "8g")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .getOrCreate()
)

base_dir = "archive"

# Recursively collect every parquet file below base_dir. The extension is
# matched case-insensitively. A comprehension is used so the walk does not
# rebind `_` (IPython's "last output" variable) in the notebook namespace, and
# the result is sorted so Spark receives the same path order on every run.
parquet_files = sorted(
    os.path.join(root, fn)
    for root, _dirs, files in os.walk(base_dir)
    for fn in files
    if fn.lower().endswith(".parquet")
)

# os.walk() silently yields nothing for a missing directory, so an empty list
# means either "wrong base_dir" or "no data". Fail here with a clear message
# rather than letting the read below fail inside Spark with a far less
# obvious error.
if not parquet_files:
    raise FileNotFoundError(
        f"No .parquet files found under {os.path.abspath(base_dir)!r}"
    )

# 2) Read just those parquet paths & extract symbol
raw = spark.read.parquet(*parquet_files)

# The symbol is the file's base name without the extension. (?i) makes the
# extension match case-insensitive, mirroring the discovery filter above;
# without it a file such as "BTC-USDT.PARQUET" would be loaded but get an
# empty symbol, because regexp_extract returns "" when nothing matches.
raw = raw.withColumn(
    "symbol",
    regexp_extract(input_file_name(), r"(?i)([^/\\]+)\.parquet$", 1)
)

# count() and show() are actions: each one triggers a Spark job.
print("✅ Loaded raw rows:", raw.count())
raw.printSchema()
raw.show(5, truncate=False)


your 131072x1 screen size is bogus. expect trouble
25/05/09 13:28:09 WARN Utils: Your hostname, Sandevistan resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/05/09 13:28:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/09 13:28:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/09 13:28:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

✅ Loaded raw rows: 1507350137
root
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- low: float (nullable = true)
 |-- close: float (nullable = true)
 |-- volume: float (nullable = true)
 |-- quote_asset_volume: float (nullable = true)
 |-- number_of_trades: integer (nullable = true)
 |-- taker_buy_base_asset_volume: float (nullable = true)
 |-- taker_buy_quote_asset_volume: float (nullable = true)
 |-- open_time: timestamp_ntz (nullable = true)
 |-- symbol: string (nullable = false)

+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+----------------------------+-------------------+--------+
|open   |high   |low    |close  |volume  |quote_asset_volume|number_of_trades|taker_buy_base_asset_volume|taker_buy_quote_asset_volume|open_time          |symbol  |
+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+----------------------------+-------------------+

In [None]:
from pyspark.sql.functions import col, to_date

# `when` is only needed for the zero-denominator guard in step 2.3.
from pyspark.sql.functions import when

# 2.1) Convert the existing timestamp to a simple date
enhanced = raw.withColumn("date", to_date(col("open_time")))

# 2.2) Drop the raw timestamp & unused columns, then discard rows that are
#      missing any of the price/volume fields used below
processed = (
    enhanced
    .drop(
        "open_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume"
    )
    .dropna(subset=["open", "high", "low", "close", "volume"])
)

# 2.3) Compute daily_return and volatility
#      daily_return = (close - open) / open
#      volatility   = (high  - low)  / open
# Both ratios divide by `open`. A zero open price raises a divide-by-zero
# error when ANSI SQL mode is enabled and yields null when it is not.
# when() without otherwise() returns null for rows that fail the condition,
# so the guard gives the null result in both modes and one bad row can no
# longer abort the whole job.
# NOTE(review): the sample output shows several rows per (symbol, date), so
# these look like per-row (intraday) values despite the "daily_return" name --
# confirm whether an aggregation by date is intended downstream.
open_is_nonzero = col("open") != 0
processed = (
    processed
    .withColumn(
        "daily_return",
        when(open_is_nonzero, (col("close") - col("open")) / col("open")),
    )
    .withColumn(
        "volatility",
        when(open_is_nonzero, (col("high") - col("low")) / col("open")),
    )
)

# count() is an action and scans the data again.
print("✅ Computed daily_return & volatility — rows:", processed.count())
processed.select("symbol", "date", "daily_return", "volatility") \
         .show(5, truncate=False)


                                                                                

✅ Computed daily_return & volatility — rows: 1507350137
+--------+----------+------------+----------+
|symbol  |date      |daily_return|volatility|
+--------+----------+------------+----------+
|BTC-USDT|2017-08-17|0.0         |0.0       |
|BTC-USDT|2017-08-17|0.0         |0.0       |
|BTC-USDT|2017-08-17|0.0         |0.0       |
|BTC-USDT|2017-08-17|0.0         |0.0       |
|BTC-USDT|2017-08-17|0.0         |0.0       |
+--------+----------+------------+----------+
only showing top 5 rows



In [None]:
# Cell 3: Repartition

# 3.1) Redistribute rows by symbol so that all rows of one symbol share a
#      partition, then report how many partitions resulted
processed = processed.repartition("symbol")
partition_count = processed.rdd.getNumPartitions()
print("ℹ️  Repartitioned — partitions:", partition_count)

# 3.2) Sanity check: display five rows of the key columns after the shuffle
preview = (
    processed
    .select("symbol", "date", "daily_return", "volatility")
    .limit(5)
)
preview.show(truncate=False)




ℹ️  Repartitioned — partitions: 198




+--------+----------+---------------------+-------------------+
|symbol  |date      |daily_return         |volatility         |
+--------+----------+---------------------+-------------------+
|HNT-USDT|2020-09-24|2.42688166895055     |3.3010751716469553 |
|HNT-USDT|2020-09-24|-0.2643551809882768  |0.318638235681588  |
|HNT-USDT|2020-09-24|-0.039880629022328265|0.1251866824032755 |
|HNT-USDT|2020-09-24|-0.14504662349076586 |0.16792532827305506|
|HNT-USDT|2020-09-24|0.03261718060488578  |0.1110535620341333 |
+--------+----------+---------------------+-------------------+



                                                                                