In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import input_file_name, col, regexp_extract
from pyspark.sql import functions as F
import re


# Step 1: Initialize SparkSession with S3 support
spark = SparkSession.builder \
    .appName("S3 USDT Parquet Reader") \
    .config("spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.4,"
            "com.amazonaws:aws-java-sdk-bundle:1.12.568,"
            "ml.dmlc:xgboost4j-spark_2.12:1.5.1") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.access.key", "AKIA2WFFOP3DIZ7VCICR") \
    .config("spark.hadoop.fs.s3a.secret.key", "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") \
    .config("spark.hadoop.fs.s3a.endpoint", "s3.us-east-1.amazonaws.com") \
    .getOrCreate()

# Step 2: Read all files with "USDT" in the name
s3_path = "s3a://cryptospark-dataset/archive/*USDT*.parquet"

df = spark.read.parquet(s3_path)

def extract_crypto_name(df):
    df_with_crypto = df.withColumn(
        "crypto_name",
        F.regexp_extract(F.col("source_file"), r"\/([^\/]+)-USDT\.parquet", 1)
    )
    return df_with_crypto
df_with_path = (
    spark.read.parquet("s3a://cryptospark-dataset/archive/BTC-USDT.parquet")
         .withColumn("source_file", input_file_name())
)
df_with_crypto = extract_crypto_name(df_with_path)
df_with_crypto.show()


:: loading settings :: url = jar:file:/Users/oliverzheng/spark-3.5.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/oliverzheng/.ivy2/cache
The jars for the packages stored in: /Users/oliverzheng/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
ml.dmlc#xgboost4j-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-57ebf77f-5548-46f7-a429-500f469096e9;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.568 in central
	found ml.dmlc#xgboost4j-spark_2.12;1.5.1 in central
	found ml.dmlc#xgboost4j_2.12;1.5.1 in central
	found com.typesafe.akka#akka-actor_2.12;2.5.23 in central
	found com.typesafe#config;1.3.3 in central
	found org.scala-lang.modules#scala-java8-compat_2.12;0.8.0 in central
	found com.esotericsoftware#kryo;4.0.2 in central
	found com.esotericsoftware#reflectasm;1.11.3 in central
	found org.ow2.asm#asm;5.0.4 in 

+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+----------------------------+-------------------+--------------------+-----------+
|   open|   high|    low|  close|  volume|quote_asset_volume|number_of_trades|taker_buy_base_asset_volume|taker_buy_quote_asset_volume|          open_time|         source_file|crypto_name|
+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+----------------------------+-------------------+--------------------+-----------+
|4261.48|4261.48|4261.48|4261.48|1.775183|         7564.9067|               3|                   0.075183|                   320.39084|2017-08-17 04:00:00|s3a://cryptospark...|        BTC|
|4261.48|4261.48|4261.48|4261.48|     0.0|               0.0|               0|                        0.0|                         0.0|2017-08-17 04:01:00|s3a://cryptospark...|        BTC|
|4280.56|4280.56|4280.56|4280.56|0.261074|          111

In [2]:
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler

In [3]:
df = df_with_crypto.orderBy("open_time")

# Show the schema and data
df.printSchema()
df.show(5)

root
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- low: float (nullable = true)
 |-- close: float (nullable = true)
 |-- volume: float (nullable = true)
 |-- quote_asset_volume: float (nullable = true)
 |-- number_of_trades: integer (nullable = true)
 |-- taker_buy_base_asset_volume: float (nullable = true)
 |-- taker_buy_quote_asset_volume: float (nullable = true)
 |-- open_time: timestamp_ntz (nullable = true)
 |-- source_file: string (nullable = false)
 |-- crypto_name: string (nullable = false)





+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+----------------------------+-------------------+--------------------+-----------+
|   open|   high|    low|  close|  volume|quote_asset_volume|number_of_trades|taker_buy_base_asset_volume|taker_buy_quote_asset_volume|          open_time|         source_file|crypto_name|
+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+----------------------------+-------------------+--------------------+-----------+
|4261.48|4261.48|4261.48|4261.48|1.775183|         7564.9067|               3|                   0.075183|                   320.39084|2017-08-17 04:00:00|s3a://cryptospark...|        BTC|
|4261.48|4261.48|4261.48|4261.48|     0.0|               0.0|               0|                        0.0|                         0.0|2017-08-17 04:01:00|s3a://cryptospark...|        BTC|
|4280.56|4280.56|4280.56|4280.56|0.261074|          111

                                                                                

In [4]:
from pyspark.sql import functions as F

# Step 1: Add a long timestamp column to df_with_crypto
df_with_ts = df_with_crypto.withColumn("open_time_long", F.unix_timestamp("open_time"))

# Step 2: Compute the 80% quantile based on the long timestamp
split_time = df_with_ts.approxQuantile("open_time_long", [0.8], 0.0)[0]

# Step 3: Split the data using this timestamp threshold
train_data = df_with_ts.filter(F.col("open_time_long") <= split_time)
test_data = df_with_ts.filter(F.col("open_time_long") > split_time)

# Print counts
print(f"Training data count: {train_data.count()}")
print(f"Test data count: {test_data.count()}")


                                                                                

Training data count: 2202546




Test data count: 550636


                                                                                

In [5]:
window_spec = Window.orderBy("open_time_long")
train_data = train_data.withColumn("lag1", F.lag("close", 1).over(window_spec))
test_data = test_data.withColumn("lag1", F.lag("close", 1).over(window_spec))
train_data = train_data.dropna(subset=["lag1"])
test_data = test_data.dropna(subset=["lag1"])


In [6]:
assembler = VectorAssembler(inputCols=["lag1"], outputCol="features")
train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

In [7]:
train_data.select("open_time_long", "close", "lag1", "features").show(5)


25/05/09 00:52:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 16:>                                                         (0 + 1) / 1]

+--------------+-------+-------+------------------+
|open_time_long|  close|   lag1|          features|
+--------------+-------+-------+------------------+
|    1502942460|4261.48|4261.48|[4261.47998046875]|
|    1502942520|4280.56|4261.48|[4261.47998046875]|
|    1502942580|4261.48|4280.56|[4280.56005859375]|
|    1502942640|4261.48|4261.48|[4261.47998046875]|
|    1502942700|4261.48|4261.48|[4261.47998046875]|
+--------------+-------+-------+------------------+
only showing top 5 rows



                                                                                

In [8]:
import xgboost as xgb
import numpy as np

# Prepare the data for XGBoost (converting features into numpy arrays)
def prepare_data_for_xgboost(df):
    features_col = df.select("features").rdd.map(lambda row: row['features'].toArray()).collect()
    label_col = df.select("close").rdd.map(lambda row: row['close']).collect()
    return np.array(features_col), np.array(label_col)

# Prepare training and test data
X_train, y_train = prepare_data_for_xgboost(train_data)
X_test, y_test = prepare_data_for_xgboost(test_data)

# Train XGBoost model
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # Regression task
    'max_depth': 6,
    'eta': 0.1,
    'silent': 1,
    'eval_metric': 'rmse'
}

# Train the model
bst = xgb.train(params, dtrain, num_boost_round=100)

# Make predictions
y_pred = bst.predict(dtest)

# Evaluate the model (RMSE)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")


25/05/09 00:52:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:52:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 00:53:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 0

RMSE: 717.1838691036731


In [10]:
pred_df = spark.createDataFrame([(float(pred),) for pred in y_pred], ["prediction"])




In [12]:
from pyspark.sql import functions as F

# Step 1: Convert the `y_pred` (predictions) into a DataFrame with one column of predictions
predictions_list = [float(pred) for pred in y_pred]
pred_df = spark.createDataFrame([(float(pred),) for pred in predictions_list], ["prediction"])

# Step 2: Add an index to both `test_data` and `pred_df` for row-wise alignment
test_data_with_index = test_data.withColumn("row_id", F.monotonically_increasing_id())
pred_df_with_index = pred_df.withColumn("row_id", F.monotonically_increasing_id())

# Step 3: Perform a join based on the index (row_id) to match predictions with the original test data
test_data_with_preds = test_data_with_index.join(pred_df_with_index, on="row_id").drop("row_id")

# Step 4: Calculate the absolute percentage error for each row
df_with_error = test_data_with_preds.withColumn(
    "abs_percentage_error", 
    abs((col("close") - col("prediction")) / col("close")) * 100
)

# Step 5: Calculate the Mean Absolute Percentage Error (MAPE)
mape = df_with_error.agg({"abs_percentage_error": "mean"}).collect()[0][0]

# Step 6: Calculate the accuracy (100% - MAPE)
accuracy = 100 - mape

print(f"MAPE: {mape}%")
print(f"Accuracy: {accuracy}%")


25/05/09 01:02:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 01:02:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 01:02:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 01:02:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 01:02:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/09 01:02:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


MAPE: 0.7045395335215743%
Accuracy: 99.29546046647843%


                                                                                