In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    to_date, first, last, max as _max, min as _min, sum as _sum, avg, 
    input_file_name, regexp_extract, col, log, exp
)
from pyspark.sql.window import Window

In [2]:
# Build (or reuse) the Spark session.
# - spark.jars.packages pulls in hadoop-aws and the AWS SDK bundle
# - the fs.s3a.* options configure the s3a filesystem implementation,
#   credentials provider and endpoint
# - driver and executor memory are both set to 8g
spark = (
    SparkSession.builder
    .appName("LinearRegressionModel")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.568",
    )
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.profile.ProfileCredentialsProvider",
    )
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

In [3]:
# Location of the Parquet dataset (one file per trading pair, e.g. 1INCH-BTC.parquet)
dataset_path = "C:/Users/dumas/OneDrive/Documents/School/SJSU/DATA 228/dataset"

df = (
    spark.read.parquet(dataset_path)
    # symbol: the source file's base name with the ".parquet" suffix removed
    .withColumn("symbol", regexp_extract(input_file_name(), r"([^/]+)\.parquet$", 1))
    # date: calendar date taken from the open_time timestamp
    .withColumn("date", to_date("open_time"))
)

# Quick look at the resulting schema and the first few rows
df.printSchema()
df.show(5)

root
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- low: float (nullable = true)
 |-- close: float (nullable = true)
 |-- volume: float (nullable = true)
 |-- quote_asset_volume: float (nullable = true)
 |-- number_of_trades: integer (nullable = true)
 |-- taker_buy_base_asset_volume: float (nullable = true)
 |-- taker_buy_quote_asset_volume: float (nullable = true)
 |-- open_time: timestamp_ntz (nullable = true)
 |-- symbol: string (nullable = false)
 |-- date: date (nullable = true)

+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+----------------------------+-------------------+--------+----------+
|   open|   high|    low|  close|  volume|quote_asset_volume|number_of_trades|taker_buy_base_asset_volume|taker_buy_quote_asset_volume|          open_time|  symbol|      date|
+-------+-------+-------+-------+--------+------------------+----------------+---------------------------+-----------------------

# Preprocessing 0.1% of the data

In [4]:
# Drop unnecessary columns
processed_df = df.drop("quote_asset_volume", "number_of_trades", "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "open_time")

# Drop rows with nulls in critical columns
processed_df = processed_df.dropna(subset=["open", "high", "low", "close", "volume", "date"])

In [5]:
sample_df = processed_df.sample(fraction=0.001, seed=42)
print("Rows:", sample_df.count())


Rows: 1506508


In [6]:
# repartition the DataFrame by symbol to optimize for groupBy operations
sample_df = sample_df.repartition("symbol")

sample_df = sample_df.groupBy("symbol", "date").agg(
    first("open").alias("open"),
    last("close").alias("close"),
    _max("high").alias("high"),
    _min("low").alias("low"),
    _sum("volume").alias("volume"),
)

In [7]:
# Per-day features, both expressed relative to the opening price:
#   daily_return = (close - open) / open
#   volatility   = (high - low) / open
sample_df = (
    sample_df
    .withColumn("daily_return", (col("close") - col("open")) / col("open"))
    .withColumn("volatility", (col("high") - col("low")) / col("open"))
)


In [8]:
sample_df = sample_df.repartition("symbol")

# Every window below is per symbol and ordered by date; only the frame differs.
symbol_by_date = Window.partitionBy("symbol").orderBy("date")

# Running frame: first row of the symbol up to and including the current row
window_spec = symbol_by_date.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Trailing frames: current row plus the 6 / 29 rows before it
window_7 = symbol_by_date.rowsBetween(-6, 0)
window_30 = symbol_by_date.rowsBetween(-29, 0)

# Cumulative return is compounded by summing log(1 + r) over the running
# frame and exponentiating: exp(sum(log(1 + r))) - 1
log_growth = log(1 + col("daily_return"))

sample_df = (
    sample_df
    .withColumn("ma_7", avg("close").over(window_7))
    .withColumn("ma_30", avg("close").over(window_30))
    .withColumn("cumulative_return", exp(_sum(log_growth).over(window_spec)) - 1)
)

In [9]:
# Discard any row where one of the engineered columns came out null
engineered_cols = ["ma_7", "ma_30", "cumulative_return", "daily_return", "volatility"]
sample_df = sample_df.dropna(subset=engineered_cols)

In [None]:
# Mark the engineered DataFrame for caching (cache() returns the same DataFrame),
# then run an action so the cache is actually populated.
sample_df = sample_df.cache()
sample_df.show(5)  # Materialize cache

# Linear Regression

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [None]:
# Rename the target variable to "label" (the column name the regression stage reads)
final_df = sample_df.withColumnRenamed("daily_return", "label")

# printSchema must be CALLED; the bare attribute reference printed nothing.
final_df.printSchema()
final_df.show(5)

In [None]:
# Categorical handling for the ticker symbol, in two stages.
# Stage 1: map each symbol string to a numeric index (handleInvalid="skip").
symbol_indexer = StringIndexer(
    inputCol="symbol",
    outputCol="symbol_index",
    handleInvalid="skip",
)
# Stage 2: one-hot encode that index into the "symbol_ohe" vector column.
symbol_encoder = OneHotEncoder(
    inputCols=["symbol_index"],
    outputCols=["symbol_ohe"],
)

In [None]:
# Select relevant columns for modeling
# NOTE(review): "open" and "close" are features while the label (daily_return)
# is computed as (close - open) / open, so the target is a function of the
# inputs — confirm this is intended.
numeric_features = ['open', 'high', 'low', 'close', 'volume', 
                    'volatility', 'ma_7', 'ma_30', 'cumulative_return']

# Assemble the numeric features and the one-hot symbol vector into "features".
# The assembler is only DEFINED here, not applied: "symbol_ohe" does not exist
# until the StringIndexer and OneHotEncoder stages have run, so transforming
# final_df at this point fails on the missing column. Reducing final_df to
# ("features", "label") would also remove the columns those stages need.
# The Pipeline applies all stages, in order, to the full DataFrame.
assembler = VectorAssembler(inputCols=numeric_features + ['symbol_ohe'], outputCol="features")

In [None]:
# Create a Linear Regression model
# It reads the assembled vector from "features" and the target from "label";
# all other estimator parameters are left at their defaults.
lr = LinearRegression(featuresCol='features', labelCol='label')


In [None]:
# Stages in execution order:
# index symbol -> one-hot encode -> assemble feature vector -> linear regression
pipeline_stages = [symbol_indexer, symbol_encoder, assembler, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# 80/20 random split into training and test sets, seeded for repeatability
split_weights = [0.8, 0.2]
train_data, test_data = final_df.randomSplit(split_weights, seed=42)

In [None]:
# Train the linear regression model
# Fitting the pipeline fits each stage in order on train_data and returns the
# fitted pipeline as `model`.
model = pipeline.fit(train_data)

In [None]:
# Evaluate
# Imported here because only this cell needs it
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out test set with the fitted pipeline
results = model.transform(test_data)

# The fitted regression stage's .summary describes the fit on the TRAINING
# data, so it is reported under its own label and is not a test-set score.
summary = model.stages[-1].summary
print("Train RMSE:", summary.rootMeanSquaredError)
print("Train R2:", summary.r2)

# Test-set metrics, computed from the predictions in `results`
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
print("RMSE:", evaluator.evaluate(results, {evaluator.metricName: "rmse"}))
print("R2:", evaluator.evaluate(results, {evaluator.metricName: "r2"}))

# Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert a 10% sample of the results to a pandas DataFrame for plotting.
# The sample is seeded so the figures and residual statistics below are
# reproducible across runs.
pandas_df = results.select("label", "prediction").sample(fraction=0.1, seed=42).toPandas()

In [None]:
# Actual (x) vs. predicted (y) daily return, with dashed reference lines at zero
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(pandas_df["label"], pandas_df["prediction"], alpha=0.5)
ax.set_xlabel("Actual Daily Return")
ax.set_ylabel("Predicted Daily Return")
ax.set_title("Predicted vs Actual Daily Return")
ax.axhline(0, color='gray', linestyle='--')
ax.axvline(0, color='gray', linestyle='--')
ax.grid(True)
plt.show()


In [None]:
# Overlaid 50-bin histograms comparing the distributions of actual and
# predicted daily return
fig, ax = plt.subplots(figsize=(8, 4))
histogram_series = (
    ("label", "Actual", 'blue'),
    ("prediction", "Predicted", 'orange'),
)
for column_name, legend_text, bar_color in histogram_series:
    ax.hist(pandas_df[column_name], bins=50, alpha=0.6, label=legend_text, color=bar_color)
ax.set_xlabel("Daily Return")
ax.set_ylabel("Frequency")
ax.set_title("Histogram: Actual vs Predicted Returns")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Residual = prediction - actual, stored as a new column
pandas_df["residual"] = pandas_df["prediction"] - pandas_df["label"]
residuals = pandas_df["residual"]

# Basic descriptive statistics of the residuals
print("Residual Mean:", residuals.mean())
print("Residual Std Dev:", residuals.std())
print("Residual Min:", residuals.min())
print("Residual Max:", residuals.max())

# Residuals against the actual value; the dashed line marks zero error
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(pandas_df["label"], residuals, alpha=0.5)
ax.axhline(0, color='gray', linestyle='--')
ax.set_xlabel("Actual Daily Return")
ax.set_ylabel("Residual (Prediction - Actual)")
ax.set_title("Residual Plot")
ax.grid(True)
plt.show()

# Save a model

In [None]:
from pyspark.ml.pipeline import PipelineModel

model_path = "C:/Users/dumas/OneDrive/Documents/GitHub/CryptoSpark/linearRegression_model"

# Save the fitted pipeline model.
# A plain model.save() raises if the target path already exists, which breaks
# every re-run of the notebook; going through the writer with overwrite()
# replaces any previous save at this path instead.
model.write().overwrite().save(model_path)
