In [21]:
import os
import sys

# Make the repository root importable so the local `shared` package resolves.
sys.path.append(os.path.abspath(os.path.join('../../')))

from pyspark.ml.feature import VectorAssembler, RobustScaler
from pyspark.ml.functions import vector_to_array
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, FloatType

from shared.schemas import joined_schema

def _build_spark_session():
    """Create (or reuse) the SparkSession named "DataAggregations" for this notebook."""
    session_options = {
        "spark.sql.parquet.enableVectorizedReader": "true",
        # Schema merging is not needed: the schema is specified explicitly on read.
        "spark.sql.parquet.mergeSchema": "false",
        "spark.executor.memory": "6g",  # executor memory
        "spark.driver.memory": "2g",  # driver memory
        # "spark.local.dir": "/mnt/d/spark-temp",  # optional: alternate temp directory
    }
    builder = SparkSession.builder.appName("DataAggregations")
    # Apply the options in declaration order, exactly as a chained .config() would.
    for option_name, option_value in session_options.items():
        builder = builder.config(option_name, option_value)
    return builder.getOrCreate()


spark = _build_spark_session()

# Read the joined transactions/aggregations parquet dataset with the explicit
# project schema, so Spark does not have to infer or merge schemas.
transactions_with_aggregations_df = (
    spark.read
    .schema(joined_schema)
    .parquet("../../../results/joined_transactions_with_aggregations")
)

# Per-network views of the same dataset, selected on the `network_name` column.
ethereum_df = transactions_with_aggregations_df.filter(col("network_name") == "ethereum")
bitcoin_df = transactions_with_aggregations_df.filter(col("network_name") == "bitcoin")

In [None]:
# Robust-scale the columns listed in `input_cols` on the Ethereum subset and
# write the scaled values back under the original column names.
input_cols = ["total_transferred_value"]

# RobustScaler operates on a single vector column, so pack the inputs first.
assembler = VectorAssembler(inputCols=input_cols, outputCol="features_to_scale")
df_assembled = assembler.transform(ethereum_df)

# Fit the scaler (default parameters) on the assembled Ethereum rows.
scaler = RobustScaler(inputCol="features_to_scale", outputCol="scaled_features")
scaler_model = scaler.fit(df_assembled)

# Unpack the scaled vector with Spark's built-in `vector_to_array` instead of a
# Python lambda UDF: the conversion stays in the JVM and avoids per-row Python
# serialisation. dtype="float32" yields array<float>, so the columns written
# back below are FloatType.
scaled_data = (
    scaler_model.transform(df_assembled)
    .withColumn("scaled_columns", vector_to_array(col("scaled_features"), dtype="float32"))
    .drop("scaled_features")
)

# Replace each original column with its scaled counterpart. Dropping first and
# then re-adding moves the scaled column to the end of the schema.
for i, col_name in enumerate(input_cols):
    scaled_data = scaled_data.drop(col_name).withColumn(col_name, col("scaled_columns")[i])
scaled_data = scaled_data.drop("scaled_columns")
# NOTE(review): the intermediate "features_to_scale" vector column is still part
# of `scaled_data` — confirm whether later cells need it or it should be dropped.

+--------------------+-------------------+------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------------+----------------------------+--------------------+--------------------+--------------------------+----------------------------+--------------------+--------------------+--------------------------+----------------------------+--------------------+--------------------+--------------------------+----------------------------+--------------------+---------------------+-----------------------------+-------------------------------+--------------------+--------------------+---------------------------+-----------------------------+-----------------+---------------------+-----------------------------+-------------------------------+---------------------+----------------------