In [1]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, DecimalType, DoubleType
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession for this notebook; getOrCreate() returns the already
# running session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("Preprocessdata")
    .getOrCreate()
)

23/07/11 07:33:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# base_path = "s3a://octan-labs-bsc/export-by-date"

In [4]:
# Root location of the exported data; the dataset paths used by later cells are built from it.
base_path = "."

In [5]:
# Day to process (YYYY-MM-DD); substituted into the `date=...` segment of the dataset paths.
date = "2023-06-06"

In [10]:
# Explicit schema for the pre-tx CSV export. Supplying it gives the numeric
# columns real numeric types instead of strings; every field is nullable.
pre_tx_schema = StructType([
    StructField("block_number", LongType(), True),
    StructField("from_address", StringType(), True),
    StructField("to_address", StringType(), True),
    StructField("gas", LongType(), True),
    StructField("gas_used", LongType(), True),
    StructField("gas_price", LongType(), True),
    # Read as a 38-digit integer decimal rather than LongType: a 64-bit long
    # tops out near 9.22e18, and in the CSV reader's default PERMISSIVE mode a
    # larger number is loaded as null instead of raising an error.
    # NOTE(review): presumably `value` is the raw on-chain amount in the
    # chain's smallest unit - confirm against the exporter.
    StructField("value", DecimalType(38, 0), True),
    StructField("token_transfer", DecimalType(38, 18), True),
    StructField("token_contract", StringType(), True),
    StructField("volume", DecimalType(38, 18), True),
    StructField("gas_spent", DecimalType(38, 18), True),
    StructField("gas_spent_usd", DecimalType(38, 18), True),
])

In [11]:
# Read every CSV part file for the selected date, treating the first row as a
# header and applying the explicit pre_tx_schema.
pre_tx_df = spark.read.csv(
    f"{base_path}/pre-tx/date={date}/*.csv",
    schema=pre_tx_schema,
    header=True,
)

In [12]:
# Preview the first 10 rows to sanity-check the load.
pre_tx_df.show(10)

+------------+--------------------+--------------------+-------+--------+-----------+-----------------+--------------+--------------+--------------------+--------------------+--------------------+
|block_number|        from_address|          to_address|    gas|gas_used|  gas_price|            value|token_transfer|token_contract|              volume|           gas_spent|       gas_spent_usd|
+------------+--------------------+--------------------+-------+--------+-----------+-----------------+--------------+--------------+--------------------+--------------------+--------------------+
|    28849368|0x81f403fe697cfcf...|0x2b8ce9960900859...|  44935|   44935|30000000000|                0|          null|          null|               0E-18|0.001348050000000000|0.373320382391397140|
|    28849368|0xb0644fd7af97005...|0x63714c713bf14de...| 600000|   22218| 7000000000|                0|          null|          null|               0E-18|0.000155526000000000|0.043070380024334730|
|    28849368|0

In [12]:
# Register the DataFrame as a temp view named "pre_tx_df" so it can be queried through spark.sql.
pre_tx_df.createOrReplaceTempView("pre_tx_df")

In [23]:
# Print the column names and types of the loaded DataFrame.
# NOTE(review): the recorded output lists every column as string and a
# `receipt_gas_used` column, which does not match pre_tx_schema - it looks
# stale; re-run after a kernel restart to confirm.
pre_tx_df.printSchema()

root
 |-- block_number: string (nullable = true)
 |-- from_address: string (nullable = true)
 |-- to_address: string (nullable = true)
 |-- gas: string (nullable = true)
 |-- receipt_gas_used: string (nullable = true)
 |-- gas_price: string (nullable = true)
 |-- value: string (nullable = true)
 |-- token_transfer: string (nullable = true)
 |-- token_contract: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- gas_spent: string (nullable = true)
 |-- gas_spent_usd: string (nullable = true)



In [22]:
# Preview the summed volume per sending address (first 10 groups).
(
    pre_tx_df
    .groupBy("from_address")
    .sum("volume")
    .show(10)
)



AnalysisException: "volume" is not a numeric column. Aggregation function can only be applied on a numeric column.

In [26]:
# TODO(review): the original note here read "change name, symbol foreach networks";
# its meaning is not evident from this cell - confirm or remove.

import time
from pyspark.sql import functions as F


start_time = time.time()

# Total volume per address, counting an address both when it appears as sender
# and when it appears as receiver. The two sides are stacked first and
# aggregated once, so each address yields exactly one row with an explicitly
# named total. (Grouping each side separately and then UNION ALL-ing produced
# two indistinguishable rows for any address seen on both sides, under an
# unnamed `sum(volume)` column.)
result_df = spark.sql("""
SELECT address, SUM(volume) AS total_volume
FROM (
    SELECT from_address AS address, volume FROM pre_tx_df
    UNION ALL
    SELECT to_address AS address, volume FROM pre_tx_df
) AS transfers
GROUP BY address
""")

# spark.sql only builds the query plan; no job runs until an action is called,
# so this elapsed time does not reflect the cost of the aggregation itself.
time.time() - start_time

0.07826685905456543

In [22]:
# result_df.count()

In [23]:
start_time = time.time()

# Write the result as a single CSV file (with header) into its own directory.
# The destination must not be the pre-tx input directory: with the default
# save mode the write fails when the target path already exists, and any CSV
# placed there would be matched by the input glob on the next run.
# mode("overwrite") keeps this cell re-runnable for the same date.
# NOTE(review): "address-volume" is a placeholder folder name - rename it to
# match the pipeline's naming convention.
(
    result_df.repartition(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("{base_path}/address-volume/date={date}/".format(base_path=base_path, date=date))
)

# Elapsed seconds for the write, which is the action that actually runs the job.
time.time() - start_time

                                                                                

7.45465874671936

In [None]:
# Stop the Spark session once all work in the notebook is finished.
spark.stop()