In [None]:
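# Command-line argument parsing for running this as a script (disabled here;
# in the notebook, base_path and date are assigned directly in the next cell).
# import argparse, sys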

# parser=argparse.ArgumentParser()

# parser.add_argument("-b", "--base_path", help="S3 bucket base path", required=True)
# parser.add_argument("-d", "--date", help="Calculate date", required=True)

# args=parser.parse_args()

In [1]:
# Run parameters used by the read and write cells below.
# Root location under which the "pre-tx" input and "tx-volumes" output folders live.
# base_path = "s3a://octan-labs-bsc/export-by-date"
base_path = "."
# Date partition to process; substituted into the "date=..." path segments.
date = "2023-06-06"

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, DecimalType, DoubleType
from pyspark.sql import SparkSession

In [3]:
# Get (or reuse, if one is already running) the Spark session for this job.
spark = (
    SparkSession.builder
    .appName("TxVolumeCalculator")
    .getOrCreate()
)

23/07/12 03:33:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Explicit schema applied to the pre-tx CSV files, so Spark does not have to
# infer types. All columns are nullable.
pre_tx_schema = StructType([
    StructField("block_number", LongType(), True),
    StructField("from_address", StringType(), True),
    StructField("to_address", StringType(), True),
    StructField("gas", LongType(), True),
    StructField("gas_used", LongType(), True),
    StructField("gas_price", LongType(), True),
    # Previously LongType. Assuming this column carries the raw on-chain amount
    # in the smallest unit (18 decimals) — TODO confirm — anything above
    # ~9.22e18 does not fit in a 64-bit long and would fail to parse, so the
    # column is read as an arbitrary-size integer decimal instead.
    StructField("value", DecimalType(38, 0), True),
    StructField("token_transfer", DecimalType(38, 18), True),
    StructField("token_contract", StringType(), True),
    StructField("volume", DecimalType(38, 18), True),
    StructField("gas_spent", DecimalType(38, 18), True),
    StructField("gas_spent_usd", DecimalType(38, 18), True),
])

In [5]:
# Read every CSV part file of the selected date partition, treating the first
# line of each file as a header and applying the explicit schema.
pre_tx_path = "{base_path}/pre-tx/date={date}/*.csv".format(base_path = base_path, date = date)
pre_tx_df = spark.read.csv(pre_tx_path, schema=pre_tx_schema, header=True)

In [6]:
# Preview the first 10 loaded rows as a quick sanity check of the parse.
pre_tx_df.show(10)

+------------+--------------------+--------------------+-------+--------+-----------+-----------------+--------------+--------------+--------------------+--------------------+--------------------+
|block_number|        from_address|          to_address|    gas|gas_used|  gas_price|            value|token_transfer|token_contract|              volume|           gas_spent|       gas_spent_usd|
+------------+--------------------+--------------------+-------+--------+-----------+-----------------+--------------+--------------+--------------------+--------------------+--------------------+
|    28849368|0x81f403fe697cfcf...|0x2b8ce9960900859...|  44935|   44935|30000000000|                0|          null|          null|               0E-18|0.001348050000000000|0.373320382400000000|
|    28849368|0xb0644fd7af97005...|0x63714c713bf14de...| 600000|   22218| 7000000000|                0|          null|          null|               0E-18|0.000155526000000000|0.043070380000000000|
|    28849368|0

In [8]:
# Register the DataFrame as a temporary view named "pre_tx_df" so the
# spark.sql() query further down can reference it by that name.
pre_tx_df.createOrReplaceTempView("pre_tx_df")

In [23]:
# Print the column names and types of the loaded DataFrame.
# NOTE(review): the saved output under this cell lists every column as string
# and names the fifth column "receipt_gas_used", which does not match
# pre_tx_schema — it looks like a leftover from an earlier run; re-run to refresh.
pre_tx_df.printSchema()

root
 |-- block_number: string (nullable = true)
 |-- from_address: string (nullable = true)
 |-- to_address: string (nullable = true)
 |-- gas: string (nullable = true)
 |-- receipt_gas_used: string (nullable = true)
 |-- gas_price: string (nullable = true)
 |-- value: string (nullable = true)
 |-- token_transfer: string (nullable = true)
 |-- token_contract: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- gas_spent: string (nullable = true)
 |-- gas_spent_usd: string (nullable = true)



In [13]:
# change name, symbol for each network

import time
from pyspark.sql.functions import format_number, regexp_replace


start_time = time.time()

# Total volume per address, counting a transaction's volume once for each role
# the address plays in it (sender, receiver, token contract).
#
# The three roles are stacked first and aggregated once, so every address ends
# up on exactly one row. Aggregating each role separately and then UNION ALL-ing
# the results produced several partial rows for an address that appears in more
# than one role, plus a NULL-address row for transactions with no receiver or
# no token contract; those NULL rows are dropped here.
result_df = spark.sql("""
SELECT address, SUM(volume) AS volume
FROM (
    SELECT from_address AS address, volume FROM pre_tx_df
    UNION ALL
    SELECT to_address AS address, volume FROM pre_tx_df
    UNION ALL
    SELECT token_contract AS address, volume FROM pre_tx_df
) AS address_volumes
WHERE address IS NOT NULL
GROUP BY address
""").withColumn(
    'volume',
    # format_number renders 10 decimal places but also inserts thousands
    # separators ("1,234.5678900000"); strip them so the exported value can be
    # parsed back as a plain number.
    regexp_replace(format_number('volume', 10), ',', ''),
)

# NOTE: spark.sql()/withColumn() only build a lazy plan, so this measures plan
# construction time, not the aggregation itself (that runs when the result is
# shown or written).
time.time() - start_time

0.025368452072143555

In [14]:
# result_df.show(10, False)



+------------------------------------------+--------------+
|address                                   |volume        |
+------------------------------------------+--------------+
|0x2efc0a7e59c0508fdd93941f89743836a1d2ae75|133.2797818980|
|0x48e2779ea605bdca4467f53c9e3ec6ae419f1f9c|0.0000000000  |
|0x675d013f45810a3a5fd14877276ed2b08191862d|0.0000000000  |
|0x8eae03bee1a2ae097ac11661eec541ba538ff8fe|0.0000000000  |
|0x50f19fde93bb8b2419300ad08dd3eaac431804b0|0.0000000000  |
|0x06d5b5d1a7beaea6b5587c8b1c4ff811c808968a|0.0000000000  |
|0x3b7d6336ac01106e51f2cffd564ec7f8bdea1057|1.2140597751  |
|0x6238872a0bd9f0e19073695532a7ed77ce93c69e|0.0000000000  |
|0x9b1426d1ee4c57e9e5b64964df03884c7da1e7d6|398.0622802532|
|0x6d2d7f09e0718a4f78813fb9b1ffb5ce6abfd6d0|0.0000000000  |
+------------------------------------------+--------------+
only showing top 10 rows



                                                                                

In [15]:
start_time = time.time()

# Destination folder for this date's per-address volumes.
output_path = "{base_path}/tx-volumes/date={date}/".format(base_path = base_path, date = date)

# Collapse to a single partition so one CSV part file (with header) is written.
# mode("overwrite") replaces any previous output for the same date: with the
# default save mode the write raises if the folder already exists, which made
# re-running the notebook for an already-processed date fail at this cell.
(
    result_df.repartition(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)

# The write is an action, so this elapsed time covers the actual computation.
time.time() - start_time

                                                                                

1.8597967624664307

In [None]:
# Shut down the Spark session now that the output has been written.
spark.stop()