In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, DecimalType, DoubleType

In [3]:
# S3A connection settings. Each one can be overridden through an environment
# variable so that real credentials never have to be committed with the
# notebook; the fallbacks are the literals this notebook originally hardcoded,
# so behaviour is unchanged when the variables are not set.
s3a_access_key = os.environ.get("S3A_ACCESS_KEY", "Q3AM3UQ867SPQQA43P2F")
s3a_secret_key = os.environ.get("S3A_SECRET_KEY", "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG")
s3a_endpoint = os.environ.get("S3A_ENDPOINT", "play.min.io:9000")

# getOrCreate() reuses an already-running session when there is one; in that
# case Spark only applies these settings as runtime SQL configurations.
spark = (
    SparkSession.builder
    .config("fs.s3a.access.key", s3a_access_key)
    .config("fs.s3a.secret.key", s3a_secret_key)
    .config("fs.s3a.endpoint", s3a_endpoint)
    .appName("VolumeCalculation")
    .getOrCreate()
)

23/06/20 02:28:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Explicit column layout for the token-transfer CSV files; every column is
# nullable. `value` is a 38-digit, scale-0 decimal.
token_transfers_schema = StructType(
    [
        StructField("token_address", StringType(), True),
        StructField("from_address", StringType(), True),
        StructField("to_address", StringType(), True),
        StructField("value", DecimalType(38, 0), True),
        StructField("transaction_hash", StringType(), True),
        StructField("log_index", LongType(), True),
        StructField("block_number", LongType(), True),
    ]
)

In [5]:
# Explicit column layout for the transaction CSV files; every column is
# nullable. `block_timestamp` is compared against cmc timestamps in the
# volume queries further down.
transactions_schema = StructType(
    [
        StructField("hash", StringType(), True),
        StructField("nonce", LongType(), True),
        StructField("block_hash", StringType(), True),
        StructField("block_number", LongType(), True),
        StructField("transaction_index", LongType(), True),
        StructField("from_address", StringType(), True),
        StructField("to_address", StringType(), True),
        StructField("value", DecimalType(38, 0), True),
        StructField("gas", LongType(), True),
        StructField("gas_price", LongType(), True),
        StructField("input", StringType(), True),
        StructField("block_timestamp", LongType(), True),
        StructField("max_fee_per_gas", LongType(), True),
        StructField("max_priority_fee_per_gas", LongType(), True),
        StructField("transaction_type", LongType(), True),
    ]
)

In [6]:
# Explicit column layout for the token metadata CSV files; every column is
# nullable.
#
# `total_supply` is read as DecimalType(38, 0), matching the `value` columns of
# the transfer and transaction schemas. A LongType column tops out at
# 9223372036854775807, and a value beyond that fails to parse and is read back
# as null instead of the real number.
# NOTE(review): presumably total_supply is expressed in the same base units as
# transfer values (whole tokens * 10^decimals) - confirm against the exporter.
tokens_schema = StructType(
    [
        StructField("address", StringType(), True),
        StructField("symbol", StringType(), True),
        StructField("name", StringType(), True),
        StructField("decimals", LongType(), True),
        StructField("total_supply", DecimalType(38, 0), True),
        StructField("block_number", LongType(), True),
    ]
)

In [7]:
# Explicit column layout for the cmc historical price CSV files; every column
# is nullable. The volume queries use `open`, `timestamp` and `address`.
cmc_historical_schema = StructType(
    [
        StructField("id", LongType(), True),
        StructField("rank", LongType(), True),
        StructField("name", StringType(), True),
        StructField("symbol", StringType(), True),
        StructField("open", DoubleType(), True),
        StructField("high", DoubleType(), True),
        StructField("low", DoubleType(), True),
        StructField("close", DoubleType(), True),
        StructField("volume", DoubleType(), True),
        StructField("marketCap", DoubleType(), True),
        StructField("timestamp", LongType(), True),
        StructField("address", StringType(), True),
    ]
)

In [8]:
# Root S3A URI that every CSV load path in this notebook is appended to.
basePath = "s3a://bsc-test-tx-volume"

In [9]:
# Read all token-transfer CSVs (two directory levels below the dataset root),
# treating the first line of each file as a header and applying the explicit
# schema instead of inferring one.
token_transfers_path = basePath + "/token_transfers/*/*/*.csv"
token_transfers_df = (
    spark.read.format("csv")
    .schema(token_transfers_schema)
    .option("header", True)
    .load(token_transfers_path)
)

In [10]:
# Read all transaction CSVs (two directory levels below the dataset root),
# treating the first line of each file as a header and applying the explicit
# schema instead of inferring one.
transactions_path = basePath + "/transactions/*/*/*.csv"
transactions_df = (
    spark.read.format("csv")
    .schema(transactions_schema)
    .option("header", True)
    .load(transactions_path)
)

In [11]:
# Read the token metadata CSVs that sit directly under the tokens directory,
# treating the first line of each file as a header and applying the explicit
# schema instead of inferring one.
tokens_path = basePath + "/tokens/*.csv"
tokens_df = (
    spark.read.format("csv")
    .schema(tokens_schema)
    .option("header", True)
    .load(tokens_path)
)

In [12]:
# Read the cmc historical price CSVs that sit directly under their directory,
# treating the first line of each file as a header and applying the explicit
# schema instead of inferring one.
cmc_historicals_path = basePath + "/cmc_historicals/*.csv"
cmc_historicals_df = (
    spark.read.format("csv")
    .schema(cmc_historical_schema)
    .option("header", True)
    .load(cmc_historicals_path)
)

In [13]:
%%html
<style>
/* Stop <pre> text inside notebook output areas from wrapping onto new lines
   (presumably so wide DataFrame.show() tables stay aligned). */
div.output_area pre {
    white-space: pre;
}
</style>

In [14]:
# Expose each loaded DataFrame to spark.sql() under a view named after the
# Python variable that holds it.
for view_name, frame in {
    "token_transfers_df": token_transfers_df,
    "transactions_df": transactions_df,
    "tokens_df": tokens_df,
    "cmc_historicals_df": cmc_historicals_df,
}.items():
    frame.createOrReplaceTempView(view_name)

In [26]:
# Transfer volume per token contract and cmc price row.
#
# Each transfer amount is scaled down by 10^decimals of its token, multiplied
# by the `open` price of the matching cmc row, and summed per
# (token_address, cmc.timestamp). Addresses are compared case-insensitively.
#
# A transfer belongs to a cmc row when its transaction's block_timestamp lies
# in the 86400-second window (cmc.timestamp - 86400, cmc.timestamp]. The upper
# bound is inclusive: with a strict `<` a transaction stamped exactly at a
# cmc.timestamp would match neither that row nor a row 86400 seconds later and
# would silently drop out of the totals.
token_volume_result_df = spark.sql("""
SELECT  tt.token_address, cmc.timestamp, sum((tt.value / POWER(10, t.decimals)) * cmc.open) as tx_volume
FROM token_transfers_df tt
JOIN transactions_df txn ON tt.transaction_hash = txn.hash
JOIN tokens_df t ON LOWER(tt.token_address) = LOWER(t.address)
JOIN cmc_historicals_df cmc ON LOWER(cmc.address) = LOWER(tt.token_address)
WHERE txn.block_timestamp <= cmc.timestamp AND txn.block_timestamp > cmc.timestamp - 86400
GROUP BY tt.token_address, cmc.timestamp
""")

# Store the volume as a scale-0 decimal and rename the key column to `address`
# so the frame lines up column-for-column with the per-address frames.
token_df = (
    token_volume_result_df
    .withColumn("tx_volume", token_volume_result_df["tx_volume"].cast(DecimalType(38, 0)))
    .withColumnRenamed("token_address", "address")
)

In [27]:
token_df.show(10, False)

+------------------------------------------+----------+---------+
|address                                   |timestamp |tx_volume|
+------------------------------------------+----------+---------+
|0x2170ed0880ac9a755fd29b2688956bd959f933f8|1685318399|1054769  |
|0x9c65ab58d8d978db963e63f2bfb7121627e3a739|1685318399|14030    |
|0xbf5140a22578168fd562dccf235e5d43a02ce9b1|1685318399|21100    |
|0x02caa44eb838fc0e49b73213d9d22e5f23798fda|1685318399|365      |
|0x4338665cbb7b2485a8855a139b75d5e34ab0db94|1685318399|161      |
|0x156ab3346823b651294766e23e6cf87254d68962|1685318399|24406590 |
|0xe552fb52a4f19e44ef5a967632dbc320b0820639|1685318399|1        |
|0xba2ae424d960c26247dd6c32edc70b295c744c43|1685318399|101832   |
|0x14016e85a25aeb13065688cafb43044c2ef86784|1685318399|5707     |
|0xaf53d56ff99f1322515e54fdde93ff8b3b7dafd5|1685318399|1        |
+------------------------------------------+----------+---------+
only showing top 10 rows



In [28]:
# Transfer volume per sending address and cmc price row.
#
# Each transfer amount is scaled down by 10^decimals of its token, multiplied
# by the `open` price of the matching cmc row, and summed per
# (from_address, cmc.timestamp). Token addresses are compared
# case-insensitively.
#
# A transfer belongs to a cmc row when its transaction's block_timestamp lies
# in the 86400-second window (cmc.timestamp - 86400, cmc.timestamp]. The upper
# bound is inclusive: with a strict `<` a transaction stamped exactly at a
# cmc.timestamp would match neither that row nor a row 86400 seconds later and
# would silently drop out of the totals.
from_address_volume_result_df = spark.sql("""
SELECT  tt.from_address, cmc.timestamp, sum((tt.value / POWER(10, t.decimals)) * cmc.open) as tx_volume
FROM token_transfers_df tt
JOIN transactions_df txn ON tt.transaction_hash = txn.hash
JOIN tokens_df t ON LOWER(tt.token_address) = LOWER(t.address)
JOIN cmc_historicals_df cmc ON LOWER(cmc.address) = LOWER(tt.token_address)
WHERE txn.block_timestamp <= cmc.timestamp AND txn.block_timestamp > cmc.timestamp - 86400
GROUP BY tt.from_address, cmc.timestamp
""")

# Store the volume as a scale-0 decimal and rename the key column to `address`
# so the frame lines up column-for-column with the other volume frames.
from_df = (
    from_address_volume_result_df
    .withColumn("tx_volume", from_address_volume_result_df["tx_volume"].cast(DecimalType(38, 0)))
    .withColumnRenamed("from_address", "address")
)

In [29]:
from_df.show(10, False)

+------------------------------------------+----------+---------+
|address                                   |timestamp |tx_volume|
+------------------------------------------+----------+---------+
|0x344ecc034d92db380a219d4345f2d1690d22e11b|1685318399|29       |
|0x7623a96f97ee37c50405b1bd162ff9bc380f7b60|1685318399|211      |
|0xc108a9e4e1b4e2db4ac5a43bdda75f126d634caf|1685318399|219      |
|0x914558fddb93aa9b32b0e8e662673515b0f2e4ae|1685318399|823      |
|0xc2b3dbbf26d43617036b0eba53ad2dbd945adebf|1685318399|834104   |
|0x0c59a054d7ec1248c1f84fd88030ac00baa71622|1685318399|282      |
|0x63e95e19f24a7675d8adbd6f38d3c0f77bf87185|1685318399|9989     |
|0x9fe0d567c52663d371477b27d216b6d20d260a4d|1685318399|0        |
|0x2641bc9d55daf3ce52a06fa9d307c989951a5a03|1685318399|5583     |
|0x00692189a7f61483bdb67f11ec28197fae9d7755|1685318399|9105     |
+------------------------------------------+----------+---------+
only showing top 10 rows



In [30]:
# Transfer volume per receiving address and cmc price row.
#
# Each transfer amount is scaled down by 10^decimals of its token, multiplied
# by the `open` price of the matching cmc row, and summed per
# (to_address, cmc.timestamp). Token addresses are compared
# case-insensitively.
#
# A transfer belongs to a cmc row when its transaction's block_timestamp lies
# in the 86400-second window (cmc.timestamp - 86400, cmc.timestamp]. The upper
# bound is inclusive: with a strict `<` a transaction stamped exactly at a
# cmc.timestamp would match neither that row nor a row 86400 seconds later and
# would silently drop out of the totals.
to_address_volume_result_df = spark.sql("""
SELECT  tt.to_address, cmc.timestamp, sum((tt.value / POWER(10, t.decimals)) * cmc.open) as tx_volume
FROM token_transfers_df tt
JOIN transactions_df txn ON tt.transaction_hash = txn.hash
JOIN tokens_df t ON LOWER(tt.token_address) = LOWER(t.address)
JOIN cmc_historicals_df cmc ON LOWER(cmc.address) = LOWER(tt.token_address)
WHERE txn.block_timestamp <= cmc.timestamp AND txn.block_timestamp > cmc.timestamp - 86400
GROUP BY tt.to_address, cmc.timestamp
""")

# Store the volume as a scale-0 decimal and rename the key column to `address`
# so the frame lines up column-for-column with the other volume frames.
to_df = (
    to_address_volume_result_df
    .withColumn("tx_volume", to_address_volume_result_df["tx_volume"].cast(DecimalType(38, 0)))
    .withColumnRenamed("to_address", "address")
)

In [31]:
to_df.show(10, False)

+------------------------------------------+----------+---------+
|address                                   |timestamp |tx_volume|
+------------------------------------------+----------+---------+
|0x8126a1c71e1b5562393f65c8fe911d0c0352af72|1685318399|0        |
|0x84623f450a0af538ae3a52539633d784b664dad1|1685318399|1        |
|0x074eb000792e3710c864f8bf2478afc8df92d8cb|1685318399|0        |
|0x56ed0f243c8bd05c84ede4dafcafa69d1cec7240|1685318399|0        |
|0x04442b93e0ae32f1aa197bbc94e0b7fcf08a8524|1685318399|28       |
|0xc2b3dbbf26d43617036b0eba53ad2dbd945adebf|1685318399|402939   |
|0x62679a86f0f537a2b4e40af3305f5cee742750d9|1685318399|55       |
|0x92e4203baadb385d1355d3b5a822a084337749cf|1685318399|0        |
|0x8df5e62c8c77434547bbf3dd7528e550e80b1734|1685318399|465      |
|0x277cdce833e22d68bcddd504aef3a31ca7c584f8|1685318399|54       |
+------------------------------------------+----------+---------+
only showing top 10 rows



In [32]:
# Stack the per-token, per-sender and per-receiver volume frames into one
# (address, timestamp, tx_volume) frame. Rows are appended by column position
# and duplicates are kept.
result_df = (
    token_df
    .union(from_df)
    .union(to_df)
)

In [33]:
result_df.show(10, False)



+------------------------------------------+----------+---------+
|address                                   |timestamp |tx_volume|
+------------------------------------------+----------+---------+
|0x2170ed0880ac9a755fd29b2688956bd959f933f8|1685318399|1054769  |
|0x9c65ab58d8d978db963e63f2bfb7121627e3a739|1685318399|14030    |
|0xbf5140a22578168fd562dccf235e5d43a02ce9b1|1685318399|21100    |
|0x02caa44eb838fc0e49b73213d9d22e5f23798fda|1685318399|365      |
|0x4338665cbb7b2485a8855a139b75d5e34ab0db94|1685318399|161      |
|0x156ab3346823b651294766e23e6cf87254d68962|1685318399|24406590 |
|0xe552fb52a4f19e44ef5a967632dbc320b0820639|1685318399|1        |
|0xba2ae424d960c26247dd6c32edc70b295c744c43|1685318399|101832   |
|0x14016e85a25aeb13065688cafb43044c2ef86784|1685318399|5707     |
|0xaf53d56ff99f1322515e54fdde93ff8b3b7dafd5|1685318399|1        |
+------------------------------------------+----------+---------+
only showing top 10 rows



                                                                                