In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, DecimalType, DoubleType

In [2]:
# Build (or reuse) the Spark session that every later cell relies on.
#
# The S3A credentials and endpoint are read from notebook-specific environment
# variables so that real keys never have to be committed with the notebook.
# Deliberately NOT using AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY: picking up a
# user's real AWS keys and sending them to this endpoint would be a leak.
# The fallbacks are the values this notebook originally hard-coded
# (NOTE(review): they look like the public play.min.io demo credentials --
# confirm, and drop the fallbacks before pointing this at a private store).
s3a_access_key = os.environ.get("S3A_ACCESS_KEY", "Q3AM3UQ867SPQQA43P2F")
s3a_secret_key = os.environ.get("S3A_SECRET_KEY", "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG")
s3a_endpoint = os.environ.get("S3A_ENDPOINT", "https://play.min.io:9000")

spark = (
    SparkSession.builder
    .config("fs.s3a.access.key", s3a_access_key)
    .config("fs.s3a.secret.key", s3a_secret_key)
    .config("fs.s3a.endpoint", s3a_endpoint)
    .appName("VolumeCalculation")
    .getOrCreate()
)

23/06/20 04:18:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Apply S3A connector settings directly on the live Hadoop configuration of the
# running SparkContext: SSL on, path-style bucket addressing, a single attempt,
# and explicit connect / socket timeout values.
sc = spark.sparkContext

hadoop_conf = sc._jsc.hadoopConfiguration()
for option_name, option_value in (
    ("fs.s3a.connection.ssl.enabled", "true"),
    ("fs.s3a.path.style.access", "true"),
    ("fs.s3a.attempts.maximum", "1"),
    ("fs.s3a.connection.establish.timeout", "5000"),
    ("fs.s3a.connection.timeout", "10000"),
):
    hadoop_conf.set(option_name, option_value)

In [6]:
# Explicit schema for the token_transfers CSV files; every column is nullable.
# `value` is kept as an exact 38-digit integer decimal.
# NOTE(review): amounts wider than 38 digits would not fit -- confirm none occur.
token_transfers_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("token_address", StringType()),
        ("from_address", StringType()),
        ("to_address", StringType()),
        ("value", DecimalType(38, 0)),
        ("transaction_hash", StringType()),
        ("log_index", LongType()),
        ("block_number", LongType()),
    )
])

In [7]:
# Explicit schema for the transactions CSV files; every column is nullable.
# `hash` is the key the volume queries join token transfers on, and
# `block_timestamp` is the column they compare against the price timestamp.
transactions_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("hash", StringType()),
        ("nonce", LongType()),
        ("block_hash", StringType()),
        ("block_number", LongType()),
        ("transaction_index", LongType()),
        ("from_address", StringType()),
        ("to_address", StringType()),
        ("value", DecimalType(38, 0)),
        ("gas", LongType()),
        ("gas_price", LongType()),
        ("input", StringType()),
        ("block_timestamp", LongType()),
        ("max_fee_per_gas", LongType()),
        ("max_priority_fee_per_gas", LongType()),
        ("transaction_type", LongType()),
    )
])

In [8]:
# Explicit schema for the tokens CSV files; every column is nullable.
#
# `total_supply` is read as DecimalType(38, 0) rather than LongType: a raw
# supply is expressed in the token's smallest unit, so for a token with 18
# decimals anything above ~9.2 whole tokens already exceeds the signed 64-bit
# range and would fail to parse as a long. This also matches how the `value`
# columns of the other schemas are typed.
tokens_schema = StructType([
    StructField("address", StringType(), True),
    StructField("symbol", StringType(), True),
    StructField("name", StringType(), True),
    StructField("decimals", LongType(), True),
    StructField("total_supply", DecimalType(38, 0), True),
    StructField("block_number", LongType(), True),
])

In [9]:
# Explicit schema for the cmc_historicals CSV files; every column is nullable.
# Price / volume / market-cap figures are doubles; `timestamp` is an integer
# that the volume queries compare directly with transactions.block_timestamp.
cmc_historical_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", LongType()),
        ("rank", LongType()),
        ("name", StringType()),
        ("symbol", StringType()),
        ("open", DoubleType()),
        ("high", DoubleType()),
        ("low", DoubleType()),
        ("close", DoubleType()),
        ("volume", DoubleType()),
        ("marketCap", DoubleType()),
        ("timestamp", LongType()),
        ("address", StringType()),
    )
])

In [10]:
# Root S3A URI of the bucket; the CSV input globs in the following cells are
# built by appending a sub-path to this value.
basePath = "s3a://bsc-test-tx-volume"

In [11]:
# Load every token-transfer CSV found two directory levels below
# <basePath>/token_transfers/, treating the first line of each file as a header
# and applying the explicit schema instead of inferring one.
token_transfers_df = (
    spark.read
    .schema(token_transfers_schema)
    .option("header", True)
    .format("csv")
    .load(basePath + "/token_transfers/*/*/*.csv")
)

23/06/20 04:19:41 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [12]:
# Load every transaction CSV found two directory levels below
# <basePath>/transactions/, treating the first line of each file as a header
# and applying the explicit schema instead of inferring one.
transactions_df = (
    spark.read
    .schema(transactions_schema)
    .option("header", True)
    .format("csv")
    .load(basePath + "/transactions/*/*/*.csv")
)

In [13]:
# Load the token metadata CSVs that sit directly under <basePath>/tokens/,
# treating the first line of each file as a header and applying the explicit
# schema instead of inferring one.
tokens_df = (
    spark.read
    .schema(tokens_schema)
    .option("header", True)
    .format("csv")
    .load(basePath + "/tokens/*.csv")
)

In [14]:
# Load the historical price CSVs that sit directly under
# <basePath>/cmc_historicals/, treating the first line of each file as a header
# and applying the explicit schema instead of inferring one.
cmc_historicals_df = (
    spark.read
    .schema(cmc_historical_schema)
    .option("header", True)
    .format("csv")
    .load(basePath + "/cmc_historicals/*.csv")
)

In [15]:
%%html
<style>
/* Render preformatted cell output without line wrapping so that wide
   DataFrame.show() tables keep their column alignment. */
div.output_area pre {
    white-space: pre;
}
</style>

In [16]:
# Expose each DataFrame to Spark SQL as a temp view named after the Python
# variable that holds it, so the SQL cells can refer to them by the same name.
for view_name, frame in (
    ("token_transfers_df", token_transfers_df),
    ("transactions_df", transactions_df),
    ("tokens_df", tokens_df),
    ("cmc_historicals_df", cmc_historicals_df),
):
    frame.createOrReplaceTempView(view_name)

In [19]:
# Priced transfer volume per token contract and price timestamp.
#
# Each transfer is joined to its transaction (for the block timestamp), to the
# token metadata (for `decimals`) and to the price rows of the same token
# (addresses compared case-insensitively). The contribution of one transfer is
#     value / 10^decimals * open
# and contributions are summed per (token_address, cmc.timestamp).
#
# Time window: a transfer is attributed to a price row when its block timestamp
# lies in (cmc.timestamp - 86400, cmc.timestamp]. The upper bound is inclusive
# on purpose: with the previous strict `<`, a transfer stamped at exactly the
# price timestamp matched no row at all (the next daily row only starts
# strictly after it) and was silently dropped.
token_volume_result_df = spark.sql("""
SELECT  tt.token_address, cmc.timestamp, sum((tt.value / POWER(10, t.decimals)) * cmc.open) as tx_volume
FROM token_transfers_df tt
JOIN transactions_df txn ON tt.transaction_hash = txn.hash
JOIN tokens_df t ON LOWER(tt.token_address) = LOWER(t.address)
JOIN cmc_historicals_df cmc ON LOWER(cmc.address) = LOWER(tt.token_address)
WHERE txn.block_timestamp <= cmc.timestamp AND txn.block_timestamp >  cmc.timestamp - 86400
GROUP BY tt.token_address, cmc.timestamp
""")

# Normalise the result to the common (address, timestamp, tx_volume) layout:
# tx_volume as a fixed 6-decimal-place decimal, grouping key renamed to `address`.
token_df = (
    token_volume_result_df
    .withColumn("tx_volume", token_volume_result_df["tx_volume"].cast(DecimalType(38, 6)))
    .withColumnRenamed("token_address", "address")
)

In [20]:
# Preview the first 10 per-token rows without truncating long cell values.
token_df.show(n=10, truncate=False)



+------------------------------------------+----------+---------------+
|address                                   |timestamp |tx_volume      |
+------------------------------------------+----------+---------------+
|0x2170ed0880ac9a755fd29b2688956bd959f933f8|1685318399|1054768.605324 |
|0xfce146bf3146100cfe5db4129cf6c82b0ef4ad8c|1685318399|0.000047       |
|0x9c65ab58d8d978db963e63f2bfb7121627e3a739|1685318399|14030.151616   |
|0xa4080f1778e69467e905b8d6f72f6e441f9e9484|1685318399|1657.252888    |
|0xbf5140a22578168fd562dccf235e5d43a02ce9b1|1685318399|21100.271899   |
|0x02caa44eb838fc0e49b73213d9d22e5f23798fda|1685318399|364.769479     |
|0x9fb9a33956351cf4fa040f65a13b835a3c8764e3|1685318399|223.021360     |
|0x4338665cbb7b2485a8855a139b75d5e34ab0db94|1685318399|161.363328     |
|0x156ab3346823b651294766e23e6cf87254d68962|1685318399|24406589.595120|
|0xe552fb52a4f19e44ef5a967632dbc320b0820639|1685318399|0.743871       |
+------------------------------------------+----------+---------------+

                                                                                

In [21]:
# Priced transfer volume per sending address and price timestamp.
#
# Each transfer is joined to its transaction (for the block timestamp), to the
# token metadata (for `decimals`) and to the price rows of the same token
# (addresses compared case-insensitively). The contribution of one transfer is
#     value / 10^decimals * open
# and contributions are summed per (from_address, cmc.timestamp).
#
# Time window: a transfer is attributed to a price row when its block timestamp
# lies in (cmc.timestamp - 86400, cmc.timestamp]. The upper bound is inclusive
# on purpose: with the previous strict `<`, a transfer stamped at exactly the
# price timestamp matched no row at all (the next daily row only starts
# strictly after it) and was silently dropped.
from_address_volume_result_df = spark.sql("""
SELECT  tt.from_address, cmc.timestamp, sum((tt.value / POWER(10, t.decimals)) * cmc.open) as tx_volume
FROM token_transfers_df tt
JOIN transactions_df txn ON tt.transaction_hash = txn.hash
JOIN tokens_df t ON LOWER(tt.token_address) = LOWER(t.address)
JOIN cmc_historicals_df cmc ON LOWER(cmc.address) = LOWER(tt.token_address)
WHERE txn.block_timestamp <= cmc.timestamp AND txn.block_timestamp >  cmc.timestamp - 86400
GROUP BY tt.from_address, cmc.timestamp
""")

# Normalise the result to the common (address, timestamp, tx_volume) layout:
# tx_volume as a fixed 6-decimal-place decimal, grouping key renamed to `address`.
from_df = (
    from_address_volume_result_df
    .withColumn("tx_volume", from_address_volume_result_df["tx_volume"].cast(DecimalType(38, 6)))
    .withColumnRenamed("from_address", "address")
)

In [22]:
# Preview the first 10 per-sender rows without truncating long cell values.
from_df.show(n=10, truncate=False)



+------------------------------------------+----------+-----------+
|address                                   |timestamp |tx_volume  |
+------------------------------------------+----------+-----------+
|0x344ecc034d92db380a219d4345f2d1690d22e11b|1685318399|29.285650  |
|0x914558fddb93aa9b32b0e8e662673515b0f2e4ae|1685318399|822.841602 |
|0x76f4a2cd95bf5a0754011998ad893f55bb079a72|1685318399|1343.790739|
|0x3b756446c61b4702b34475022ae1851ba1a36691|1685318399|0.000000   |
|0xd6ff684aa01b676d0cc7cbc40a154dbf9d3d3079|1685318399|265.432143 |
|0x3d6d3ff403d82afb0673ae4182dfb1ad7ecbe5a8|1685318399|200.110599 |
|0x27bbe1d6c6b04694a94b2280ab5c424f54527b01|1685318399|300.165899 |
|0x95c95656f694a665cadc98b3f2d5da44f3404e81|1685318399|0.122929   |
|0xc108a9e4e1b4e2db4ac5a43bdda75f126d634caf|1685318399|219.351533 |
|0x88eb5797d4fd1adb62a8d73b2ba146c46d9e245d|1685318399|1.196090   |
+------------------------------------------+----------+-----------+
only showing top 10 rows



                                                                                

In [23]:
# Priced transfer volume per receiving address and price timestamp.
#
# Each transfer is joined to its transaction (for the block timestamp), to the
# token metadata (for `decimals`) and to the price rows of the same token
# (addresses compared case-insensitively). The contribution of one transfer is
#     value / 10^decimals * open
# and contributions are summed per (to_address, cmc.timestamp).
#
# Time window: a transfer is attributed to a price row when its block timestamp
# lies in (cmc.timestamp - 86400, cmc.timestamp]. The upper bound is inclusive
# on purpose: with the previous strict `<`, a transfer stamped at exactly the
# price timestamp matched no row at all (the next daily row only starts
# strictly after it) and was silently dropped.
to_address_volume_result_df = spark.sql("""
SELECT  tt.to_address, cmc.timestamp, sum((tt.value / POWER(10, t.decimals)) * cmc.open) as tx_volume
FROM token_transfers_df tt
JOIN transactions_df txn ON tt.transaction_hash = txn.hash
JOIN tokens_df t ON LOWER(tt.token_address) = LOWER(t.address)
JOIN cmc_historicals_df cmc ON LOWER(cmc.address) = LOWER(tt.token_address)
WHERE txn.block_timestamp <= cmc.timestamp AND txn.block_timestamp >  cmc.timestamp - 86400
GROUP BY tt.to_address, cmc.timestamp
""")

# Normalise the result to the common (address, timestamp, tx_volume) layout:
# tx_volume as a fixed 6-decimal-place decimal, grouping key renamed to `address`.
to_df = (
    to_address_volume_result_df
    .withColumn("tx_volume", to_address_volume_result_df["tx_volume"].cast(DecimalType(38, 6)))
    .withColumnRenamed("to_address", "address")
)

In [24]:
# Preview the first 10 per-receiver rows without truncating long cell values.
to_df.show(n=10, truncate=False)



+------------------------------------------+----------+-----------+
|address                                   |timestamp |tx_volume  |
+------------------------------------------+----------+-----------+
|0x2641bc9d55daf3ce52a06fa9d307c989951a5a03|1685318399|5583.014773|
|0x56ed0f243c8bd05c84ede4dafcafa69d1cec7240|1685318399|0.000000   |
|0xab78ba5e13bc29c742aba5bb7ebcc0cdc6969db5|1685318399|99.745186  |
|0x44f354aa259023b929b7a7b6d168259902334e31|1685318399|0.000000   |
|0x277cdce833e22d68bcddd504aef3a31ca7c584f8|1685318399|53.731171  |
|0x3538e99b0ad36179ecd8c1815664bbc56208e468|1685318399|0.000000   |
|0x317f03573b16b993c469473b0c8dc564df1db18c|1685318399|51.365714  |
|0x88eb5797d4fd1adb62a8d73b2ba146c46d9e245d|1685318399|0.001592   |
|0x04442b93e0ae32f1aa197bbc94e0b7fcf08a8524|1685318399|28.107658  |
|0x8df5e62c8c77434547bbf3dd7528e550e80b1734|1685318399|465.070860 |
+------------------------------------------+----------+-----------+
only showing top 10 rows



                                                                                

In [25]:
# Stack the per-token, per-sender and per-receiver volume tables into one
# (address, timestamp, tx_volume) table. Columns are matched by position and
# duplicate rows are kept.
result_df = token_df.union(from_df).union(to_df)

In [26]:
# Preview the first 10 rows of the combined table without truncating values.
result_df.show(n=10, truncate=False)



+------------------------------------------+----------+---------------+
|address                                   |timestamp |tx_volume      |
+------------------------------------------+----------+---------------+
|0x2170ed0880ac9a755fd29b2688956bd959f933f8|1685318399|1054768.605324 |
|0xfce146bf3146100cfe5db4129cf6c82b0ef4ad8c|1685318399|0.000047       |
|0x9c65ab58d8d978db963e63f2bfb7121627e3a739|1685318399|14030.151616   |
|0xa4080f1778e69467e905b8d6f72f6e441f9e9484|1685318399|1657.252888    |
|0xbf5140a22578168fd562dccf235e5d43a02ce9b1|1685318399|21100.271899   |
|0x02caa44eb838fc0e49b73213d9d22e5f23798fda|1685318399|364.769479     |
|0x9fb9a33956351cf4fa040f65a13b835a3c8764e3|1685318399|223.021360     |
|0x4338665cbb7b2485a8855a139b75d5e34ab0db94|1685318399|161.363328     |
|0x156ab3346823b651294766e23e6cf87254d68962|1685318399|24406589.595120|
|0xe552fb52a4f19e44ef5a967632dbc320b0820639|1685318399|0.743871       |
+------------------------------------------+----------+---------------+

                                                                                

In [None]:
# Disabled export step, kept for reference: when uncommented it collapses
# result_df to a single partition and writes it, with a header row, as CSV
# under <basePath>/tx-volumes/.
# result_df.repartition(1) \
#     .write \
#     .option("header",True) \
#     .csv(basePath + "/tx-volumes/")