In [1]:
# Bootstrap step: findspark locates a Spark installation and makes the
# `pyspark` package importable. Keep this cell first -- the next cell imports
# from `pyspark`, so init() has to run before it.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, IntegerType, LongType, StringType, FloatType, DoubleType
from datetime import datetime
import json

In [3]:
# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name "test".
session_builder = SparkSession.builder.appName("test")
spark = session_builder.getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/18 19:53:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Explicit schema for the tab-separated quote log. Columns are listed in file
# order and every one of them is declared nullable.
logs_schema = StructType()
for column_name, column_type in (
    ("dt", LongType()),
    ("base", StringType()),
    ("quote", StringType()),
    ("exchange", StringType()),
    ("bidPrice", DoubleType()),
    ("bidQty", DoubleType()),
    ("askPrice", DoubleType()),
    ("askQty", DoubleType()),
):
    logs_schema = logs_schema.add(column_name, column_type, True)

# Read the file with the CSV reader, using a tab as the field separator and
# the schema above instead of schema inference.
tsv_reader = spark.read.format("csv").option("sep", "\t").schema(logs_schema)
df = tsv_reader.load("../logs2.tsv")

In [12]:
# Sanity-check the load: print a preview of the parsed rows (show() prints
# only the first rows of the frame, not the whole dataset).
df.show()

+-------------------+------+-----+--------+---------+-----------+---------+---------+
|                 dt|  base|quote|exchange| bidPrice|     bidQty| askPrice|   askQty|
+-------------------+------+-----+--------+---------+-----------+---------+---------+
|1671371812147545300|  ETHW|  ETH|poloniex|  0.00254|      6.935|  0.00257|    79.83|
|1671371812244287200|  ETHW|  ETH|poloniex|  0.00254|      6.935|  0.00257|    79.83|
|1671371812244287200|  ETHW| USDD|poloniex|     3.03|       30.0|    3.162|     30.0|
|1671371812244287200|  ETHW| USDT|poloniex|    3.006|       30.0|    3.038|143.32018|
|1671371812244287200|  LUNC| USDT|poloniex|1.3893E-4|  2705996.0|1.4916E-4|2317449.0|
|1671371812244287200|SANTOS| USDT|poloniex|   5.1317|   113.8612|   5.3903|   0.5108|
|1671371812244287200|SANTOS| USDD|poloniex|    5.106|     0.2434|   5.4143|   0.3779|
|1671371812244287200|   XEN| USDD|poloniex|  2.84E-6|1.0959756E7|  3.11E-6| 967137.0|
|1671371812244287200|   XEN| USDT|poloniex|  2.63E-6| 

In [5]:
# Quotes are compared with each other only when they share the same
# timestamp and the same base/quote pair.
key_cols = ["dt", "base", "quote"]
w = Window.partitionBy(key_cols)

# Bid side: keep the row(s) carrying the highest bid price of each group and
# discard the ask-side columns.
highest_bid = f.max("bidPrice").over(w)
bids = (
    df.withColumn("maxBid", highest_bid)
    .filter(f.col("bidPrice") == f.col("maxBid"))
    .drop("maxBid", "askPrice", "askQty")
)

# Ask side: keep the row(s) carrying the lowest ask price of each group,
# discard the bid-side columns, and rename `exchange` so it does not clash
# with the bid-side `exchange` column after the join.
lowest_ask = f.min("askPrice").over(w)
asks = (
    df.withColumn("minAsk", lowest_ask)
    .filter(f.col("askPrice") == f.col("minAsk"))
    .drop("minAsk", "bidPrice", "bidQty")
    .withColumnRenamed("exchange", "exchange_ask")
)

# Pair best bid with best ask per group. The tradable quantity is the smaller
# of the two sizes, and revenue is the price gap times that quantity.
# Only rows with strictly positive revenue are kept.
price_gap = f.col("bidPrice") - f.col("askPrice")
arb = (
    bids.join(asks, on=key_cols)
    .withColumn("Qty", f.least("bidQty", "askQty"))
    .withColumn("revenue", price_gap * f.col("Qty"))
    .filter(f.col("revenue") > 0)
)
arb.show()


