In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, LongType, StructType, StructField, StringType, DoubleType, BooleanType, IntegerType
import pyspark.sql.functions as f
from pyspark.sql import Window
import findspark

In [10]:
# Initialise findspark, then start (or reuse) the Spark session for this notebook.
findspark.init()
spark = SparkSession.builder.appName('Crypto').getOrCreate()

# Column layout of the headerless, tab-separated quote log; every field is
# declared non-nullable.
SCHEMA = StructType([
    StructField(name, dtype, False)
    for name, dtype in [
        ("dt", LongType()),
        ("base", StringType()),
        ("quote", StringType()),
        ("exchange", StringType()),
        ("bidPrice", DoubleType()),
        ("bidQty", DoubleType()),
        ("askPrice", DoubleType()),
        ("askQty", DoubleType()),
    ]
])

# Read the raw quotes with the explicit schema (no type inference).
df = (
    spark.read
    .option("delimiter", '\t')
    .schema(SCHEMA)
    .csv("../logs/test_time3.tsv", header=False)
)

In [61]:
# All quotes that share the same timestamp and trading pair form one window.
w = Window.partitionBy("dt", "base", "quote")

# Rows carrying the highest bid of their window; only bid-side columns are kept.
bids = (
    df.withColumn("maxBid", f.max("bidPrice").over(w))
    .filter(f.col("bidPrice") == f.col("maxBid"))
    .drop("maxBid", "askPrice", "askQty")
    .withColumnRenamed("exchange", "bidExchange")
)

# Rows carrying the lowest ask of their window; only ask-side columns are kept.
asks = (
    df.withColumn("minAsk", f.min("askPrice").over(w))
    .filter(f.col("askPrice") == f.col("minAsk"))
    .drop("minAsk", "bidPrice", "bidQty")
    .withColumnRenamed("exchange", "askExchange")
)

# Pair best bid with best ask per window. The tradable quantity is the smaller
# of the two sizes and revenue is the price spread times that quantity; only
# strictly positive revenue is kept.
test = (
    bids.join(asks, on=["dt", "base", "quote"])
    .withColumn("Qty", f.least("bidQty", "askQty"))
    .withColumn("revenue", (f.col("bidPrice") - f.col("askPrice")) * f.col("Qty"))
    .filter(f.col("revenue") > 0)
)

In [62]:
# One row per (pair, bid exchange, ask exchange); "data" gathers that group's
# (dt, bidPrice, askPrice, Qty) records into a single array column.
test3 = (
    test.groupBy("base", "quote", "bidExchange", "askExchange")
    .agg(
        f.collect_list(
            f.struct("dt", "bidPrice", "askPrice", "Qty")
        ).alias("data")
    )
)

In [63]:
def calc_avg(x):
    """Return the largest element of ``x``.

    NOTE: despite the name, this currently computes a maximum, not a mean.
    The mean variant is kept below, disabled.
    """
    largest = max(x)
    # Disabled alternative (arithmetic mean): sum(x) / len(x)
    return largest


def calc_len(x):
    """Return the number of elements in ``x``."""
    count = len(x)
    return count


def get_values(row):
    """Unpack ``row`` into a ``(dt, bidPrice, askPrice, Qty)`` tuple."""
    wanted = ("dt", "bidPrice", "askPrice", "Qty")
    return tuple(row[field] for field in wanted)


def find(rows, gap=10 ** 8):
    """Group timestamps into contiguous runs and return each run's duration.

    The ``dt`` values are ordered ascending. Consecutive values that differ by
    at most ``gap`` belong to the same run; a larger jump starts a new run.
    Each run contributes ``last_dt - first_dt + gap``, so an isolated
    timestamp counts as a run of length ``gap``.

    Args:
        rows: Sequence of records exposing ``record["dt"]``. The sequence is
            not modified.
        gap: Largest difference between neighbouring ``dt`` values that still
            keeps them in the same run, in the same units as ``dt``.
            Defaults to ``10 ** 8``.

    Returns:
        list[int]: Run durations in chronological order. An empty (or
        ``None``) input yields an empty list.
    """
    if not rows:
        return []

    # Sort a copy of the timestamps so the caller's list is left untouched.
    stamps = sorted(row["dt"] for row in rows)

    durations = []
    run_start = previous = stamps[0]
    for current in stamps[1:]:
        if current - previous > gap:
            # Jump too large: close the running interval and open a new one.
            durations.append(previous - run_start + gap)
            run_start = current
        previous = current
    # Close the final (possibly only) run.
    durations.append(previous - run_start + gap)
    return durations


# Spark UDF wrappers around the plain-Python helpers; ``returnType`` declares
# the Spark type of the column each wrapper produces.
func = f.udf(find, returnType=ArrayType(LongType()))
func2 = f.udf(calc_avg, returnType=LongType())
func3 = f.udf(calc_len, returnType=IntegerType())

In [64]:
# "arbitrations": values computed by `find` from each group's "data" array.
# "avg_arb": `calc_avg` applied to that list.
test3 = (
    test3
    .withColumn("arbitrations", func(f.col("data")))
    .withColumn("avg_arb", func2(f.col("arbitrations")))
)

In [39]:
# Groups ordered by "avg_arb", largest first.
test3.orderBy(f.col("avg_arb").desc()).show()

+----+-----+-----------+-----------+--------------------+--------------------+---------+
|base|quote|bidExchange|askExchange|                data|        arbitrations|  avg_arb|
+----+-----+-----------+-----------+--------------------+--------------------+---------+
|BUSD| USDT|    binance|   poloniex|[{167147654957338...|[100000000, 10000...|200000000|
|NEAR| USDT|    binance|   poloniex|[{167147657983808...|[100000000, 10000...|200000000|
| CHZ| USDT|      huobi|   poloniex|[{167147655594898...|[100000000, 10000...|200000000|
| ADA| USDT|      huobi|    binance|[{167147654977484...|[100000000, 10000...|200000000|
|CTSI| USDT|   poloniex|       gate|[{167147655189518...|[100000000, 10000...|200000000|
| ETH| USDT|    binance|      huobi|[{167147655119006...|[100000000, 10000...|200000000|
|ETHW|  ETH|     kraken|   poloniex|[{167147655058567...|[100000000, 10000...|200000000|
|ATOM|  BTC|   poloniex|     kraken|[{167147655088787...|[100000000, 10000...|200000000|
| FIL|  BTC|   poloni

In [21]:
# Mean of "avg_arb" across all groups.
# NOTE(review): the recorded output of this cell is null. Presumably it was
# captured while `calc_avg` returned a float, which may not fit the LongType
# declared for `func2` — confirm by re-running on a fresh kernel.
test3.agg(f.avg("avg_arb")).show()

+------------+
|avg(avg_arb)|
+------------+
|        null|
+------------+



In [11]:
# Sorted unique timestamps. De-duplicate inside Spark so only the distinct
# values are collected to the driver, rather than one value per input row.
dts = sorted(row["dt"] for row in df.select("dt").distinct().collect())

In [12]:
# Gap between each pair of neighbouring timestamps in `dts`.
dif = [later - earlier for earlier, later in zip(dts, dts[1:])]

In [13]:
# Mean gap between consecutive distinct `dt` values (same units as `dt`).
sum(dif) / len(dif)

109529197.43589744

In [6]:
from statistics import median

In [14]:
# Median gap between consecutive distinct `dt` values.
median(dif)

109349300.0

In [89]:
# Write the gaps to a text file, one value per line.
with open("test3.txt", 'w') as file:
    file.writelines(str(gap) + '\n' for gap in dif)

In [15]:
# Largest gap between consecutive distinct `dt` values.
max(dif)

114865200

In [9]:
# Preview the first 30 rows of the raw quote table.
df.show(30)

+-------------+-----+-----+--------+---------+---------+--------+---------+
|           dt| base|quote|exchange| bidPrice|   bidQty|askPrice|   askQty|
+-------------+-----+-----+--------+---------+---------+--------+---------+
|1671534179936|  YFL| USDT|poloniex|     7.53| 3.851261|    8.44| 0.964906|
|1671531969024|REPV2| USDT|poloniex|     4.85|  0.15659|   5.347|  10.0904|
|1671535678904|  BCH|  BTC|poloniex| 0.005974|    25.28|0.006075|    18.56|
|1671535678967|  BCH| USDT|poloniex|   100.78| 0.387301|  100.79| 0.402171|
|1671535648622|SENSO|  BTC|poloniex| 7.921E-6|     11.0|8.391E-6|     37.0|
|1671535677005| ETHW| USDD|poloniex|    2.833|   0.8729|   3.221|     30.0|
|1671535677815| ETHW| USDT|poloniex|    2.949| 29.11399|   3.007|  1.43575|
|1671535582622| LUNA| USDT|poloniex|    1.287|217.45811|    1.29|    256.0|
|1671535576297| LUNC| USDT|poloniex|1.3064E-4|3577128.0| 1.42E-4|  15000.0|
|1671535621204|  XEN| USDD|poloniex|  2.13E-6|2184561.0|  2.5E-6|4021675.0|
|16715319690