In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, LongType, StructType, StructField, StringType, DoubleType, BooleanType, IntegerType
import pyspark.sql.functions as f
from pyspark.sql import Window
import findspark

In [103]:
# Make the local Spark installation importable, then start (or reuse) the session.
findspark.init()
spark = SparkSession.builder.appName('Crypto').getOrCreate()

# Column layout of the headerless, tab-separated quote log: one timestamp column,
# three string identifiers, then four floating-point price/quantity columns.
# Every field is declared non-nullable.
SCHEMA = StructType(
    [StructField("dt", LongType(), False)]
    + [StructField(name, StringType(), False) for name in ("base", "quote", "exchange")]
    + [StructField(name, DoubleType(), False) for name in ("bidPrice", "bidQty", "askPrice", "askQty")]
)

df = spark.read.options(delimiter='\t').csv(
    "../logs/not_async4.tsv",
    header=False,
    schema=SCHEMA,
)

In [61]:
# One window per (dt, base, quote): all exchanges quoting the same pair at the same dt.
w = Window.partitionBy('dt', "base", "quote")

# Rows whose bid equals the highest bid in their window (ties are all kept),
# reduced to the bid-side columns.
bids = (
    df.withColumn('maxBid', f.max('bidPrice').over(w))
    .filter(f.col('bidPrice') == f.col('maxBid'))
    .withColumnRenamed("exchange", "bidExchange")
    .drop('maxBid', "askPrice", "askQty")
)

# Rows whose ask equals the lowest ask in their window (ties are all kept),
# reduced to the ask-side columns.
asks = (
    df.withColumn('minAsk', f.min('askPrice').over(w))
    .filter(f.col('askPrice') == f.col('minAsk'))
    .withColumnRenamed("exchange", "askExchange")
    .drop('minAsk', "bidPrice", "bidQty")
)

# Pair best bids with best asks for the same (dt, base, quote). The tradable
# quantity is the smaller of the two sides, and revenue is the price spread
# times that quantity. Only strictly positive revenue is kept.
test = (
    bids.join(asks, on=["dt", "base", "quote"])
    .withColumn("Qty", f.least("bidQty", "askQty"))
    .withColumn("revenue", (f.col("bidPrice") - f.col("askPrice")) * f.col("Qty"))
    .filter(f.col("revenue") > 0)
)

In [62]:
# For every (pair, bid exchange, ask exchange) combination, gather all of its
# rows into a single array column "data" of (dt, bidPrice, askPrice, Qty) structs.
test3 = (
    test.groupBy("base", "quote", "bidExchange", "askExchange")
    .agg(
        f.collect_list(f.struct("dt", "bidPrice", "askPrice", "Qty")).alias("data")
    )
)

In [63]:
def calc_avg(x):
    """Return the largest value in ``x``, or ``None`` when ``x`` is empty or ``None``.

    Despite the name, this returns the maximum, not the mean. A true mean
    (``sum(x) / len(x)``) is a float, while this notebook wraps the function
    in a UDF declared as ``LongType``; switching to a mean would need the UDF's
    return type changed to ``DoubleType`` as well.

    Args:
        x: A sequence of comparable values (or ``None``).

    Returns:
        ``max(x)``, or ``None`` if there is nothing to take the maximum of.
    """
    # max() raises ValueError on an empty sequence and TypeError on None;
    # returning None instead avoids failing on such a row.
    if not x:
        return None
    return max(x)


def calc_len(x):
    """Return how many elements ``x`` contains."""
    size = len(x)
    return size


def get_values(row):
    """Return the ``dt``, ``bidPrice``, ``askPrice`` and ``Qty`` entries of ``row`` as a tuple.

    ``row`` may be any object that supports lookup by those string keys.
    """
    wanted = ("dt", "bidPrice", "askPrice", "Qty")
    return tuple(row[key] for key in wanted)


def find(rows, gap=10 ** 8):
    """Split the ``dt`` values of ``rows`` into contiguous runs and return each run's length.

    The timestamps are sorted. Two consecutive timestamps belong to the same
    run when they are at most ``gap`` apart; a larger jump closes the current
    run and starts a new one. The length of a run is
    ``last_dt - first_dt + gap``, so a run holding a single timestamp has
    length ``gap``.

    Args:
        rows: Sequence of records supporting ``row["dt"]`` lookup. It is not
            modified.
        gap: Largest difference between consecutive timestamps that still
            counts as the same run, in the same units as ``dt``. It is also
            the length credited to the final timestamp of every run.
            Defaults to ``10 ** 8``.
            NOTE(review): the gaps between distinct ``dt`` values measured
            later in this notebook have a median slightly above ``10 ** 8``,
            so the default may split most consecutive snapshots into
            separate runs. Confirm the intended threshold.

    Returns:
        A list with one run length per run, in chronological order. Empty
        when ``rows`` is empty or ``None``.
    """
    if not rows:
        return []

    # Work on a sorted copy of the timestamps so the caller's list keeps its order.
    timestamps = sorted(row["dt"] for row in rows)

    durations = []
    run_start = previous = timestamps[0]
    for current in timestamps[1:]:
        if current - previous > gap:
            # Jump too large: close the run that ended at `previous`.
            durations.append(previous - run_start + gap)
            run_start = current
        previous = current

    # Close the run that is still open once the timestamps are exhausted.
    durations.append(previous - run_start + gap)
    return durations


# Wrap the plain-Python helpers as Spark UDFs, stating the Spark type each one returns.
func = f.udf(find, returnType=ArrayType(LongType()))
func2 = f.udf(calc_avg, returnType=LongType())
func3 = f.udf(calc_len, returnType=IntegerType())

In [64]:
# "arbitrations": list produced by `func` from each group's collected rows.
# "avg_arb": single value produced by `func2` from that list.
test3 = (
    test3
    .withColumn("arbitrations", func("data"))
    .withColumn("avg_arb", func2("arbitrations"))
)

In [39]:
# Show the groups with the largest "avg_arb" first.
test3.orderBy("avg_arb", ascending=False).show()

+----+-----+-----------+-----------+--------------------+--------------------+---------+
|base|quote|bidExchange|askExchange|                data|        arbitrations|  avg_arb|
+----+-----+-----------+-----------+--------------------+--------------------+---------+
|BUSD| USDT|    binance|   poloniex|[{167147654957338...|[100000000, 10000...|200000000|
|NEAR| USDT|    binance|   poloniex|[{167147657983808...|[100000000, 10000...|200000000|
| CHZ| USDT|      huobi|   poloniex|[{167147655594898...|[100000000, 10000...|200000000|
| ADA| USDT|      huobi|    binance|[{167147654977484...|[100000000, 10000...|200000000|
|CTSI| USDT|   poloniex|       gate|[{167147655189518...|[100000000, 10000...|200000000|
| ETH| USDT|    binance|      huobi|[{167147655119006...|[100000000, 10000...|200000000|
|ETHW|  ETH|     kraken|   poloniex|[{167147655058567...|[100000000, 10000...|200000000|
|ATOM|  BTC|   poloniex|     kraken|[{167147655088787...|[100000000, 10000...|200000000|
| FIL|  BTC|   poloni

In [21]:
# Mean of the "avg_arb" column over all groups.
# NOTE(review): the recorded output for this cell is null, and its execution
# count (21) predates the cells that build test3 — presumably it comes from an
# earlier run in which every avg_arb was null; re-run to confirm.
test3.select(f.avg("avg_arb")).show()

+------------+
|avg(avg_arb)|
+------------+
|        null|
+------------+



In [104]:
# Distinct dt values in ascending order. De-duplication runs in Spark, so only
# one row per distinct timestamp is brought back to the driver rather than
# every row of the log.
dts = sorted(row["dt"] for row in df.select("dt").distinct().collect())

In [105]:
# Gap between each distinct dt and the one immediately before it.
dif = [later - earlier for earlier, later in zip(dts, dts[1:])]

In [106]:
# Mean gap between consecutive distinct dt values
# (raises ZeroDivisionError when dif is empty).
sum(dif) / len(dif)

102431706.044905

In [70]:
from statistics import median

In [107]:
# Median gap between consecutive distinct dt values.
median(dif)

100730800

In [89]:
# Write the gaps to test3.txt, one value per line, overwriting any existing file.
with open("test3.txt", 'w') as out:
    out.writelines(f"{gap}\n" for gap in dif)

In [108]:
# Largest gap between consecutive distinct dt values.
max(dif)

171542200