In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, LongType, StructType, StructField, StringType, DoubleType
import pyspark.sql.functions as f
from pyspark.sql import Window
import findspark

In [2]:
findspark.init()
spark = SparkSession.builder.appName('Crypto').getOrCreate()
SCHEMA = StructType([
    StructField("dt", LongType(), False),
    StructField("base", StringType(), False),
    StructField("quote", StringType(), False),
    StructField("exchange", StringType(), False),
    StructField("bidPrice", DoubleType(), False),
    StructField("bidQty", DoubleType(), False),
    StructField("askPrice", DoubleType(), False),
    StructField("askQty", DoubleType(), False),
    
])
df = spark.read.options(delimiter='\t', ).csv("../logs/logs3.tsv", header=False, schema=SCHEMA)

In [3]:
w = Window.partitionBy(['dt', "base", "quote"])
bids = df.withColumn('maxBid', f.max('bidPrice').over(w))\
    .where(f.col('bidPrice') == f.col('maxBid'))\
    .drop('maxBid').withColumnRenamed("exchange", "bidExchange") \
    .drop("askPrice").drop("askQty")
asks = df.withColumn('minAsk', f.min('askPrice').over(w))\
    .where(f.col('askPrice') == f.col('minAsk'))\
    .drop('minAsk').withColumnRenamed("exchange", "askExchange") \
    .drop("bidPrice").drop("bidQty")
test = bids.join(asks, on=["dt", "base", "quote"]) \
    .withColumn("Qty", f.least("bidQty", "askQty")) \
    .withColumn("revenue", (f.col("bidPrice") - f.col("askPrice")) * f.col("Qty"))
test = test[test["revenue"] > 0]

In [4]:
test3 = test.groupBy(["base", "quote", "bidExchange", "askExchange"]) \
    .agg(f.collect_list(f.struct("dt", "bidPrice", "askPrice", "Qty")).alias("data"))

In [14]:
def get_values(row):
    return row["dt"], row["bidPrice"], row["askPrice"], row["Qty"]


def find(rows):
    if len(rows) == 1:
        return [10**8]
    arbitrages = []
    rows.sort(key=lambda x: x["dt"])
    old_dt = rows[0]["dt"]
    start = old_dt
    for row in rows[1:]:
        new_dt, bid, ask, qty = get_values(row)
        if new_dt - old_dt > 10**8:
            arbitrages.append(old_dt - start)
            start = new_dt
        old_dt = new_dt
    #if rows[-1]["dt"] - rows[-2]["dt"] > 10**8:
    #    arbitrages.append(10**8)
    return arbitrages

func = f.udf(find, ArrayType(LongType()))

In [10]:
def calc(rows):
    if len(rows) == 1:
        return 0
    old = rows[0]["dt"]
    c = 0
    for row in rows[1:]:
        new = row["dt"]
        if new - old <= 10**8:
            c += 1
        old = new
    return c


func3 = f.udf(calc, LongType())

In [11]:
test3 = test3.withColumn("calc", func3("data"))

In [12]:
test3.select(f.sum("calc")).show()

+---------+
|sum(calc)|
+---------+
|      720|
+---------+



In [15]:
test3 = test3.withColumn("arbitrations", func("data"))

In [16]:
def calc_avg(x):
    return sum(x) / len(x)
    

func2 = f.udf(calc_avg, DoubleType())
test3 = test3.withColumn("avg_arb", func2("arbitrations"))

In [8]:
test3[test3["avg_arb"] > 10**8].count()

34

In [None]:
test.sort("avg_arb", ascending=False).show()

In [17]:
a = test3.take(1)

In [18]:
a

[Row(base='VEMP', quote='USDT', bidExchange='gate', askExchange='huobi', data=[Row(dt=1671433025246882100, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433026000375600, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433026100835800, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433026302201200, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433026402596900, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433026503119200, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433027005694700, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433027307166800, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433027708813800, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433027809801700, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433028010612700, bidPrice=0.01801, askPrice=0.01793, Qty=1046.745), Row(dt=1671433028311974700, bidPrice=0.01801, askPri