In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, LongType, StructType, StructField, StringType, DoubleType, BooleanType, IntegerType
import pyspark.sql.functions as f
from pyspark.sql import Window
import findspark

In [27]:
findspark.init()
spark = SparkSession.builder.appName('Crypto').getOrCreate()
SCHEMA = StructType([
    StructField("dt", LongType(), False),
    StructField("base", StringType(), False),
    StructField("quote", StringType(), False),
    StructField("exchange", StringType(), False),
    StructField("bidPrice", DoubleType(), False),
    StructField("bidQty", DoubleType(), False),
    StructField("askPrice", DoubleType(), False),
    StructField("askQty", DoubleType(), False),
    
])
df = spark.read.options(delimiter='\t', ).csv("../logs/new_method.tsv", header=False, schema=SCHEMA)

In [28]:
w = Window.partitionBy(['dt', "base", "quote"])
bids = df.withColumn('maxBid', f.max('bidPrice').over(w))\
    .where(f.col('bidPrice') == f.col('maxBid'))\
    .drop('maxBid').withColumnRenamed("exchange", "bidExchange") \
    .drop("askPrice").drop("askQty")
asks = df.withColumn('minAsk', f.min('askPrice').over(w))\
    .where(f.col('askPrice') == f.col('minAsk'))\
    .drop('minAsk').withColumnRenamed("exchange", "askExchange") \
    .drop("bidPrice").drop("bidQty")
test = bids.join(asks, on=["dt", "base", "quote"]) \
    .withColumn("Qty", f.least("bidQty", "askQty")) \
    .withColumn("revenue", (f.col("bidPrice") - f.col("askPrice")) * f.col("Qty"))

In [29]:
test3 = test.groupBy(["base", "quote", "bidExchange", "askExchange"]) \
    .agg(f.collect_list(f.struct("dt", "bidPrice", "askPrice", "Qty", "revenue")).alias("data"))

In [30]:
def calc_avg(x):
    return sum(x) / len(x)


def calc_len(x):
    return len(x)


def find(rows):
    N = 10 ** 8
    if len(rows) == 1:
        return [N]
    arbitrages = []
    rows.sort(key=lambda x: x["dt"])
    old = rows[0]
    flag = old["revenue"] > 0
    dur = 0
    for row in rows[1:]:
        if row["revenue"] > 0 and flag:
            dur += row["dt"] - old["dt"]
        elif row["revenue"] > 0 and not flag:
            flag = True
        elif row["revenue"] <= 0 and flag:
            flag = False
            arbitrages.append(dur + N)
            dur = 0
        old = row
    arbitrages.append(dur + N)
    return arbitrages


func = f.udf(find, ArrayType(LongType()))
func2 = f.udf(calc_avg, DoubleType())
func3 = f.udf(calc_len, IntegerType())

In [31]:
test3 = test3.withColumn("arbitrations", func("data")) \
    .withColumn("avg_arb", func2("arbitrations"))

In [32]:
test3.sort("avg_arb", ascending=False).show()

+-----+-----+-----------+-----------+--------------------+--------------------+-------+
| base|quote|bidExchange|askExchange|                data|        arbitrations|avg_arb|
+-----+-----+-----------+-----------+--------------------+--------------------+-------+
|  XRP|  BTC|    binance|     kraken|[{167154809418799...|[36245780100, 756...|      8|
|  SOL|  BTC|    binance|     kraken|[{167154809297921...|[1006348300, 9090...|      6|
| MASK| USDT|       gate|    binance|[{167154809912043...|[502922800, 30146...|      6|
|  LTC|  BTC|    binance|     kraken|[{167154817226804...|[3729600600, 5271...|      4|
| MASK| USDT|    binance|       gate|[{167154811489925...|[11909255400, 120...|      4|
| MASK| USDT|       gate|      huobi|[{167154810764755...|[402191900, 26182...|      4|
|  APE| USDT|    binance|      huobi|[{167154810956145...|[2918757400, 1000...|      3|
| MASK| USDT|    binance|      huobi|[{167154811449667...|[100000000, 31788...|      3|
| ATOM| USDT|    binance|      h

In [21]:
test3.select(f.avg("avg_arb")).show()

+------------+
|avg(avg_arb)|
+------------+
|        null|
+------------+



In [12]:
dts = sorted(set([i["dt"] for i in df.select("dt").collect()]))

In [13]:
dif = [dts[i] - dts[i - 1] for i in range(1, len(dts))]

In [14]:
sum(dif) / len(dif)

102713734.65063001

In [6]:
from statistics import median

In [15]:
median(dif)

100730600.0

In [89]:
with open("test3.txt", 'w') as file:
    for i in dif:
        file.write(str(i) + '\n')

In [16]:
max(dif)

626323200

In [9]:
df.show(30)

+-------------+-----+-----+--------+---------+---------+--------+---------+
|           dt| base|quote|exchange| bidPrice|   bidQty|askPrice|   askQty|
+-------------+-----+-----+--------+---------+---------+--------+---------+
|1671534179936|  YFL| USDT|poloniex|     7.53| 3.851261|    8.44| 0.964906|
|1671531969024|REPV2| USDT|poloniex|     4.85|  0.15659|   5.347|  10.0904|
|1671535678904|  BCH|  BTC|poloniex| 0.005974|    25.28|0.006075|    18.56|
|1671535678967|  BCH| USDT|poloniex|   100.78| 0.387301|  100.79| 0.402171|
|1671535648622|SENSO|  BTC|poloniex| 7.921E-6|     11.0|8.391E-6|     37.0|
|1671535677005| ETHW| USDD|poloniex|    2.833|   0.8729|   3.221|     30.0|
|1671535677815| ETHW| USDT|poloniex|    2.949| 29.11399|   3.007|  1.43575|
|1671535582622| LUNA| USDT|poloniex|    1.287|217.45811|    1.29|    256.0|
|1671535576297| LUNC| USDT|poloniex|1.3064E-4|3577128.0| 1.42E-4|  15000.0|
|1671535621204|  XEN| USDD|poloniex|  2.13E-6|2184561.0|  2.5E-6|4021675.0|
|16715319690