In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, LongType, StructType, StructField, StringType, DoubleType, BooleanType, IntegerType
import pyspark.sql.functions as f
from pyspark.sql import Window
import findspark

findspark.init()
spark = SparkSession.builder.appName('Crypto').getOrCreate()
SCHEMA = StructType([
    StructField("dt", LongType(), False),
    StructField("base", StringType(), False),
    StructField("quote", StringType(), False),
    StructField("exchange", StringType(), False),
    StructField("bidPrice", DoubleType(), False),
    StructField("bidQty", DoubleType(), False),
    StructField("askPrice", DoubleType(), False),
    StructField("askQty", DoubleType(), False),
    
])
df = spark.read.options(delimiter='\t', ).csv("../logs/test_dict3.tsv", header=False, schema=SCHEMA)

### средняя разница времени записи

In [4]:
dts = sorted(set([i["dt"] for i in df.take(df.count())]))
dif = [dts[i] - dts[i-1] for i in range(1, len(dts))]
from statistics import median
print(sum(dif) / len(dif))
print(median(dif))
print(max(dif))

102478454.52961673
100730500.0
200478000


In [None]:
w = Window.partitionBy(['dt', "base", "quote"])

bids = df.withColumn('maxBid', f.max('bidPrice').over(w))\
    .where(f.col('bidPrice') == f.col('maxBid'))\
    .drop('maxBid').withColumnRenamed("exchange", "bidExchange") \
    .drop("askPrice").drop("askQty")
asks = df.withColumn('minAsk', f.min('askPrice').over(w))\
    .where(f.col('askPrice') == f.col('minAsk'))\
    .drop('minAsk').withColumnRenamed("exchange", "askExchange") \
    .drop("bidPrice").drop("bidQty")
test = bids.join(asks, on=["dt", "base", "quote"]) \
test = test.withColumn("Qty", f.least("bidQty", "askQty")) \
    .withColumn("revenue", (f.col("bidPrice") - f.col("askPrice")) * f.col("Qty"))

test3 = test.groupBy(["base", "quote", "bidExchange", "askExchange"]) \
    .agg(f.collect_list(f.struct("dt", "bidPrice", "askPrice", "Qty", "revenue")).alias("data"))

In [121]:
def calc_avg(x):
    return sum(x) / len(x)


def calc_len(x):
    return len(x) >= 1


def test(x):
    return len(x)


def find(rows):
    N = 10 ** 8
    if len(rows) == 1:
        return [N]
    arbitrages = []
    rows.sort(key=lambda x: x["dt"])
    old = rows[0]
    dur = 0
    for row in rows[1:]:
        if row["revenue"] > 0 and old["revenue"] > 0:
            dur += row["dt"] - old["dt"]
        elif row["revenue"] <= 0 and old["revenue"] > 0:
            arbitrages.append(dur + N)
            dur = 0
        old = row
    if row["revenue"] > 0:
        arbitrages.append(dur + N)
    return arbitrages


func = f.udf(find, ArrayType(LongType()))
func2 = f.udf(calc_avg, DoubleType())
func3 = f.udf(calc_len, BooleanType())
func4 = f.udf(test, IntegerType())

In [122]:
test3 = test3.withColumn("arbitrations", func("data")) \
    .withColumn("is_not_empty", func3("arbitrations")) \
    .withColumn("len", func4("arbitrations"))
test3 = test3[test3["is_not_empty"] == True].withColumn("avg_arb", func2("arbitrations"))

### Метрики

In [123]:
test3.sort("len", ascending=False).show()

+-----+-----+-----------+-----------+--------------------+--------------+------------+---+--------------+
| base|quote|bidExchange|askExchange|                data|  arbitrations|is_not_empty|len|       avg_arb|
+-----+-----+-----------+-----------+--------------------+--------------+------------+---+--------------+
|  ASK| USDT|       gate|       gate|[{167154826696461...|   [100000000]|        true|  1|         1.0E8|
| BABY| USDT|       gate|      huobi|[{167154814091871...|[126246133700]|        true|  1|1.262461337E11|
|  ETC| USDT|      huobi|      huobi|[{167154808893843...|   [100000000]|        true|  1|         1.0E8|
|  FTM|  BTC|   poloniex|    binance|[{167154809237532...|[174689286300]|        true|  1|1.746892863E11|
|  GMT| USDT|   poloniex|    binance|[{167154808883819...|[178226418000]|        true|  1| 1.78226418E11|
|  HFT| USDT|      huobi|    binance|[{167154815452874...|   [100000000]|        true|  1|         1.0E8|
|LAZIO| USDT|    binance|   poloniex|[{1671548

In [104]:
test3.select(f.avg("avg_arb")).show()

+--------------------+
|        avg(avg_arb)|
+--------------------+
|5.369866041818181...|
+--------------------+



In [139]:
def get_id(x):
    d = {"binance": 1.0, "poloniex": 2.0, "gate": 3.0, "huobi": 4.0, "kraken": 5.0}
    return d[x]

func_id = f.udf(get_id, DoubleType())
df = df.withColumn("idExchange", func_id("exchange"))

In [140]:
df.show(20)

+-------------------+--------+-----+--------+--------+-------------+---------+-------------+----------+
|                 dt|    base|quote|exchange|bidPrice|       bidQty| askPrice|       askQty|idExchange|
+-------------------+--------+-----+--------+--------+-------------+---------+-------------+----------+
|1671548087626431900|    LINK| USDT|poloniex|  6.0428|      1.36753|   6.0713|       2.8205|       2.0|
|1671548087626431900|     ETH| USDT|poloniex| 1206.37|    62.021761|  1207.33|     5.617459|       2.0|
|1671548087727163100|    LINK| USDT|poloniex|  6.0428|      1.36753|   6.0713|       2.8205|       2.0|
|1671548087727163100|     ETH| USDT|poloniex| 1206.37|    62.021761|  1207.33|     5.617459|       2.0|
|1671548087727163100|     TRX| USDT|poloniex| 0.05436|       58.867|  0.05437|   451461.546|       2.0|
|1671548087727163100|     SUN| USDT|poloniex|0.005276|   1735755.43|0.0052934|    724549.91|       2.0|
|1671548087727163100|POLYDOGE| USDT|poloniex|  3.1E-9|7.65283137