In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, LongType, StructType, StructField, StringType, DoubleType, BooleanType, IntegerType
import pyspark.sql.functions as f
from pyspark.sql import Window
import matplotlib.pyplot as plt

In [2]:
# Start (or reuse) the Spark session shared by every later cell.
spark = SparkSession.builder.appName('Crypto').getOrCreate()

# Column layout of the tab-separated log file, in file order.
# Every field is declared with nullable=False.
SCHEMA = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in [
        ("dt", LongType()),
        ("base", StringType()),
        ("quote", StringType()),
        ("baseWithdrawalFee", DoubleType()),
        ("baseWithdrawalFeeType", StringType()),
        ("quoteWithdrawalFee", DoubleType()),
        ("quoteWithdrawalFeeType", StringType()),
        ("exchange", StringType()),
        ("bidPrice", DoubleType()),
        ("bidQty", DoubleType()),
        ("bidFee", DoubleType()),
        ("askPrice", DoubleType()),
        ("askQty", DoubleType()),
        ("askFee", DoubleType()),
    ]
])

# The file has no header row, so columns are matched to SCHEMA by position.
df = (
    spark.read
    .option("delimiter", '\t')
    .csv("../logs/logs_1743.tsv", schema=SCHEMA, header=False)
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/22 18:05:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# df = df.drop("baseWithdrawalFee").drop("baseWithdrawalFeeType").drop("quoteWithdrawalFee").drop("quoteWithdrawalFeeType")
# df[df["takerFee"].isNull()].show()

In [4]:
# Numeric id assigned to each known exchange name. Ids are floats so they can
# share a single array-of-doubles with prices and quantities downstream.
EXCHANGE_IDS = {
    "binance": 1.0,
    "poloniex": 2.0,
    "gate": 3.0,
    "huobi": 4.0,
    "kraken": 5.0,
    "bybit": 6.0,
    "bitget": 7.0,
}


def get_id(x):
    """Map an exchange name to its numeric id.

    Args:
        x: exchange name as it appears in the ``exchange`` column
            (may be ``None`` when the source value is missing).

    Returns:
        The float id from ``EXCHANGE_IDS``, or ``None`` when the name is
        missing or unknown. Returning ``None`` (a SQL null once wrapped in a
        UDF) keeps one unexpected exchange name from raising ``KeyError``
        inside a worker and aborting the whole Spark job.
    """
    return EXCHANGE_IDS.get(x)

# Expose get_id to Spark as a UDF producing a double.
func_id = f.udf(f=get_id, returnType=DoubleType())

# Add the numeric exchange id next to the textual exchange name.
df = df.withColumn("idExchange", func_id(f.col("exchange")))

In [5]:
# for e in range(1, 6):
#     exch = df[(df["idExchange"] == float(e)) & (df["base"] == "TRX") & (df["quote"] == "USDT")].collect()
#     bid = [e["bidPrice"] for e in exch]
#     ask = [e["askPrice"] for e in exch]
#     dt = [e["dt"] for e in exch]
#     plt.plot(dt, bid, label=f"{e} bid")
#     plt.plot(dt, ask, label=f"{e} ask")
    
# plt.legend()
# plt.show()

In [6]:
# For every (dt, base, quote) key, gather the quotes of all exchanges into one
# array column "data"; each element is a struct holding one exchange's values.
test = (
    df.groupBy("dt", "base", "quote")
    .agg(
        f.collect_list(
            f.struct(
                "idExchange",
                "bidPrice",
                "bidQty",
                "askPrice",
                "askQty",
                "bidFee",
                "askFee",
            )
        ).alias("data")
    )
)

In [7]:
def mult(rows):
    """Build every ordered pair of rows that come from different exchanges.

    Args:
        rows: sequence of mapping-like records exposing the keys
            ``idExchange``, ``bidPrice``, ``bidQty``, ``bidFee``,
            ``askPrice``, ``askQty`` and ``askFee``.

    Returns:
        A list of 8-element lists, one per ordered pair whose ``idExchange``
        values differ, laid out as
        ``[bid idExchange, ask idExchange, bidPrice, bidQty, askPrice,
        askQty, bidFee, askFee]``. Bid-side fields come from the first row of
        the pair and ask-side fields from the second.
    """
    return [
        [
            bid_side["idExchange"],
            ask_side["idExchange"],
            bid_side["bidPrice"],
            bid_side["bidQty"],
            ask_side["askPrice"],
            ask_side["askQty"],
            bid_side["bidFee"],
            ask_side["askFee"],
        ]
        for bid_side in rows
        for ask_side in rows
        # Rows sharing an exchange id (including a row with itself) are skipped.
        if bid_side["idExchange"] != ask_side["idExchange"]
    ]

In [8]:
# Spark UDF wrapper around mult; each pair comes back as an array of doubles.
func_mult = f.udf(f=mult, returnType=ArrayType(ArrayType(DoubleType())))

In [9]:
# Attach the cross-exchange pairs computed from each group's "data" array.
test = test.withColumn("multed", func_mult(f.col("data")))

In [10]:
def pyspark_len(x):
    """Return the number of elements in ``x`` (thin wrapper over ``len``)."""
    element_count = len(x)
    return element_count

# Spark UDF wrapper around pyspark_len, returning an integer.
func_len = f.udf(f=pyspark_len, returnType=IntegerType())

In [11]:
# Number of cross-exchange pairs per row. Spark's built-in size() is used
# instead of a Python UDF so the array is measured inside the JVM without
# shipping every row to a Python worker. Both produce an integer column.
test = test.withColumn("len", f.size("multed"))

In [12]:
# Keep only rows that produced at least one cross-exchange pair.
test = test.filter(f.col("len") > 0)

In [13]:
# One output row per pair; explode() names the generated column "col".
test = test.select("dt", "base", "quote", f.explode(f.col("multed")))

In [14]:
# Spread the 8-element "col" array into eight separate columns (col[0]..col[7]).
test = test.select(
    "dt",
    "base",
    "quote",
    *(f.col("col")[position] for position in range(8)),
)

In [15]:
# Give the positional "col[i]" columns their real names. The order matches
# the element layout produced by mult.
for old_name, new_name in {
    "col[0]": "bidExchange",
    "col[1]": "askExchange",
    "col[2]": "bidPrice",
    "col[3]": "bidQty",
    "col[4]": "askPrice",
    "col[5]": "askQty",
    "col[6]": "bidFee",
    "col[7]": "askFee",
}.items():
    test = test.withColumnRenamed(old_name, new_name)

In [16]:
# Tradable quantity is capped by the smaller of the two sides.
test = test.withColumn("Qty", f.least(f.col("bidQty"), f.col("askQty")))

# revenue = (bidPrice * (1 - bidFee) - askPrice / (1 - askFee)) * Qty
test = test.withColumn(
    "revenue",
    (
        f.col("bidPrice") * (1 - f.col("bidFee"))
        - f.col("askPrice") / (1 - f.col("askFee"))
    )
    * f.col("Qty"),
)

In [17]:
# import numpy as np

# test[test["revenue"].isNull()].show()

In [18]:
# Collect, per trading pair and per (bidExchange, askExchange) direction, all
# observations into a single array column "data".
test3 = (
    test.groupBy("base", "quote", "bidExchange", "askExchange")
    .agg(
        f.collect_list(
            f.struct("dt", "bidPrice", "askPrice", "Qty", "revenue")
        ).alias("data")
    )
)

def calc_avg(x):
    """Return the arithmetic mean of the numbers in ``x``.

    Args:
        x: list of numbers, or ``None``.

    Returns:
        ``sum(x) / len(x)``, or ``None`` when ``x`` is ``None`` or empty.
        The ``None`` result (a SQL null once wrapped in a UDF) replaces the
        ``ZeroDivisionError`` / ``TypeError`` that would otherwise abort the
        Spark job; Spark does not guarantee that a preceding filter is
        evaluated before a UDF, so the function guards itself.
    """
    if not x:
        return None
    return sum(x) / len(x)


def calc_len(x):
    """Return ``True`` when ``x`` holds at least one element, else ``False``."""
    has_elements = len(x) > 0
    return has_elements


def test_len(x):
    return len(x)


def find(rows):
    """Measure each uninterrupted run of positive-revenue observations.

    The observations are ordered by ``dt``; every maximal run of consecutive
    rows whose ``revenue`` is strictly positive yields one entry equal to
    ``(dt of last row in run) - (dt of first row in run) + N``.

    Args:
        rows: sequence of mapping-like records exposing ``dt`` and
            ``revenue``. The sequence is not modified.

    Returns:
        A list of integers, one per run, in chronological order. Empty when
        there are no rows or no row has positive revenue.

    Notes:
        * A lone observation is treated like any other run: it is reported
          (as ``N``) only if its revenue is positive. Previously a
          single-row input always produced ``[N]``.
        * A missing (``None``) revenue counts as "not profitable" and ends
          the current run instead of raising ``TypeError``.
    """
    # Constant offset added to every reported duration. Kept as in the
    # original code; NOTE(review): its purpose is not visible here - confirm.
    N = 10 ** 8

    def is_profitable(row):
        revenue = row["revenue"]
        return revenue is not None and revenue > 0

    arbitrages = []
    run_start = None  # dt of the first row of the run in progress, if any
    run_end = None    # dt of the most recent row of that run

    # sorted() leaves the caller's list untouched (list.sort() mutated it).
    for row in sorted(rows, key=lambda r: r["dt"]):
        if is_profitable(row):
            if run_start is None:
                run_start = row["dt"]
            run_end = row["dt"]
        elif run_start is not None:
            # The run just ended: record its length and reset.
            arbitrages.append(run_end - run_start + N)
            run_start = None

    # A run still open at the end of the data is reported as well.
    if run_start is not None:
        arbitrages.append(run_end - run_start + N)
    return arbitrages


# Spark UDF wrappers for the helpers above, each with its Spark return type.
func = f.udf(f=find, returnType=ArrayType(LongType()))    # list of run lengths
func2 = f.udf(f=calc_avg, returnType=DoubleType())        # mean of a list
func3 = f.udf(f=calc_len, returnType=BooleanType())       # non-empty flag
func4 = f.udf(f=test_len, returnType=IntegerType())       # element count

# Derive, per group: the list of run lengths, whether that list has any
# entries, and how many entries it has.
test3 = (
    test3.withColumn("arbitrations", func(f.col("data")))
    .withColumn("is_not_empty", func3(f.col("arbitrations")))
    .withColumn("len", func4(f.col("arbitrations")))
)

# Drop groups without any run, then average the run lengths of the rest.
test3 = (
    test3.filter(f.col("is_not_empty") == True)
    .withColumn("avg_arb", func2(f.col("arbitrations")))
)

In [None]:
# test3.show()

In [None]:
# Show the groups with the largest average run length first.
test3.orderBy(f.col("avg_arb").desc()).show()

