In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, window
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.dataframe import DataFrame

from time import sleep

In [6]:
# --- Application ------------------------------------------------------------
APP_NAME = "BigDataStreaming"

# --- Kafka source -----------------------------------------------------------
KAFKA_SERVER = "localhost"
KAFKA_PORT = 9094
KAFKA_CLIENT_VERSION = "3.7.0"
TOPIC = "market"  # Kafka topic carrying the market messages

# --- Spark runtime ----------------------------------------------------------
# The Scala and Spark versions are interpolated into the Maven coordinates of
# the connector packages requested when the session is built.
SCALA_VERSION = "2.12"
SPARK_VERSION = "3.5.1"
SPARK_MASTER = "local[*]"
SHUFFLE_PARTITIONS = 5  # value used for spark.sql.shuffle.partitions

# --- Cassandra sink ---------------------------------------------------------
CASSANDRA_SERVER = "localhost"
CASSANDRA_PORT = 9042

In [3]:
# Maven coordinates resolved when the session starts: the Kafka source, the
# Kafka client library, the Avro functions and the Cassandra connector.
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION}',
    f'org.apache.kafka:kafka-clients:{KAFKA_CLIENT_VERSION}',
    f'org.apache.spark:spark-avro_{SCALA_VERSION}:{SPARK_VERSION}',
    f"com.datastax.spark:spark-cassandra-connector-assembly_{SCALA_VERSION}:3.5.0",
]

# Build (or reuse) the session; every setting comes from the config constants.
spark = (
    SparkSession.builder
    .master(SPARK_MASTER)
    .appName(APP_NAME)
    .config("spark.sql.shuffle.partitions", str(SHUFFLE_PARTITIONS))
    .config("spark.jars.packages", ",".join(packages))
    .config("spark.cassandra.connection.host", f"{CASSANDRA_SERVER}:{CASSANDRA_PORT}")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

your 131072x1 screen size is bogus. expect trouble
24/04/19 20:18:00 WARN Utils: Your hostname, furyPIRATE resolves to a loopback address: 127.0.1.1; using 172.29.240.184 instead (on interface eth0)
24/04/19 20:18:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/furypirate/.local/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/furypirate/.ivy2/cache
The jars for the packages stored in: /home/furypirate/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
com.datastax.spark#spark-cassandra-connector-assembly_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-be6c327d-e4de-4df4-9f18-e927c064c12a;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.c

In [4]:
# Streaming DataFrame over the Kafka topic named by TOPIC.
# The topic comes from the config constant rather than a repeated literal, so
# changing TOPIC really changes what is consumed.
market_stream: DataFrame = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", f"{KAFKA_SERVER}:{KAFKA_PORT}")
    .option("subscribe", TOPIC)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "false")
    .load()
)
market_stream.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



DataFrame transformation query that unpacks the Avro payload into its individual fields and converts the trade time into a proper timestamp column.

In [7]:
# Load the Avro schema text; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("../finnhub/trade.avsc", "r") as schema_file:
    avro_schema = schema_file.read()

# Decode the Kafka `value` bytes with the Avro schema, emit one row per element
# of the message's `data` array, and rename the short field names. `t` is
# divided by 1000 before the timestamp cast (presumably epoch milliseconds --
# confirm against the producer).
trades_stream = (
    market_stream
    .select(from_avro("value", avro_schema).alias("trade_data"), "offset")
    .select(explode("trade_data.data").alias("trade"), "offset")
    .select("trade.*", "offset")
    .selectExpr(
        "p as price",
        "s as symbol",
        "v as volume",
        "cast(cast(t as double) / 1000 as timestamp) as event_time",
        "offset",
    )
)

trades_stream.printSchema()

# Continuously append the decoded trades to the Cassandra table market.trades.
# The handle is kept in a variable so the query can be inspected or stopped.
trades_query = (
    trades_stream.writeStream
    .queryName("trades")
    .format("org.apache.spark.sql.cassandra")
    .option("checkpointLocation", '/tmp/checkpoint/trades/')
    .options(table="trades", keyspace="market")
    .outputMode("append")
    .start()
)

# Bare expression so the notebook still displays the StreamingQuery handle.
trades_query

root
 |-- price: double (nullable = false)
 |-- symbol: string (nullable = false)
 |-- volume: double (nullable = false)
 |-- event_time: timestamp (nullable = true)
 |-- offset: long (nullable = true)



24/04/19 20:18:13 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7f6c20c29400>

                                                                                

Query for the `minute_trades` table, meant to compute the count and average price of trades over 1-minute windows.

In [None]:
# Count trades per 1-minute event-time window and keep the full running result
# in an in-memory table named "minute_trades" (readable through spark.sql).
# The window is one minute, matching the query name; it was previously "1 day".
minute_trades_query = (
    trades_stream
    .groupBy(window("event_time", "1 minute"))
    .count()
    .writeStream
    .queryName("minute_trades")
    .format("memory")
    .outputMode("complete")
    .start()
)

# Planned variant, not enabled yet: richer aggregation written to Cassandra.
# Aggregation steps that would replace `.count()` above:
#     .agg({"*" : "count", "price" : "avg", "offset" : "max"})
#     .withColumnsRenamed({"avg(price)":"avg_price", "count(1)":"total", "max(offset)":"id"})
#     .selectExpr("id", "avg_price", "total", "window.end as event_time", "window.start as start")
# Sink that would replace the memory sink:
#     .format("org.apache.spark.sql.cassandra")
#     .option("checkpointLocation", '/tmp/checkpoint/minute_trades/')
#     .option("confirm.truncate", "true")
#     .options(table = "minute_trades", keyspace = "market")
#     .outputMode("Append")
#     .start()

In [None]:
# Poll the in-memory result once per second while its query is running.
# The memory sink registers its result under the query name "minute_trades";
# the "trades" query writes to Cassandra and registers no Spark SQL table, so
# selecting from "trades" cannot be resolved by this session.
try:
    while minute_trades_query.isActive:
        spark.sql("SELECT * FROM minute_trades").show(truncate=False)
        sleep(1)
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end the polling loop.
    print("Polling stopped.")

In [None]:
# Stop every streaming query that is still running before tearing down the
# session, so the queries are stopped explicitly rather than being cut off by
# the session shutdown.
for active_query in spark.streams.active:
    active_query.stop()

spark.stop()