Run the command `nc -lk 9999` in a terminal first, then type text into it periodically while the stream is running.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Build (or reuse, if one already exists) the SparkSession used by the
# socket word-count cells below.
session_builder = SparkSession.builder.appName("StructuredNetworkWordCount")
spark = session_builder.getOrCreate()

## Socket

In [None]:
# Read a stream of text lines from the netcat listener started with
# `nc -lk 9999`. The port here must match that listener (9092 is the Kafka
# broker port used later in this notebook, not the socket port).
# The rows are read through a single column, `value`, so no user-defined
# schema is passed to the reader.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Split each line on single spaces and emit one row per word.
words = lines.select(
    explode(split(lines.value, " ")).alias("word")
)

# Running count of every distinct word seen so far.
wordCounts = words.groupBy("word").count()

In [None]:
# Show the DataFrame object itself as the cell's output; the streaming query
# is only started in a later cell.
wordCounts

In [None]:
# Start the streaming query: in "complete" output mode the whole updated
# word-count table is written to the console sink.
query = (
    wordCounts.writeStream
    .outputMode("complete")
    .format("console")
    .start()
)

# Block here until the query is stopped or fails.
query.awaitTermination()

In [None]:
# Confirm that `lines` is a streaming DataFrame and show its schema.
# A bare expression that is not the last statement of a cell is evaluated and
# discarded, so the flag is printed explicitly.
print(lines.isStreaming)
lines.printSchema()

## Spark Structured Streaming with Kafka

In [None]:
from pyspark.sql.functions import (
    col, from_json, to_timestamp, from_unixtime, window, sum)
from pyspark.sql.types import (
    StructType, StructField, TimestampType, DoubleType, StringType,
    IntegerType)
import os
from pyspark.sql import SparkSession

# Ask PySpark to pull in the Kafka connector package when it launches.
# NOTE(review): this variable is presumably only read when the JVM is first
# started, so if a SparkSession already exists in this kernel (e.g. from the
# socket cells) a kernel restart may be needed -- confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'
)

spark = SparkSession.builder.appName('Pyspark_kafka').getOrCreate()

# Subscribe to the topic on the local Kafka broker.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "localhost:9092")
kafka_reader = kafka_reader.option("subscribe", "timeseries-topic")
df = kafka_reader.load()

# Debugging aid (kept disabled): cast `key` and `value` to STRING and write
# the raw records to the console sink to inspect what arrives on the topic.

# The message payload is parsed as JSON with a single double field, `value`.
# A disabled earlier variant also declared a string `timestamp` field in the
# payload and converted it with the pattern 'yyyy-MM-dd HH:mm:ss'; the active
# code selects the Kafka record's own `timestamp` column instead.
schema = StructType([StructField('value', DoubleType())])

# Keep the record timestamp, decode the payload to text, parse the JSON and
# flatten the parsed struct into top-level columns.
casted = df.selectExpr('CAST(timestamp AS TIMESTAMP)', 'CAST(value AS STRING)')
parsed = casted.select(
    'timestamp',
    from_json(col("value"), schema).alias('json'),
)
df = parsed.select('timestamp', 'json.*')

# Sum `value` per one-minute window of `timestamp`, with a one-minute
# watermark on the same column, then expand the window struct's fields
# next to the sum.
one_minute_window = window(col("timestamp"), "1 minute")
df = (
    df.withWatermark("timestamp", "1 minute")
    .groupBy(one_minute_window)
    .agg(sum('value').alias('sum'))
    .select('window.*', 'sum')
)

df.printSchema()

In [None]:
df.writeStream.format('console').start().awaitTermination()