In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Create the notebook-wide Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('StructStreaming')
    .getOrCreate()
)
spark  # last expression of the cell: show the session object as the output

In [3]:
import pandas as pd
import numpy as np
# Partitions: build a small random "transactions" table in pandas, hand it to
# Spark, and inspect how Spark split it into partitions.
SEED = 42  # fixed seed so a Restart & Run All reproduces the same rows
length = 100
rng = np.random.default_rng(SEED)

# None is a deliberate choice so that some names / countries are missing (null).
names = rng.choice(['Bob', 'James', 'Marek', 'Johannes', None], length)
amounts = rng.integers(0, 1000000, length)  # upper bound is exclusive
country = rng.choice(
    ['United Kingdom', 'Poland', 'USA', 'Germany', None],
    length
)
df = pd.DataFrame({'name': names, 'amount': amounts, 'country': country})

transactions = spark.createDataFrame(df)
# The author's run reported 8 partitions on an 8-core laptop; the number may
# differ on other machines.
print('Number of partitions: {}'.format(transactions.rdd.getNumPartitions()))
print('Partitioner: {}'.format(transactions.rdd.partitioner))
# Uncomment to dump the rows held by each partition; note that collect()
# brings every row back to the driver, so only do this on tiny data.
# print('Partitions structure: {}'.format(transactions.rdd.glom().collect()))

Number of partitions: 8
Partitioner: None
Partitions structure: [[Row(name='Bob', amount=166165, country='United Kingdom'), Row(name='Bob', amount=787156, country='USA'), Row(name='Marek', amount=311201, country='Poland'), Row(name=None, amount=355068, country='Germany'), Row(name='Johannes', amount=505687, country='Germany'), Row(name='Marek', amount=407178, country='Germany'), Row(name=None, amount=162684, country='USA'), Row(name='James', amount=327863, country='USA'), Row(name='Marek', amount=659066, country='Poland'), Row(name='James', amount=504155, country='United Kingdom'), Row(name='Bob', amount=836976, country='Germany'), Row(name='James', amount=607099, country='Poland')], [Row(name=None, amount=278525, country=None), Row(name='James', amount=480807, country='Poland'), Row(name=None, amount=273361, country='Germany'), Row(name='James', amount=718967, country='USA'), Row(name='Bob', amount=71821, country=None), Row(name='James', amount=970197, country='Poland'), Row(name='Bob

In [5]:
# Count the rows of the Spark DataFrame; this should equal `length`.
transactions.count()

100

In [3]:
# Input table: a streaming DataFrame fed by text arriving on a local TCP socket
# (localhost:9999). Something must be listening on that port before the query
# is started.
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

In [4]:
# Break each incoming line on single spaces, then emit one row per token
# in a column named "word".
split_line = F.split(lines.value, " ")
words = lines.select(F.explode(split_line).alias("word"))

In [5]:
# Running count of occurrences for each distinct word seen on the stream.
wordCounts = (
    words
    .groupBy("word")
    .count()
)
type(wordCounts)  # show the Python type of the aggregated result

pyspark.sql.dataframe.DataFrame

In [8]:
 # Start running the query that prints the running counts to the console
# "complete" output mode: the whole counts table is written out again on every
# trigger, not just the rows that changed.
query = (
    wordCounts.writeStream
    .outputMode("complete")
    .format("console")
    .start()
)

# awaitTermination() blocks this cell until the query ends. Stop the query on
# the way out (for example after a kernel interrupt) so that it does not keep
# running in the background once the cell has returned.
try:
    query.awaitTermination()
finally:
    query.stop()

# Full cycle
# End-to-end sketch: read from Kafka, aggregate, write back to Kafka.
# The "..." values are placeholders and must be filled in before running.
kafka_query = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "...")  # required by the Kafka source
    .option("subscribe", "input")
    .load()  ### reading IN
    # The watermark is declared on the DataFrame, before the aggregation;
    # the stream writer has no such method.
    # NOTE(review): the grouping below does not use the event-time column, so
    # this watermark may not actually bound the aggregation state - confirm.
    .withWatermark("timestamp", "2 minutes")  ## watermark
    .groupBy(F.col("value").cast("string").alias("key"))
    # The Kafka sink expects a string/binary "value" column, hence the cast.
    .agg(F.count("*").cast("string").alias("value"))  ### transforming
    .writeStream  # a property in PySpark, not a method call
    .format("kafka")
    .option("kafka.bootstrap.servers", "...")  # required by the Kafka sink
    .option("topic", "output")  ## writing OUT
    .trigger(processingTime="1 minute")  ## trigger
    .outputMode("update")  ## output mode
    .option("checkpointLocation", "...")  ## check point
    .start()
)