# Spark Streaming

This little demo uses Spark DStreams to ingest words from a stream, determine how long each word is, and then plot the distribution of word lengths over time. You can try feeding in different books from, say, Project Gutenberg to see how the distribution changes between books from different periods.

Note: see `streamer.sh` for an example of a small program that streams out individual words.

In [None]:
from pyspark.streaming import StreamingContext
# Build the streaming context on top of the existing SparkContext `sc`.
# The batch duration is the number of seconds between microbatches:
ssc = StreamingContext(sc, batchDuration=1)

# Checkpointing must be enabled to be able to do state updates:
ssc.checkpoint("checkpoint")

In [None]:
# Connect to the word stream on port 8888. 'localhost' assumes the
# stream runs on the same machine as the driver, which is not very
# common, so you will probably need to change the hostname. Even from
# the local machine, 'localhost' seems to be hit or miss.
sock = ssc.socketTextStream(hostname="localhost", port=8888)

In [None]:
# State-update callback: folds one batch's counts into the stored total
def update_dist(new_values, old_values):
    """Return the updated running count for a single key.

    new_values: iterable of counts for the key in the current batch.
    old_values: the previously stored total for the key; any falsy
        value (e.g. None when nothing has been stored yet) counts as 0.
    """
    batch_total = sum(new_values)
    stored_total = old_values if old_values else 0
    return batch_total + stored_total

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plots the distribution. This executes on the driver!
def plot_distribution(rdd):
    """Draw a bar chart of the (word size, frequency) pairs held in ``rdd``.

    Intended as a ``foreachRDD`` callback. Keep the signature at exactly
    one positional parameter: ``foreachRDD`` accepts either ``f(rdd)`` or
    ``f(time, rdd)``, so adding a parameter would change how it is called.

    rdd: an RDD whose elements are (word size, frequency) pairs. It is
        collected to the driver in full.

    Returns None. Nothing is plotted when the RDD has no rows.
    """
    rows = rdd.collect()
    if not rows:
        # No words have arrived yet. pandas raises "no numeric data to
        # plot" for an empty frame, so skip this microbatch instead.
        return
    df = pd.DataFrame(rows, columns=['word size', 'frequency'])
    df.sort_values('word size', ascending=True).plot(kind='bar', x='word size')

In [None]:
# Map every incoming word to its length, then to a (length, 1) pair
lengths = sock.map(len)
counts = lengths.map(lambda size: (size, 1))

# Sum the pairs per length within the microbatch, then fold
# those per-batch totals into the stored running state
reduced = counts.reduceByKey(lambda left, right: left + right)
distrib = reduced.updateStateByKey(update_dist)

# Redraw the chart from the updated distribution on every microbatch
distrib.foreachRDD(plot_distribution)

# Also print the distribution as text, for good measure
distrib.pprint()

In [None]:
# Running this starts the streaming computation: the context begins
# listening on the socket and processing the stream in microbatches.
ssc.start()

In [None]:
# Stops the streaming context only.
# IMPORTANT: you need stopSparkContext=False here; otherwise the
# driver will die and you'll have to restart Jupyter.
ssc.stop(stopSparkContext=False)