In [1]:
from pyspark.streaming import StreamingContext

# Create a StreamingContext with a batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Create a DStream that reads text lines from localhost at port 9999.
# Start a Netcat server before running this cell: nc -lk 9999
lines = ssc.socketTextStream('localhost', 9999)

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in these DStreams to the console
lines.pprint()
wordCounts.pprint()

ssc.start()  # Start the computation
try:
    print("Start")
    ssc.awaitTermination(20)  # Wait at most 20 seconds for the computation to terminate
finally:
    # Stop the StreamingContext without stopping the SparkContext.  Doing this in
    # `finally` guarantees the context is shut down even if the wait is interrupted
    # (e.g. kernel interrupt); a context left running would prevent later cells
    # from starting their own StreamingContext.
    ssc.stop(stopSparkContext=False)

print("Finished")

Start
-------------------------------------------
Time: 2021-12-06 12:16:30
-------------------------------------------

-------------------------------------------
Time: 2021-12-06 12:16:30
-------------------------------------------

-------------------------------------------
Time: 2021-12-06 12:16:35
-------------------------------------------

-------------------------------------------
Time: 2021-12-06 12:16:35
-------------------------------------------

-------------------------------------------
Time: 2021-12-06 12:16:40
-------------------------------------------

-------------------------------------------
Time: 2021-12-06 12:16:40
-------------------------------------------

-------------------------------------------
Time: 2021-12-06 12:16:45
-------------------------------------------

-------------------------------------------
Time: 2021-12-06 12:16:45
-------------------------------------------

Finished


In [2]:
from pyspark.streaming import StreamingContext

# Create a queue of RDDs from a text file (read with a minimum of 8 partitions)
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)

# Randomly split the rdd into 5 parts of roughly equal size (equal weights, seed 123)
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)

# Create a StreamingContext with batch interval of 5 seconds
ssc = StreamingContext(sc, 5)

# Feed the rdd queue to a DStream
lines = ssc.queueStream(rddQueue)

# Do word-counting as before
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Use transform() to access any rdd transformations not directly available in SparkStreaming;
# here each batch is sorted by count in descending order
topWords = wordCounts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
topWords.pprint()

ssc.start()  # Start the computation
try:
    ssc.awaitTermination(25)  # Wait at most 25 seconds for the computation to terminate
finally:
    # Always stop the StreamingContext (but keep the SparkContext), even when the
    # wait is interrupted, so that later cells can start a new StreamingContext.
    ssc.stop(False)
print("Finished")

-------------------------------------------
Time: 2021-12-06 12:16:50
-------------------------------------------
('other', 15486)
('first', 10815)
('many', 9773)
('new', 6272)
('system', 5063)
('american', 4744)
('several', 4545)
('century', 4492)
('same', 4394)
('=', 4313)
...

-------------------------------------------
Time: 2021-12-06 12:16:55
-------------------------------------------
('other', 15319)
('first', 10709)
('many', 9575)
('new', 6242)
('system', 5111)
('american', 4777)
('century', 4538)
('several', 4515)
('same', 4453)
('=', 4361)
...

-------------------------------------------
Time: 2021-12-06 12:17:00
-------------------------------------------
('other', 15346)
('first', 10517)
('many', 9706)
('new', 6218)
('system', 5266)
('american', 4940)
('several', 4615)
('=', 4451)
('century', 4438)
('same', 4270)
...

-------------------------------------------
Time: 2021-12-06 12:17:05
-------------------------------------------
('other', 15196)
('first', 10617)
('many', 

In [3]:
# Find the most positive words in each 5-second batch of streaming data

from pyspark.streaming import StreamingContext

def parse_line(l):
    """Turn one tab-separated line into a (first field, second field as float) pair.

    Only the first two tab-separated fields are used; any further fields are
    ignored.  Raises IndexError when the line has fewer than two fields and
    ValueError when the second field is not a valid float.
    """
    fields = l.split("\t")
    word, score = fields[0], fields[1]
    return (word, float(score))

# Static RDD of (word, sentiment value) pairs; cached because it is joined
# against every batch of the stream
word_sentiments = sc.textFile("../data/AFINN-111.txt") \
                    .map(parse_line).cache()

# 5-second batches fed from a queue of 5 random splits of the text file
ssc = StreamingContext(sc, 5)
rdd = sc.textFile('../data/adj_noun_pairs.txt', 8)
rddQueue = rdd.randomSplit([1,1,1,1,1], 123)
lines = ssc.queueStream(rddQueue)

# Per-batch (word, frequency) pairs
word_counts = lines.flatMap(lambda line: line.split(" ")) \
                   .map(lambda word: (word, 1)) \
                   .reduceByKey(lambda a, b: a + b)

# Determine the words with the highest sentiment values by joining the streaming RDD
# with the static RDD inside the transform() method and then multiplying
# the frequency of the words by its sentiment value.
# After the join each record is (word, (sentiment, frequency)); it is re-keyed as
# (sentiment * frequency, word) and sorted by that key in descending order.
happiest_words = word_counts.transform(lambda rdd: word_sentiments.join(rdd)) \
                            .map(lambda t:
                                 (t[1][0] * t[1][1], t[0])) \
                            .transform(lambda rdd: rdd.sortByKey(False))

happiest_words.pprint()

ssc.start()
try:
    ssc.awaitTermination(25)  # wait at most 25 seconds
finally:
    # Always stop the StreamingContext (but keep the SparkContext), even when the
    # wait is interrupted, so that later cells can start a new StreamingContext.
    ssc.stop(False)
print("Finished")

-------------------------------------------
Time: 2021-11-02 20:09:25
-------------------------------------------
(7890.0, 'great')
(6180.0, 'popular')
(5544.0, 'best')
(4662.0, 'good')
(4242.0, 'important')
(2340.0, 'strong')
(2322.0, 'greater')
(2058.0, 'successful')
(1850.0, 'novel')
(1790.0, 'natural')
...

-------------------------------------------
Time: 2021-11-02 20:09:30
-------------------------------------------
(7578.0, 'great')
(6126.0, 'popular')
(5604.0, 'best')
(4749.0, 'good')
(4172.0, 'important')
(2253.0, 'greater')
(2252.0, 'strong')
(2001.0, 'successful')
(1948.0, 'novel')
(1804.0, 'natural')
...

-------------------------------------------
Time: 2021-11-02 20:09:35
-------------------------------------------
(7893.0, 'great')
(6009.0, 'popular')
(5574.0, 'best')
(4677.0, 'good')
(4298.0, 'important')
(2326.0, 'strong')
(2172.0, 'greater')
(1944.0, 'successful')
(1868.0, 'novel')
(1761.0, 'natural')
...

-------------------------------------------
Time: 2021-11-02 

In [4]:
from pyspark.streaming import StreamingContext

# Stateful word count

# Streaming context with 5-second batches
ssc = StreamingContext(sc, batchDuration=5)
# Stateful transformations (updateStateByKey below) need a checkpoint directory
ssc.checkpoint("checkpoint")

# Source data: the text file cut into 10 random, equally weighted splits (seed 123)
# that are queued up as the input of the stream
rdd = sc.textFile('../data/adj_noun_pairs.txt', minPartitions=8)
rddQueue = rdd.randomSplit([1] * 10, seed=123)
lines = ssc.queueStream(rddQueue)

def updateFunc(newValues, runningCount):
    """Return the updated running count for one key.

    newValues    -- the values that arrived for the key in the current batch
    runningCount -- the count accumulated so far, or None when the key has no
                    previous state (treated as 0)
    """
    previous = 0 if runningCount is None else runningCount
    # Fold the new values onto the previous total to get the new total
    return sum(newValues, previous)

# Emit (word, 1) for every word and fold the ones into a per-word running total
# that is carried across batches by updateFunc
running_counts = (
    lines.flatMap(lambda text: text.split(" "))
         .map(lambda token: (token, 1))
         .updateStateByKey(updateFunc)
)

# Order every batch of running totals by count, largest first
counts_sorted = running_counts.transform(
    lambda batch: batch.sortBy(lambda kv: kv[1], ascending=False)
)

def printResults(rdd):
    """Print a short summary of one batch of (word, running count) pairs.

    Prints the number of records in the RDD, its first five records, and the
    running count of the word 'refinery'.  When 'refinery' is not present in the
    RDD, 0 is printed instead of failing with an IndexError on the empty
    lookup result.
    """
    print("Total distinct words: ", rdd.count())
    print(rdd.take(5))
    # lookup() returns a list of values for the key; it is empty when the key is absent
    matches = rdd.lookup('refinery')
    print('refinery:', matches[0] if matches else 0)

# Report on every batch of sorted running counts
counts_sorted.foreachRDD(printResults)

ssc.start()
try:
    ssc.awaitTermination(50)  # wait at most 50 seconds
finally:
    # Always stop the StreamingContext (but keep the SparkContext), even when the
    # wait is interrupted, so that later cells can start a new StreamingContext.
    ssc.stop(False)
print("Finished")

Total distinct words:  51430
[('other', 7782), ('first', 5404), ('many', 4878), ('new', 3219), ('system', 2539)]
refinery: 1
Total distinct words:  76917
[('other', 15486), ('first', 10815), ('many', 9773), ('new', 6272), ('system', 5063)]
refinery: 6
Total distinct words:  97338
[('other', 23129), ('first', 16145), ('many', 14534), ('new', 9363), ('system', 7636)]
refinery: 11
Total distinct words:  114786
[('other', 30805), ('first', 21524), ('many', 19348), ('new', 12514), ('system', 10174)]
refinery: 16
Total distinct words:  130393
[('other', 38452), ('first', 26792), ('many', 24239), ('new', 15618), ('system', 12777)]
refinery: 19
Total distinct words:  144898
[('other', 46151), ('first', 32041), ('many', 29054), ('new', 18732), ('system', 15440)]
refinery: 23
Total distinct words:  158013
[('other', 53728), ('first', 37395), ('many', 33933), ('new', 21787), ('system', 17946)]
refinery: 26
Total distinct words:  170610
[('other', 61347), ('first', 42658), ('many', 38902), ('new',

In [5]:
# MG algorithm for approximate word count

from pyspark.streaming import StreamingContext

# Parameters and shared driver-side state of the MG approximate word count
k = 10000            # printResults picks a new threshold once more than k words are tracked
threshold = 0        # amount updateFunc subtracts from every running count
total_decrement = 0  # sum of all thresholds chosen so far

# Streaming context with 5-second batches
ssc = StreamingContext(sc, batchDuration=5)
# Stateful transformations (updateStateByKey below) need a checkpoint directory
ssc.checkpoint("checkpoint")

# Source data: the text file cut into 10 random, equally weighted splits (seed 123)
# that are queued up as the input of the stream
rdd = sc.textFile('../data/adj_noun_pairs.txt', minPartitions=8)
rddQueue = rdd.randomSplit([1] * 10, seed=123)
lines = ssc.queueStream(rddQueue)

def updateFunc(newValues, runningCount):
    """Return the decremented running count for one key, or None to drop the key.

    newValues    -- the values that arrived for the key in the current batch
    runningCount -- the count accumulated so far, or None when the key has no
                    previous state (treated as 0)

    The new values are added to the previous count and the global `threshold`
    is subtracted.  A result that is not positive is reported as None.
    """
    previous = 0 if runningCount is None else runningCount
    remaining = sum(newValues, previous) - threshold
    if remaining > 0:
        return remaining
    return None

# Count the words of each batch first, then merge the per-batch counts into the
# running (decremented) per-word state maintained by updateFunc
running_counts = (
    lines.flatMap(lambda text: text.split(" "))
         .map(lambda token: (token, 1))
         .reduceByKey(lambda left, right: left + right)
         .updateStateByKey(updateFunc)
)

# Order every batch of running counts by count, largest first
counts_sorted = running_counts.transform(
    lambda batch: batch.sortBy(lambda kv: kv[1], ascending=False)
)

def printResults(rdd):
    """Report one batch of the MG approximate word count and pick the next threshold.

    `rdd` holds (word, count) pairs; the threshold selection below relies on it
    being sorted by count in descending order.

    Prints the number of tracked words, the first five records as
    (word, count, count + total_decrement), and the same two figures for the
    word 'refinery' (0 when it is not currently tracked).

    Side effects on module-level state:
      * `threshold` is set to the count of the record at index k when more than
        k words are tracked, and reset to 0 otherwise.
      * `total_decrement` is increased by the new threshold.
    """
    global threshold, total_decrement
    rdd.cache()  # the RDD is scanned several times below
    distinct_words = rdd.count()
    print("Total distinct words: ", distinct_words)
    print(rdd.map(lambda x: (x[0], x[1], x[1]+total_decrement)).take(5))
    # lookup() returns a list of values for the key; it is empty when the key is absent
    matches = rdd.lookup('refinery')
    lower_bound = matches[0] if matches else 0
    print('refinery:', lower_bound, ',', lower_bound + total_decrement)
    if distinct_words > k:
        # Re-key every record by its position and fetch the count of the record at index k
        threshold = rdd.zipWithIndex().map(lambda x: (x[1], x[0])).lookup(k)[0][1]
    else:
        # Few enough words are tracked: nothing has to be subtracted in the next batch
        threshold = 0
    print("Next threshold = ", threshold)
    total_decrement += threshold
    rdd.unpersist()

# Report on every batch of sorted counts and choose the next threshold
counts_sorted.foreachRDD(printResults)

ssc.start()
try:
    ssc.awaitTermination(50)  # wait at most 50 seconds
finally:
    # Always stop the StreamingContext (but keep the SparkContext), even when the
    # wait is interrupted, so that later cells can start a new StreamingContext.
    ssc.stop(False)
print("Finished")

Total distinct words:  51430
[('other', 7782, 7782), ('first', 5404, 5404), ('many', 4878, 4878), ('new', 3219, 3219), ('system', 2539, 2539)]
refinery: 1 , 1
Next threshold =  5
Total distinct words:  13971
[('other', 15481, 15486), ('first', 10810, 10815), ('many', 9768, 9773), ('new', 6267, 6272), ('system', 5058, 5063)]
refinery: 1 , 6
Next threshold =  5
Total distinct words:  12164
[('other', 23119, 23129), ('first', 16135, 16145), ('many', 14524, 14534), ('new', 9353, 9363), ('system', 7626, 7636)]
refinery: 1 , 11
Next threshold =  4
Total distinct words:  12317
[('other', 30791, 30805), ('first', 21510, 21524), ('many', 19334, 19348), ('new', 12500, 12514), ('system', 10160, 10174)]
refinery: 2 , 16
Next threshold =  5
Total distinct words:  11650
[('other', 38433, 38452), ('first', 26773, 26792), ('many', 24220, 24239), ('new', 15599, 15618), ('system', 12758, 12777)]
refinery: 0 , 19
Next threshold =  5
Total distinct words:  11396
[('other', 46127, 46151), ('first', 32017, 

In [7]:
from pyspark.streaming import StreamingContext

# Create a queue of 5 RDDs; the i-th RDD holds five copies of i
rddQueue = []
for i in range(5):
    rdd = sc.parallelize([i, i, i, i, i])
    rddQueue += [rdd]

# Create a StreamingContext with batch interval of 3 seconds
ssc = StreamingContext(sc, 3)

# Checkpoint directory for the windowed reduction with an inverse function below
ssc.checkpoint("checkpoint")

# Feed the rdd queue to a DStream
nums = ssc.queueStream(rddQueue)

# Compute the sum over a sliding window of 9 seconds for every 3 seconds.
# Alternative without an inverse function:
# slidingSum = nums.reduceByWindow(lambda x, y: x + y, None, 9, 3)
# With the inverse function (x - y) given as second argument:
slidingSum = nums.reduceByWindow(lambda x, y: x + y, lambda x, y: x - y, 9, 3)

slidingSum.pprint()

ssc.start()  # Start the computation
try:
    ssc.awaitTermination(24)  # Wait at most 24 seconds for the computation to terminate
finally:
    # Always stop the StreamingContext (but keep the SparkContext), even when the
    # wait is interrupted, so that later cells can start a new StreamingContext.
    ssc.stop(False)
print("Finished")

-------------------------------------------
Time: 2021-11-02 22:00:48
-------------------------------------------
0

-------------------------------------------
Time: 2021-11-02 22:00:51
-------------------------------------------
5

-------------------------------------------
Time: 2021-11-02 22:00:54
-------------------------------------------
15

-------------------------------------------
Time: 2021-11-02 22:00:57
-------------------------------------------
30

-------------------------------------------
Time: 2021-11-02 22:01:00
-------------------------------------------
45

-------------------------------------------
Time: 2021-11-02 22:01:03
-------------------------------------------
35

-------------------------------------------
Time: 2021-11-02 22:01:06
-------------------------------------------
20

-------------------------------------------
Time: 2021-11-02 22:01:09
-------------------------------------------

Finished


In [8]:
# Word count using structured streaming: Complete mode vs update mode

from pyspark.sql.functions import *


# Streaming DataFrame of text lines read from a socket on localhost:9999;
# includeTimestamp adds a `timestamp` column next to `value`
lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

# Running count per word
word_counts = words.groupBy('word').count()

# Start running the query: every 5 seconds write the complete result table to the console
query = word_counts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

try:
    query.awaitTermination(25)  # wait at most 25 seconds
finally:
    # Always stop the query, even when the wait is interrupted or raises,
    # so that it does not keep running in the background.
    query.stop()
print("Finished")

Finished


In [9]:
# Append mode with selection condition
# Note: complete mode not supported if no aggregation

from pyspark.sql.functions import *

# Streaming DataFrame of text lines read from a socket on localhost:9999;
# includeTimestamp adds a `timestamp` column next to `value`
lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

# Keep only words that are at least 3 characters long
long_words = words.filter(length(words['word'])>=3)

# Start running the query: every 5 seconds append the new rows to the console
query = long_words\
        .writeStream\
        .outputMode('append')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

try:
    query.awaitTermination(25)  # wait at most 25 seconds
finally:
    # Always stop the query, even when the wait is interrupted or raises,
    # so that it does not keep running in the background.
    query.stop()
print("Finished")

Finished


In [10]:
from pyspark.sql.functions import *

# Streaming DataFrame of text lines read from a socket on localhost:9999;
# includeTimestamp adds a `timestamp` column next to `value`
lines = spark\
        .readStream\
        .format('socket')\
        .option('host', 'localhost')\
        .option('port', '9999')\
        .option('includeTimestamp', 'true')\
        .load()

# Split the lines into words, retaining timestamps
# split() splits each line into an array, and explode() turns the array into multiple rows
words = lines.select(explode(split(lines.value, ' ')).alias('word'),
                     lines.timestamp)

# Count words per (window, word), using 10-second windows that slide every 5 seconds
windowedCounts = words.groupBy(
    window(words.timestamp, "10 seconds", "5 seconds"),
    words.word)\
    .count()

# Start running the query: every 5 seconds write the complete result table to the console
query = windowedCounts\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .option('truncate', 'false')\
        .trigger(processingTime='5 seconds') \
        .start()

try:
    query.awaitTermination(25)  # wait at most 25 seconds
finally:
    # Always stop the query, even when the wait is interrupted or raises,
    # so that it does not keep running in the background.
    query.stop()
print("Finished")

Finished
