<a href="https://colab.research.google.com/github/ShovalBenjer/Bigdata_Pyspark_Apache/blob/main/Streaming_options.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Queue Stream (RDDs)
In this case, we simulate a queue of sentences and process each sentence in batches. For each batch, we need to:

Identify the longest word in the batch.
Identify the most frequently used word in the batch.

In [5]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
if 'sc' in globals():
    sc.stop()


# Initialize SparkContext and StreamingContext
sc = SparkContext("local[2]", "QueueStream")
ssc = StreamingContext(sc, 5)  # 5 seconds batch interval

# Define a queue of RDDs containing sentences
sentences = [
    "The Walt Disney Studios has acquired the worldwide distribution rights to acclaimed filmmaker Peter Jackson’s previously announced Beatles documentary.",
    "The film will showcase the warmth, camaraderie and humor of the making of the legendary band’s studio album, 'Let It Be,' and their final live concert as a group, the iconic rooftop performance on London’s Savile Row.",
    "'The Beatles: Get Back' will be released by The Walt Disney Studios in the United States and Canada on September 4, 2020, with additional details and dates for the film’s global release to follow.",
    "The power of rock and roll is a constantly amazing process of this group.",
    "Although it is Bob Dylan who is the single most important figure in rock and roll; and although it is the Rolling Stones who are the embodiment of a rock and roll band; it is nonetheless Our Boys.",
    "The Beatles, who are the perfect product and result of everything that rock and roll means and encompasses."
]

# Create an RDD queue stream
rdd_queue = [ssc.sparkContext.parallelize(sentence.split()) for sentence in sentences]
queue_stream = ssc.queueStream(rdd_queue)

# Process each batch of words in the stream
def process_batch(rdd):
    # Find the longest word
    longest_word = rdd.reduce(lambda a, b: a if len(a) > len(b) else b)

    # Find the most frequent word
    word_counts = rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    most_used_word = word_counts.transform(lambda rdd: rdd.sortBy(lambda x: -x[1]).take(1))

    # Output the results
    print(f"Longest Word in Batch: {longest_word}")
    print(f"Most Used Word in Batch: {most_used_word[0][0]} (Count: {most_used_word[0][1]})")

# Apply the processing function to the stream
queue_stream.foreachRDD(process_batch)

# Start the computation
ssc.start()
ssc.awaitTermination()


Py4JJavaError: An error occurred while calling o644.start.
: java.lang.IllegalStateException: Only one StreamingContext may be started in this JVM. Currently running StreamingContext was started atorg.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:563)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.base/java.lang.reflect.Method.invoke(Method.java:566)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:282)
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
py4j.commands.CallCommand.execute(CallCommand.java:79)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:829)
	at org.apache.spark.streaming.StreamingContext$.org$apache$spark$streaming$StreamingContext$$assertNoOtherContextIsActive(StreamingContext.scala:776)
	at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:582)
	at org.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:563)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


Socket Stream
A socket stream allows us to receive data in real-time, e.g., from a network socket. The task is to process batches of words arriving every 5 seconds and track the most used word, including words seen in previous batches.

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Initialize SparkContext and StreamingContext
sc = SparkContext("local[2]", "SocketStream")
ssc = StreamingContext(sc, 5)  # 5 seconds batch interval

# Create a socket stream that listens on port 9999
socket_stream = ssc.socketTextStream("localhost", 9999)

# Function to update and display the most used word
def update_word_counts(new_words, last_counts):
    return new_words.updateStateByKey(lambda new, last: sum(new) + (last or 0))

# Split the incoming text into words and map to (word, 1) pairs
words = socket_stream.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))

# Update the word counts over time
word_counts = words.reduceByKeyAndWindow(lambda a, b: a + b, windowDuration=15, slideDuration=5)
word_counts = word_counts.updateStateByKey(update_word_counts)

# Display the most used word in the stream
def display_most_used_word(rdd):
    if not rdd.isEmpty():
        most_used_word = rdd.takeOrdered(1, key=lambda x: -x[1])
        print(f"Most Used Word: {most_used_word[0][0]} (Count: {most_used_word[0][1]})")

# Apply the display function
word_counts.foreachRDD(display_most_used_word)

# Start the computation
ssc.start()
ssc.awaitTermination()


File Stream
This stream will watch a directory for newly created files and, for each batch, display the total number of lines in the newly created files.



In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Initialize SparkContext and StreamingContext
sc = SparkContext("local[2]", "FileStream")
ssc = StreamingContext(sc, 5)  # 5 seconds batch interval

# Monitor a directory for new files
directory_stream = ssc.textFileStream("path_to_directory")

# Function to count the number of lines in a file
def count_lines_in_file(rdd):
    if not rdd.isEmpty():
        total_lines = rdd.count()  # Count number of lines in the batch
        print(f"Total lines in batch: {total_lines}")

# Apply the line counting function to the stream
directory_stream.foreachRDD(count_lines_in_file)

# Start the computation
ssc.start()
ssc.awaitTermination()
