In [None]:
import sys
import time
import pyspark

# import for pyspark sql
from pyspark.sql import SparkSession 
from pyspark.sql import Row, SQLContext
from pyspark.sql.types import *

# imports for streaming
from pyspark import SparkContext 
from pyspark.streaming import StreamingContext
from pyspark import StorageLevel

# mllib imports 
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.feature import HashingTF





In [None]:
# Base time unit, in seconds, used as the streaming batch interval.
BATCH_INTERVAL: int = 1

In [None]:
# Create a local StreamingContext with two worker threads and a batch interval
# of BATCH_INTERVAL seconds.
#
# SparkContext.getOrCreate is a classmethod: it returns the active context if
# one exists and only builds a new one otherwise. Calling the SparkContext
# constructor directly (as `SparkContext(...).getOrCreate()` does) raises a
# ValueError when a context is already running, which makes the cell fail on
# re-execution.
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
)
ssc = StreamingContext(sc, BATCH_INTERVAL)
# Stateful operations need a checkpoint directory to persist their state.
ssc.checkpoint("checkpoint")

In [None]:
"""
Spark Streaming: reading data from a socket

To type data into the socket manually, run "nc -lk 9999" in a terminal.
"""

def updateFunction(newValues, runningCount):
    """Fold one batch's counts for a key into that key's running total.

    Args:
        newValues: iterable of counts observed for the key in the current batch.
        runningCount: total accumulated so far, or None when there is no
            previous total (presumably the first time a key is seen -- this
            function is passed to updateStateByKey).

    Returns:
        The previous total (0 if it was None) plus every value in newValues.
    """
    previous_total = 0 if runningCount is None else runningCount
    return sum(newValues, previous_total)

# Build the contexts for this demo.
# SparkContext.getOrCreate reuses an already-active SparkContext; invoking the
# SparkContext constructor while another context is active raises ValueError.
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
)
ssc = StreamingContext(sc, BATCH_INTERVAL)
# updateStateByKey carries state from batch to batch, so the context that runs
# it must have a checkpoint directory configured.
ssc.checkpoint("checkpoint")

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 9999)
words = lines.flatMap(lambda line: line.split(" "))  # Split each line into words

# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()

# Running totals per word across all batches seen so far.
# (The variable keeps its original name, `newValues`, in case later cells use it.)
newValues = wordCounts.updateStateByKey(updateFunction)
newValues.pprint()

# Run the computation for ten seconds, then shut everything down. The finally
# clause makes sure the contexts are stopped even if the sleep is interrupted.
ssc.start()
try:
    time.sleep(10)
finally:
    ssc.stop(stopSparkContext=True, stopGraceFully=True)


In [None]:
"""
Spark Streaming: file stream

Spark Streaming will monitor the directory and process any files created in that directory.
A simple directory can be monitored. All files directly under such a path will be processed as they are discovered.
All files must be in the same data format.
A file is considered part of a time period based on its modification time, not its creation time.
Once processed, changes to a file within the current window will not cause the file to be reread.
That is: updates are ignored.
The more files under a directory, the longer it will take to scan for changes — even if no files have been modified.
Calling FileSystem.setTimes() to fix the timestamp is a way to have the file picked up in a later window,
even if its contents have not changed.
(only textFileStream is available for Python)
"""

# A StreamingContext that has been stopped cannot be reused for new streams,
# so this demo builds its own context instead of relying on whatever `ssc` an
# earlier cell left behind. getOrCreate reuses an active SparkContext if any.
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
)
ssc = StreamingContext(sc, BATCH_INTERVAL)

# DStream of the lines of every new text file that appears under ./data.
# Keep a reference so transformations and output operations can be attached.
fileLines = ssc.textFileStream("./data")

In [None]:
"""
Spark Streaming: queue of RDDs as a stream

For testing a Spark Streaming application with test data, one can also create a DStream based on a 
queue of RDDs. Each RDD pushed into the queue will be treated as a batch of data in the DStream,
and processed like a stream.
"""

# A stopped StreamingContext cannot be restarted or given new streams, so build
# a fresh one for this demo. getOrCreate reuses an active SparkContext if any,
# whereas the SparkContext constructor would raise when one is already active.
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
)
ssc = StreamingContext(sc, BATCH_INTERVAL)

# Create the queue through which RDDs can be pushed to a QueueInputDStream:
# five RDDs, each holding the integers 1..1000 split over 10 partitions.
rddQueue = [
    ssc.sparkContext.parallelize(list(range(1, 1001)), 10)
    for _ in range(5)
]

# Create the QueueInputDStream and use it to do some processing:
# count how many numbers fall into each remainder class modulo 10.
inputStream = ssc.queueStream(rddQueue)
mappedStream = inputStream.map(lambda x: (x % 10, 1))
reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
reducedStream.pprint()

# Run for six seconds, then stop; the finally clause guarantees the contexts
# are released even if the sleep is interrupted.
ssc.start()
try:
    time.sleep(6)
finally:
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

In [None]:
"""
Spark Streaming: windowed DStream

Reduce last 30 seconds of data, every 10 seconds
windowedWordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 30, 10)
"""

def get_countryname(line):
    """Map a country code read from one input line to a (name, 1) pair.

    Surrounding whitespace is stripped before the lookup, and the lookup is
    case-sensitive. Codes other than 'usa', 'ind' and 'aus' map to 'Unknown'.

    Args:
        line: text of a single input line.

    Returns:
        Tuple of (country name, 1), ready to be summed per key.
    """
    names_by_code = {
        'usa': 'USA',
        'ind': 'India',
        'aus': 'Australia',
    }
    code = line.strip()
    return (names_by_code.get(code, 'Unknown'), 1)

if __name__ == "__main__":
    # NOTE: the former `len(sys.argv) != 3` guard was removed. The host and
    # port are fixed below and the command-line arguments were never read, so
    # the guard only rejected runs for an unrelated reason (and under a
    # Jupyter kernel sys.argv holds the kernel's own launch arguments).

    batch_interval = 1  # base time unit (in seconds)
    window_length = 15 * batch_interval  # width of the sliding window
    frequency = 6 * batch_interval  # how often the window is evaluated

    # Build a fresh context for this demo: a stopped StreamingContext cannot
    # be restarted, and getOrCreate reuses an active SparkContext if any.
    sc = SparkContext.getOrCreate(
        pyspark.SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
    )
    ssc = StreamingContext(sc, batch_interval)
    # reduceByKeyAndWindow with an inverse function keeps state between
    # windows, which requires checkpointing to be enabled.
    ssc.checkpoint("checkpoint")

    lines = ssc.socketTextStream("localhost", 9999)

    # Per-country counts over the last `window_length` seconds, refreshed every
    # `frequency` seconds. The second lambda subtracts the counts of batches
    # that slide out of the window.
    window_counts = lines.map(get_countryname).reduceByKeyAndWindow(
        lambda x, y: x + y,
        lambda x, y: x - y,
        window_length,
        frequency,
    )
    window_counts.pprint()

    ssc.start()
    try:
        # Blocks until the stream is terminated or the cell is interrupted.
        ssc.awaitTermination()
    finally:
        # Release the contexts even when the wait ends with an interrupt.
        ssc.stop(stopSparkContext=True, stopGraceFully=False)