In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from time import sleep

In [2]:
# Obtain the SparkSession used by every cell below; getOrCreate() returns an
# already-running session if one exists, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName('Basic streaming')
    .getOrCreate()
)

In [4]:
# Read the JSON dataset once as a static (batch) DataFrame so that Spark
# infers its schema; that schema is reused when defining the streaming read.
static = spark.read.json("D:/Dataset/activity-data")
dataSchema = static.schema

# Show the inferred column names and types.
static.printSchema()

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)



In [5]:
# Create the streaming DataFrame over the same JSON directory.
# The schema is supplied explicitly (taken from the static read), and
# maxFilesPerTrigger=1 limits each trigger to at most one new input file.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json("D:/Dataset/activity-data")
)

In [6]:
# Define the aggregation of interest: number of records per value of the
# "gt" column. Nothing runs yet; this only describes the computation.
activityCounts = (
    streaming
    .groupBy("gt")
    .count()
)

In [7]:
# Specify the number of shuffle partitions.
# Sets the Spark SQL option "spark.sql.shuffle.partitions" to 1 for this
# session; this option controls how many partitions are used when data is
# shuffled for aggregations such as the groupBy counts in this notebook.
spark.conf.set("spark.sql.shuffle.partitions", 1)

In [8]:
# Start the streaming query for the activity counts.
# Sink: "memory", exposed as an in-memory table named after the query
# ("activity_count"), which the display cell reads with spark.sql.
# Output mode: "complete", i.e. the full aggregated result is written
# every time the query updates.
activityQuery = (
    activityCounts.writeStream
    .queryName("activity_count")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [9]:
# Any number of queries (use cases) can be defined on the same stream.
# This one counts records per value of the "User" column.
userCounts = (
    streaming
    .groupBy('User')
    .count()
)

# Start it with the same sink settings as the activity query: in-memory
# table named "user_count", full result rewritten on every update.
userQuery = (
    userCounts.writeStream
    .queryName("user_count")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [10]:
# Display results: poll both in-memory result tables five times, pausing
# ten seconds after each round so the running counts have time to grow.
result_statements = (
    'SELECT * FROM activity_count',
    'SELECT * FROM user_count',
)
for attempt in range(5):
    for statement in result_statements:
        spark.sql(statement).show()
    sleep(10)

+----------+-----+
|        gt|count|
+----------+-----+
|       sit|12309|
|      walk|13256|
|  stairsup|10452|
|      bike|10796|
|      null|10449|
|     stand|11384|
|stairsdown| 9365|
+----------+-----+

+----+-----+
|User|count|
+----+-----+
|   d| 8124|
|   c| 7715|
|   g| 9167|
|   h| 7733|
|   e| 9603|
|   i| 9255|
|   f| 9206|
|   b| 9123|
|   a| 8085|
+----+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|  stairsup|52260|
|     stand|56924|
|       sit|61547|
|      walk|66280|
|      bike|53984|
|      null|52239|
|stairsdown|46825|
+----------+-----+

+----+-----+
|User|count|
+----+-----+
|   h|38665|
|   e|48015|
|   d|40620|
|   c|38575|
|   g|45839|
|   i|46275|
|   f|46030|
|   b|45615|
|   a|40425|
+----+-----+

+----------+------+
|        gt| count|
+----------+------+
|  stairsup|104521|
|     stand|113849|
|       sit|123085|
|      walk|132560|
|      bike|107974|
|      null|104482|
|stairsdown| 93648|
+----------+------+

+----+-----+
|User|

In [None]:
# Run nonstop: block this cell until activityQuery terminates.
# NOTE(review): only activityQuery is awaited here; userQuery (started in an
# earlier cell) is not waited on by this call — confirm that is intended.
activityQuery.awaitTermination()