In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Build (or reuse, via getOrCreate) the SparkSession for this notebook.
# The session targets the master at spark://spark-master:7077 and is capped at
# one core overall, with one core and 512mb of memory per executor.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Step1_3-CSV-Complete-Mode")
    .config("spark.executor.memory", "512mb")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .getOrCreate()
)

In [None]:
# Only log errors, so that lower-severity Spark log messages do not clutter the notebook output.
spark.sparkContext.setLogLevel(logLevel="ERROR")


In [None]:
# List the files currently present in the drop location (shell escape).
# NOTE(review): this lists /spark-data/... while the streaming reader in this
# notebook uses /opt/spark-data/... -- confirm both paths refer to the same directory.
! ls -lh /spark-data/datasets/droplocation


In [None]:
# Explicit schema for the incoming CSV files.
# Every column is declared as a nullable string, in the order listed below.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "lsoa_code",
        "borough",
        "major_category",
        "minor_category",
        "value",
        "year",
        "month",
    )
])


In [None]:
# Streaming CSV source over the drop-location directory.
# The explicit schema is applied and the first line of each file is treated as a header.
fileStreamDF = (
    spark.readStream
    .schema(schema)
    .option("header", "true")
    .csv("/opt/spark-data/datasets/droplocation")
)

In [None]:
# Report whether the DataFrame is a streaming DataFrame (isStreaming flag)
print(" ", "Is the stream ready?", sep="\n")
print(fileStreamDF.isStreaming)


In [None]:
# Show the schema that was applied to the input stream
print(" ", "Schema of the input stream: ", sep="\n")
fileStreamDF.printSchema()


In [None]:
# Create a trimmed version of the input DataFrame containing only the borough
# and its record count.
# We group by borough and count the number of records (NOT the number of convictions).
# A streaming DataFrame cannot be sorted unless it has been aggregated; because
# count() is an aggregation, the result can then be sorted with orderBy.
recordsPerBorough = (
    fileStreamDF
    .groupBy("borough")                  # one group per borough
    .count()                             # adds a "count" column: rows per borough
    .orderBy("count", ascending=False)   # largest counts first
)


In [None]:
# We run in complete mode: on every trigger the ENTIRE updated Result Table
# (all boroughs with their current counts) is written to the sink, not just
# the rows that changed.
# The output is written to the console.
# We set truncate to false. If true, long values are truncated to 20 chars.
# Explicitly state the number of rows to display. Default is 20.
query = (
    recordsPerBorough.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .option("numRows", 30)
    .start()
)

# Keep the StreamingQuery handle in `query` and wait on it separately:
# awaitTermination() returns None when called without a timeout, so chaining it
# onto start() would bind `query` to None and lose the handle.
# This call blocks until the query terminates; after interrupting the cell,
# the handle can still be used to stop the stream with query.stop().
query.awaitTermination()

In [None]:
# Submit as an application:
# submit codes/demo2.py (presumably with spark-submit -- confirm the exact command)


In [None]:
# Shut down the SparkSession created at the top of the notebook.
# This cell is only reached once the blocking awaitTermination() call in the
# streaming cell has returned or been interrupted.
spark.stop()