In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit, from_json, when, asc, desc, count, expr, date_format, unix_timestamp, sum
from pyspark.sql.types import ArrayType, StructType, StructField, LongType, StringType, DoubleType, IntegerType, DateType
from pyspark.sql.window import Window
from time import sleep

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Assignment2_Streaming")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Disable correctness check so we can sum the amount of movies
spark.conf.set("spark.sql.streaming.statefulOperator.checkCorrectness.enabled", "false")

dataSchemaIMDB = StructType(
    [
    StructField("id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("type", StringType(), True),
    StructField("genres", StringType(), True),
    StructField("averageRating", DoubleType(), True),
    StructField("numVotes", IntegerType(), True),
    StructField("releaseYear", IntegerType(), True)
    ])

# Because csv.DictReader() reads every value in the row as a string, we have to convert them manually in the processing pipeline
dataSchemaService = StructType(
    [StructField("show_id", StringType(), True),
     StructField("type", StringType(), True),
     StructField("title", StringType(), True),
     StructField("director", StringType(), True),
     StructField("cast", StringType(), True),
     StructField("date_added", StringType(), True),
     StructField("release_year", StringType(), True),
     StructField("rating", StringType(), True),
     StructField("duration", StringType(), True),
     StructField("listed_in", StringType(), True),
     StructField("description", StringType(), True),
     StructField("service", StringType(), True),
     StructField("timestamp_in_ms", LongType(), True)
     ])

# Read from a source 

imdb_data = spark.read.format("csv").schema(dataSchemaIMDB)\
          .load("/home/jovyan/data/assignment/imdb_data.csv")

# Update the 'type' column to change 'movie' to 'Movie'
imdb_data = imdb_data.withColumn(
    "type",
    when(col("type") == "movie", "Movie").otherwise(col("type"))
)

# Update the 'type' column to change 'tvSeries' to 'TV Show'
imdb_data = imdb_data.withColumn(
    "type",
    when(col("type") == "tvSeries", "TV Show").otherwise(col("type"))
)

imdb_data.show(10)

# Read the whole Netflix dataset as a batch
serviceStream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("failOnDataLoss", "false") \
    .option("subscribe", "scraper") \
    .option("startingOffsets", "latest") \
    .load()

df_services = serviceStream.selectExpr("CAST(value AS STRING)")

df_services_good = df_services.select(from_json(df_services.value, dataSchemaService.simpleString()))

df_services_good.printSchema()

data_services = df_services_good.select(col("from_json(value).*"))

data_services.printSchema()

# Cast 'release_year' from string to integer
data_services = data_services.withColumn(
    "release_year",
    col("release_year").cast(IntegerType())
)

# create the event time column 
timed_data_services = data_services.selectExpr(
    "*",
    "cast(timestamp_in_ms/1000.0 as timestamp) as event_time")

timed_data_services.printSchema()

rated_data = timed_data_services.join(imdb_data, ["title", "type"], "inner")

rated_data = rated_data.withColumn(
    "averageRating",
    col("averageRating").cast(DoubleType())
)

# top_score = rated_data \
#     .where(col("averageRating").isNotNull()) \
#     .groupBy("service", "averageRating") \
#     .agg(
#         # If-statement in sql starts with CASE and ends with END, this creates a column where if the averageRating >= 9 gets a 1
#         expr("CASE WHEN averageRating >= 9.0 THEN 1 ELSE 0 END").alias("Score >9"),
#         expr("CASE WHEN (averageRating >= 8.0 AND averageRating < 9.0) THEN 1 ELSE 0 END").alias("Score 8-9")
#     )

top_score = rated_data \
    .where(col("averageRating").isNotNull()) \
    .withWatermark("event_time", "5 minutes") \
    .groupBy(window(col("event_time"), "15 seconds"), "service") \
    .agg(
        # If-statement in sql starts with CASE and ends with END, this creates a column where if the averageRating >= 9 gets a 1 and then sums it up
        expr("SUM(CASE WHEN averageRating >= 9.0 THEN 1 ELSE 0 END)").alias("Score >9"),
        expr("SUM(CASE WHEN (averageRating >= 8.0 AND averageRating < 9.0) THEN 1 ELSE 0 END)").alias("Score 8-9")
    ) \

# Using ascending so the last line in the consumers console is the latest data
ordered_df = top_score.orderBy([asc("window.start"), desc("Score >9"), desc("Score 8-9")])

above_9 = ordered_df.select("service", "Score >9") \
    .groupBy("service") \
    .sum() \
    .select(col("service").alias("key"), col("sum(Score >9)").alias("value")) \
    .orderBy(desc("value")) \
    .withColumn("value", col("value").cast(StringType()))
between_8_9 = ordered_df.select("service", "Score 8-9") \
    .groupBy("service") \
    .sum() \
    .select(col("service").alias("key"), col("sum(Score 8-9)").alias("value")) \
    .orderBy(desc("value")) \
    .withColumn("value", col("value").cast(StringType()))

top10_df = ordered_df.limit(3)    

query_above_9 = above_9 \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("checkpointLocation", "/home/jovyan/checkpoint/assignment/top_movies") \
    .option("topic", "top_movies") \
    .outputMode("complete") \
    .start()

query_between_8_9 = between_8_9 \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("checkpointLocation", "/home/jovyan/checkpoint/assignment/subtop_movies") \
    .option("topic", "subtop_movies") \
    .outputMode("complete") \
    .start()
try:
    query_above_9.awaitTermination()
    query_between_8_9.awaitTermination()
except KeyboardInterrupt:
    query_above_9.stop()
    query_between_8_9.stop()
    
    # Stop the spark context
    spark.stop()
    print("Stopped the streaming query and the spark context")

+---------+--------------------+-----+--------------------+-------------+--------+-----------+
|       id|               title| type|              genres|averageRating|numVotes|releaseYear|
+---------+--------------------+-----+--------------------+-------------+--------+-----------+
|       id|               title| type|              genres|         NULL|    NULL|       NULL|
|tt0000009|          Miss Jerry|Movie|             Romance|          5.4|     215|       1894|
|tt0000147|The Corbett-Fitzs...|Movie|Documentary, News...|          5.2|     539|       1897|
|tt0000502|            Bohemios|Movie|                NULL|          4.4|      18|       1905|
|tt0000574|The Story of the ...|Movie|Action, Adventure...|          6.0|     941|       1906|
|tt0000591|    The Prodigal Son|Movie|               Drama|          5.7|      28|       1907|
|tt0000615|  Robbery Under Arms|Movie|               Drama|          4.3|      27|       1907|
|tt0000630|              Hamlet|Movie|            

In [73]:
# Stop the spark context
spark.stop()
print("Stopped the spark context")

Stopped the spark context
