### Creating some global variables

In [9]:
kafka_broker = "localhost:9092" # Kafka bootstrap server (host:port), used as "kafka.bootstrap.servers" when reading topics. TODO: yet to figure out how the endpoint links to Spark
bucket_prefix = "my-company-bucket-prefix-" # Prefix string for bucket names; its usage is not visible in this section -- TODO confirm

### Setting up Locations

To write the Delta table, we need 3 settings: the location of the delta table, the location of the checkpoints and the location of the schema file.

In [10]:
# path_to_bucket = "/mnt/10ac-batch-5/week9/g3"
# Root folder that holds everything belonging to the speech-to-text Delta table
bucket = "/mnt/10ac-batch-5/week9/g3/speech-to-text-delta"

# Paths derived from the root: the table itself, the checkpoints, and the schema file
delta_location = f"{bucket}/delta-table"
checkpoint_location = f"{bucket}/checkpoints"
schema_location = f"{bucket}/kafka_schema.json"


### The Schema
- We assume the streaming data from Kafka is in JSON format. To read this data into Spark properly, we have to provide a schema.
- For efficiency, we will infer the schema once and save it to an S3 location, so that every time we save data into the Delta Lake we only have to read the saved schema rather than re-inferring it.

In [11]:
# ! pip install pyspark==3.3.0

In [15]:
## Making necessary imports
import json, os, re

# from delta.tables import *

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

from delta.tables import *


ModuleNotFoundError: No module named 'delta'

In [16]:
##Method to infer the schema of kafka topic and return it in the json format

def infer_topic_schema_json(topic):
    """Infer the schema of the JSON messages stored in a Kafka topic.

    Performs a batch (non-streaming) read of the topic from the earliest to
    the latest offset, keeps the highest-offset message per key, lets Spark
    infer a schema from those JSON payloads, and returns that schema
    serialized as JSON.

    Relies on the notebook-level names ``spark`` (an active SparkSession,
    presumably provided by the runtime -- confirm) and ``kafka_broker``.

    Args:
        topic: Name of the Kafka topic to subscribe to.

    Returns:
        str: The inferred schema as a JSON string (``StructType.json()``),
        with the ``_corrupt_record`` column removed if Spark added one.
    """
    df_json = (spark.read
               .format("kafka")
               .option("kafka.bootstrap.servers", kafka_broker)
               .option("subscribe", topic)
               .option("startingOffsets", "earliest")
               .option("endingOffsets", "latest")
               .option("failOnDataLoss", "false")
               .load()
               # cast the payload to string and filter out empty (null) values
               .withColumn("value", expr("string(value)"))
               .filter(col("value").isNotNull())
               # get latest version of each record: structs compare field by
               # field, so max(struct(offset, value)) picks the highest offset
               .select("key", expr("struct(offset, value) r"))
               .groupBy("key").agg(expr("max(r) r"))
               .select("r.value"))

    # decode the json values; the schema is inferred from the payload strings
    df_read = spark.read.json(
      df_json.rdd.map(lambda x: x.value), multiLine=True)

    # drop corrupt records: keep only rows that parsed cleanly (the corrupt
    # column is null for those), then remove the helper column so it does not
    # leak into the returned schema
    if "_corrupt_record" in df_read.columns:
        df_read = (df_read
                   .filter(col("_corrupt_record").isNull())
                   .drop("_corrupt_record"))

    return df_read.schema.json()