## Connecting AWS Kinesis to Databricks

In [None]:
# pyspark functions
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# URL processing
import urllib

# Location of the Delta table that stores the AWS credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Load the credentials table into a Spark DataFrame
access_key_df = spark.read.format("delta").load(delta_table_path)

# Fetch both credential columns from the same row with a single Spark job,
# so the key ID and the secret are guaranteed to belong together.
credentials_row = access_key_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    # Fail with a clear message instead of an opaque IndexError on an empty table
    raise ValueError(f"No AWS credentials found in Delta table {delta_table_path}")

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']

# `import urllib` alone does not guarantee that the `parse` submodule is loaded,
# so import it explicitly before using it.
import urllib.parse

# URL-encoded copy of the secret key; safe="" means "/" is percent-encoded too
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

### Creating Schema 

In [None]:
# Schema used to parse the JSON payload of the pin stream.
# Built field by field; every field keeps the default nullable=True.
schema_pin = (
    StructType()
    .add("index", IntegerType())
    .add("unique_id", StringType())
    .add("title", StringType())
    .add("description", StringType())
    .add("poster_name", StringType())
    .add("follower_count", StringType())
    .add("tag_list", StringType())
    .add("is_image_or_video", StringType())
    .add("image_src", StringType())
    .add("downloaded", IntegerType())
    .add("save_location", StringType())
    .add("category", StringType())
)

In [None]:
# Schema used to parse the JSON payload of the user stream.
# NOTE(review): date_joined is declared as DateType (no time-of-day part);
# confirm the incoming JSON does not carry a full timestamp.
schema_user = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("index", IntegerType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("age", StringType()),
        ("date_joined", DateType()),
    )
])

In [None]:
# Schema used to parse the JSON payload of the geo stream.
# The timestamp is read as a plain string here.
schema_geo = (
    StructType()
    .add("index", IntegerType())
    .add("country", StringType())
    .add("timestamp", StringType())
    .add("latitude", FloatType())
    .add("longitude", FloatType())
)

In [None]:
## Import the cleaning functions by running the batch_data_processing_in_databricks notebook
%run "/Users/ugwuegbe@gmail.com/batch_data_processing_in_databricks"
# Functions called later in this notebook, expected to be defined by the notebook run above:
#   cleaning_of_pin_data(df_pin)
#   cleaning_of_geo_data(df_geo)
#   cleaning_of_user_data(df_user)

## Reading Stream data from AWS Kinesis

In [None]:
# Open a streaming reader on the Kinesis pin stream, starting from the latest records
df_pin = (
    spark.readStream
    .format('kinesis')
    .options(
        streamName='streaming-0e4753f224a7-pin',
        initialPosition='latest',
        region='us-east-1',
        awsAccessKey=ACCESS_KEY,
        awsSecretKey=SECRET_KEY,
    )
    .load()
)

In [None]:
# Build df_pin_stream from df_pin: cast the raw Kinesis `data` payload to a string,
# parse it as JSON with schema_pin, and expand the parsed struct into top-level columns.
df_pin_stream = (
    df_pin
    .select(from_json(df_pin["data"].cast("string"), schema_pin).alias("parsedJson"))
    .select("parsedJson.*")
)

## Cleaning of pin data

In [None]:
# Run the parsed pin stream through the imported pin cleaning function
df_pin_stream = df_pin_stream.transform(cleaning_of_pin_data)

## Write the cleaned stream pin data to delta table 

In [None]:
# Append the cleaned pin stream to its Delta table.
# Every streaming query must own its checkpoint directory; sharing one directory
# between the pin, geo and user queries mixes up their offsets and state, so this
# query gets a pin-specific location.
(
    df_pin_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/pin/_checkpoints/")
    .table("0e4753f224a7_pin_table")
)

In [None]:
# Open a streaming reader on the Kinesis geo stream, starting from the latest records
df_geo = (
    spark.readStream
    .format('kinesis')
    .options(
        streamName='streaming-0e4753f224a7-geo',
        initialPosition='latest',
        region='us-east-1',
        awsAccessKey=ACCESS_KEY,
        awsSecretKey=SECRET_KEY,
    )
    .load()
)

In [None]:
# Build df_geo_stream from df_geo: cast the raw Kinesis `data` payload to a string,
# parse it as JSON with schema_geo, and expand the parsed struct into top-level columns.
df_geo_stream = (
    df_geo
    .select(from_json(df_geo["data"].cast("string"), schema_geo).alias("parsedJson"))
    .select("parsedJson.*")
)

## Cleaning of geo Stream data

In [None]:
# Run the parsed geo stream through the imported geo cleaning function
df_geo_stream = df_geo_stream.transform(cleaning_of_geo_data)

## Write the cleaned stream geo data to delta table

In [None]:
# Append the cleaned geo stream to its Delta table.
# The cleaned frame is df_geo_stream (no df_geo1 is defined in this notebook).
# Every streaming query must own its checkpoint directory, so this query gets a
# geo-specific location instead of sharing one with the pin and user queries.
# NOTE(review): the table name ends in "_tables" while the pin and user tables
# end in "_table" - confirm whether this is intentional before renaming.
(
    df_geo_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/geo/_checkpoints/")
    .table("0e4753f224a7_geo_tables")
)

In [None]:
# Open a streaming reader on the Kinesis user stream, starting from the latest records
df_user = (
    spark.readStream
    .format('kinesis')
    .options(
        streamName='streaming-0e4753f224a7-user',
        initialPosition='latest',
        region='us-east-1',
        awsAccessKey=ACCESS_KEY,
        awsSecretKey=SECRET_KEY,
    )
    .load()
)

In [None]:
# Build df_user_stream from df_user: cast the raw Kinesis payload to a string,
# parse it as JSON with schema_user, and expand the parsed struct into top-level columns.
# The payload column is referenced as "data", matching the pin and geo cells; the
# previous "Data" spelling only resolved while Spark's column matching is case-insensitive.
df_user_stream = df_user \
    .withColumn("jsonData", df_user["data"].cast("string")) \
    .withColumn("parsedJson", from_json("jsonData", schema_user)) \
    .select("parsedJson.*")

## Cleaning of user stream data

In [None]:
# Run the parsed user stream through the imported user cleaning function
df_user_stream = df_user_stream.transform(cleaning_of_user_data)

## Writing the cleaned streaming user data to delta table

In [None]:
# Append the cleaned user stream to its Delta table.
# The cleaned frame is df_user_stream (no df_user1 is defined in this notebook).
# Every streaming query must own its checkpoint directory, so this query gets a
# user-specific location instead of sharing one with the pin and geo queries.
(
    df_user_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/kinesis/user/_checkpoints/")
    .table("0e4753f224a7_user_table")
)