## ADD spark streams

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages \
        org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.5 \
        pyspark-shell'

In [2]:
import json
from pyspark import SparkContext, Row
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [3]:
def get_spark_session(sparkConf):
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']

In [4]:
def data_processing(time, rdd):
    
    print(f"===========-----> {str(time)} <-----===========")

    try:
        spark = get_spark_session(rdd.context.getConf())
        
        filtered_list = ['Click', 'Like', 'Complete']

        row_rdd = rdd \
                .map(lambda r_json: Row(epk_id=r_json['epk_id'],
                                        content_id=r_json['content_id'],
                                        event_type=r_json['event_type'],
                                        event_ts=r_json['event_ts'],
                                        insert_ts=r_json['insert_ts'])) \
                .filter(lambda row: row['event_type'] in filtered_list)
                                       
        result_df = spark.createDataFrame(row_rdd)
        result_df.createOrReplaceTempView("treasury_stream")

        result_df.show(n=3)

        # Insert into DB
        try:
            # From database.env
            db_host = os.environ['POSTGRES_HOST']
            db_name = os.environ['POSTGRES_DB']
            db_user = os.environ['POSTGRES_USER']
            db_pass = os.environ['POSTGRES_PASSWORD']
            db_table = 'clickstream_filtered'
            
            testResultDataFrame.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", 'org.postgresql.Driver') \
                .option("url", f"jdbc:postgresql://{db_host}:5432/{db_name}") \
                .option("dbtable", db_table) \
                .option("user", db_user) \
                .option("password", db_pass) \
                .save()

        except Exception as e:
            print("--> Opps! It seems an Errrorrr with DB working!", e)

    except Exception as e:
        print("--> Opps! Is seems an Error!!!", e)

In [5]:
def create_context(kafka_server, kafka_topic):

    sc = SparkContext(appName="PythonStreamingKafka")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 5)

    try:
        directKafkaStream = KafkaUtils.createDirectStream(ssc,
                                        [kafka_topic],
                                        {"metadata.broker.list": kafka_server})
    except:
        raise ConnectionError(f"Kafka error: Connection refused: \
                            broker_list={kafka_server} topic={kafka_topic}")
        
    parsed_lines = directKafkaStream.map(lambda v: json.loads(v[1]))

    # RDD handling
    parsed_lines.foreachRDD(data_processing)

    return ssc

In [9]:
server = 'kafka:9093'
topic = 'clickstream'
output_path = '/tmp/spark/checkpoint_01'

ssc = StreamingContext.getOrCreate(output_path, lambda: create_context(server, topic))
ssc.start()
# ssc.awaitTermination()

--> Opps! Is seems an Error!!! RDD is empty
--> Opps! Is seems an Error!!! RDD is empty
+----------+------+----------+----------+-------------------+
|content_id|epk_id|  event_ts|event_type|          insert_ts|
+----------+------+----------+----------+-------------------+
|      3433|   411|1586097750|     Click|1.586272772301876E9|
|      3425|  3461|1585097759|     Click|1.586272772313592E9|
|       811|  7749|1585110764|     Click|1.586272772315791E9|
+----------+------+----------+----------+-------------------+

--> Opps! It seems an Errrorrr with DB working! 'POSTGRES_HOST'
+----------+------+----------+----------+-------------------+
|content_id|epk_id|  event_ts|event_type|          insert_ts|
+----------+------+----------+----------+-------------------+
|      1872|  3955|1586269349|  Complete|1.586272775338701E9|
|       285|  1608|1586242337|     Click|1.586272775356217E9|
|        82|  3237|1585926578|     Click|1.586272775370792E9|
+----------+------+----------+----------+

## Clean up

In [None]:
ssc.stop()

In [8]:
!rm -rdf /tmp/spark/checkpoint_01

In [12]:
globals()['sparkSessionSingletonInstance'].stop()
del(globals()['sparkSessionSingletonInstance'])