# Log File - Data Transformation and Ingestion

### Loading Libraries
##### Spark session, DataFrame functions, data types, and JSON

In [None]:
import json
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Variables Initialization

In [None]:
# Connection settings for Cassandra and Kafka.
# Each endpoint/credential can be overridden through an environment variable so
# that real secrets never have to be committed with the notebook; the fallback
# values are the ones this notebook has always used.
cassandra_host = os.environ.get("CASSANDRA_HOST", "cassandra")
cassandra_user = os.environ.get("CASSANDRA_USER", "cassandra")
cassandra_pwd  = os.environ.get("CASSANDRA_PASSWORD", "cassandra")
cassandra_port = int(os.environ.get("CASSANDRA_PORT", "9042"))
# Keyspace/table names. NOTE(review): the Cassandra write step in this notebook
# uses the lowercase literals "loganalysis"/"nasalog" rather than these values.
key_space      = "LogAnalysis"
table_name     = "NASALog"
kafka_server   = os.environ.get("KAFKA_SERVER", "kafka:9092")
kafka_topic    = os.environ.get("KAFKA_TOPIC", "nasa_logs_demo")

### Spark Session
##### Create the Spark session, configured with the DataStax spark-cassandra connector, the Kafka source package, and the Cassandra connection credentials.

In [None]:
# Spark session configured to read from Kafka and write to Cassandra.
# The Maven coordinates below are resolved at session start-up: the Kafka
# structured-streaming source plus the DataStax Cassandra connector and driver.
spark_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0",
    "com.datastax.spark:spark-cassandra-connector_2.12:3.0.0",
    "com.datastax.spark:spark-cassandra-connector-driver_2.12:3.0.0",
])

spark = (
    SparkSession.builder.appName("pyspark-notebook")
    .config("spark.jars.packages", spark_packages)
    .config("spark.cassandra.connection.host", cassandra_host)
    # Pass the configured port explicitly instead of silently relying on the
    # connector's built-in default.
    .config("spark.cassandra.connection.port", str(cassandra_port))
    .config("spark.cassandra.auth.username", cassandra_user)
    .config("spark.cassandra.auth.password", cassandra_pwd)
    .getOrCreate()
)

### Get data from Kafka with Schema
##### Read data from the Kafka topic via the Spark Structured Streaming API by providing the Kafka server and topic details.

In [None]:
# Read the raw log records from the Kafka topic and project them into columns.

# Second dot-separated segment of the URL, used as the file extension.
# Raw string: "\." in a normal string literal is an invalid Python escape.
split_logic = split(col("url"), r"\.").getItem(1)

# Kafka delivers `value` as binary; cast it to a string explicitly and split the
# comma-separated record once, instead of repeating the split for every column.
log_fields = split(col("value").cast("string"), ",")

log_data = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .load()
    # Field 0 of each record is not used.
    # NOTE(review): presumably a row index written by the producer - confirm.
    .select(
        log_fields.getItem(1).alias("host"),
        log_fields.getItem(2).alias("time"),
        log_fields.getItem(3).alias("method"),
        log_fields.getItem(4).alias("url"),
        log_fields.getItem(5).alias("response"),
        log_fields.getItem(6).alias("bytes"),
    )
    .withColumn("time_added", unix_timestamp())
    # URLs without a dot have no second segment; store the literal string "None".
    .withColumn("extension", when(split_logic.isNull(), "None").otherwise(split_logic))
)

### Foreach Batch method
##### This method is called from the Spark foreachBatch sink and writes to the Cassandra database. It takes a micro-batch (DataFrame) and its unique id as input.

In [None]:
def process_row(df, epoch_id):
    """Writes one micro-batch to Cassandra (hot path) and to HDFS as CSV (cold path).

    The batch is cached while both writes run, because writing the same
    DataFrame to several sinks can otherwise recompute it (including re-reading
    the input) for every write. The cache is released even if a write fails.

    Parameters
    ----------
    df : DataFrame
        Micro-batch DataFrame handed over by the foreachBatch sink
    epoch_id : int
        Unique id for each micro batch/epoch (not used by this function)
    """
    df.persist()
    try:
        df.write\
        .format("org.apache.spark.sql.cassandra")\
        .mode('append')\
        .options(table="nasalog", keyspace="loganalysis")\
        .save() #hot path
        df.write.csv("hdfs://namenode:8020/output/nasa_logs/",mode="append") #cold path
    finally:
        # Always drop the cached batch so executors do not accumulate old batches.
        df.unpersist()

### Cassandra Sink
##### Writes the stream of new data to Cassandra using the foreachBatch sink, continuously, until an interruption occurs. Processed offsets are stored at a checkpoint location so that messages that were already processed are not processed again.

In [None]:
# Start the streaming query: every micro-batch of log_data is passed to
# process_row through the foreachBatch sink. Progress is tracked under the
# checkpoint location.
query = (
    log_data.writeStream
    .option("checkpointLocation", "checkpoint/data")
    .foreachBatch(process_row)
    .start()
)

# Block until the query ends. Interrupting the cell only unblocks this Python
# call, so stop the query explicitly; this avoids leaving it running in the
# background and ends the cell cleanly instead of with a traceback.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    query.stop()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-09b07ac03c09>", line 5, in <module>
    .foreachBatch(process_row) \
  File "/usr/local/lib/python3.7/dist-packages/pyspark/sql/streaming.py", line 103, in awaitTermination
    return self._jsq.awaitTermination()
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1303, in __call__
    answer = self.gateway_client.send_command(command)
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py", line 1200, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During h


KeyboardInterrupt

