src/main/scala/io/snappydata/benchmark/KafkaSnappyIngestionPerf.scala

/*
 * Copyright (c) 2016 SnappyData, Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License. See accompanying
 * LICENSE file.
 */

package io.snappydata.benchmark

import io.snappydata.adanalytics.Configs._
import org.apache.spark.SparkContext
import org.apache.spark.streaming.SnappyStreamingContext

/**
  * Simple direct kafka spark streaming program which pulls log messages
  * from kafka broker and ingest those log messages to Snappy store.
  */
object KafkaSnappyIngestionPerf extends App {

  val sparkConf = new org.apache.spark.SparkConf()
    .setAppName(getClass.getSimpleName)
    .set("spark.sql.inMemoryColumnarStorage.compressed", "false")
    .set("spark.sql.inMemoryColumnarStorage.batchSize", "2000")
    .set("spark.streaming.kafka.maxRatePerPartition" , s"$maxRatePerPartition")
    //.setMaster("local[*]")
    .setMaster(s"$snappyMasterURL")

  val assemblyJar = System.getenv("PROJECT_ASSEMBLY_JAR")
  if (assemblyJar != null) {
    sparkConf.set("spark.driver.extraClassPath", assemblyJar)
    sparkConf.set("spark.executor.extraClassPath", assemblyJar)
  }

  sparkConf.set("spark.driver.extraJavaOptions", "-Dgemfire.tombstone-gc-threshold=5000")
  sparkConf.set("spark.executor.extraJavaOptions", "-Dgemfire.tombstone-gc-threshold=5000")

  val sc = new SparkContext(sparkConf)
  val snsc = new SnappyStreamingContext(sc, batchDuration)

  snsc.sql("drop table if exists adImpressions")
  snsc.sql("drop table if exists adImpressionStream")

  // Create a stream of AdImpressionLog which will pull the log messages
  // from Kafka broker
  snsc.sql("create stream table adImpressionStream (" +
    " time_stamp timestamp," +
    " publisher string," +
    " advertiser string," +
    " website string," +
    " geo string," +
    " bid double," +
    " cookie string) " +
    " using directkafka_stream options (" +
    " rowConverter 'io.snappydata.adanalytics.AdImpressionToRowsConverter' ," +
    s" kafkaParams 'metadata.broker.list->$brokerList'," +
    s" topics '$kafkaTopic'," +
    " K 'java.lang.String'," +
    " V 'io.snappydata.adanalytics.AdImpressionLog', " +
    " KD 'kafka.serializer.StringDecoder', " +
    " VD 'io.snappydata.adanalytics.AdImpressionLogAvroDecoder')")

  snsc.sql("create table adImpressions(times_tamp timestamp, publisher string, " +
    "advertiser string, website string, geo string, bid double, cookie string) " +
    "using column " +
    "options ( buckets '29', persistent 'asynchronous')")

  // Save the streaming data to snappy store per second (btachDuration)
  snsc.getSchemaDStream("adImpressionStream")
    .foreachDataFrame(_.write.insertInto("adImpressions"))

  snsc.start
  snsc.awaitTermination
}