# Data Processing

## Data Processing using Scala

 * Produce quarterly event reports by comparing the event data exported from Holocron (JSON) with the event data available from Legal and Marketing (CSV)
 * Output is required in Parquet for further processing


In [None]:
// Data processing
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{date_format, sum, round}

// OS-level user name of the process, taken from the JVM system properties.
val username = System.getProperty("user.name")

// Create the Spark session for this report, or reuse one that already exists.
val spark = {
    val sessionBuilder = SparkSession.builder().
        appName("event_reporting_quarterly").
        master("yarn").
        // Port "0" asks Spark to bind the UI to any free port.
        config("spark.ui.port", "0")
    sessionBuilder.getOrCreate()
}

// Use two partitions for shuffles (joins and aggregations) in this session.
spark.conf.set("spark.sql.shuffle.partitions", "2")

// Brings the $"column" interpolator and the Dataset encoders into scope.
import spark.implicits._

// Read the reporting team's events from CSV using an explicit schema.
// The schema is declared as a single DDL string, matching the style used for
// the Holocron reader.
// NOTE(review): "path/file" looks like a placeholder - confirm the real
// input location before running.
val events_tno = "path/file"
val events = spark.
    read.
    schema("""event_id INT, event_date TIMESTAMP,
              event_reference_id INT, event_status STRING,
              event_payment_status STRING, event_refund FLOAT
           """).
    csv(events_tno)
    
// Read the Holocron event export from JSON using an explicit schema.
// Scala continues a method chain on a trailing dot; no backslash line
// continuation is needed (or allowed).
// NOTE(review): "filepath/file" looks like a placeholder - confirm the real
// input location before running.
val events_hc = spark.
    read.
    option("inferSchema", "false").
    schema("""event_id INT, event_date TIMESTAMP,
              event_reference_id INT, event_status STRING
           """).
    format("json").
    load("filepath/file")

// Build the refund report:
//   1. keep reporting-team events whose status is COMPLETE or CLOSED,
//   2. keep only those that also appear in the Holocron export (inner join on
//      event_reference_id),
//   3. total the refunds per calendar month (yyyyMM), rounded to 2 decimals,
//   4. write the result as a single uncompressed Parquet file.
//
// NOTE(review): the inner join emits one row per matching Holocron record, so
// duplicate event_reference_id values in the export would count a refund more
// than once - confirm the export is unique per reference id.
// NOTE(review): rows are grouped by month although the job is named
// "event_reporting_quarterly" - confirm the intended granularity.
events.
    filter("event_status in ('COMPLETE', 'CLOSED')").
    join(events_hc, events("event_reference_id") === events_hc("event_reference_id")).
    // Both inputs carry an event_date column, so it must be qualified after
    // the join; an unqualified reference is ambiguous. The reporting team's
    // date is used here, alongside its event_refund - assumed intent, confirm.
    groupBy(date_format(events("event_date"), "yyyyMM").alias("event_month")).
    agg(round(sum(events("event_refund")), 2).alias("total_refund")).
    orderBy("event_month").
    // Collapse to one partition so that a single output file is produced.
    coalesce(1).
    write.
    mode("overwrite").
    option("compression", "none").
    format("parquet").
    save("output_file")