# Init SparkSession

In [None]:
import os
from datetime import datetime
from pyspark import SparkContext, HiveContext
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as sf

In [None]:
# Build (or reuse) the session bound to the standalone cluster master. The
# application name embeds the launch timestamp so separate runs can be told apart.
spark = (
    SparkSession.builder.appName("tpch-benchmark-{}".format(datetime.today()))
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# SQLContext takes a SparkContext as its first argument (not a SparkSession);
# pass the session explicitly as well so the legacy entry point wraps this
# exact session instead of looking one up on its own.
sqlContext = SQLContext(spark.sparkContext, spark)

# Load data and create catalog tables

In [None]:
# Print the databases visible to the current session.
databases_df = spark.sql("SHOW databases")
databases_df.show()


In [None]:
# Print the tables/views registered before any data is loaded.
tables_df = spark.sql("SHOW tables")
tables_df.show()

In [None]:
# Source files to load, one per table, named "h_<table>.<ext>".
# Region is the only ".csv" file; every other table is a ".dsv" file.
ls_files = [
    "h_{}.{}".format(table, "csv" if table == "region" else "dsv")
    for table in (
        "customer",
        "lineitem",
        "nation",
        "order",
        "part",
        "partsupp",
        "region",
        "supplier",
    )
]

In [None]:
# Root location of the raw data files in object storage.
base_path = "s3a://warehouse/tpch_data"

# Date columns are shifted back by one century after parsing.
# NOTE(review): presumably needed because the two-digit "yy" years resolve
# to 20yy instead of 19yy - confirm against the Spark version in use.
century_in_months = 12 * 100

for file in ls_files:
    # "h_customer.dsv" -> stem "h_customer", extension "dsv"; splitting once
    # from the right keeps stems that contain dots intact.
    file_name, ext = file.rsplit(".", 1)
    # Drop the leading prefix ("h") and keep everything after the first
    # underscore as the table name.
    _prefix, table_name = file_name.split("_", 1)
    print("Loading", table_name)

    # ".csv" files are comma separated, everything else is pipe separated.
    delimiter = "," if ext == "csv" else "|"
    df_data = (
        spark.read.option("delimiter", delimiter)
        .option("header", True)
        .option("inferSchema", True)
        # Object-store URIs always use "/" - do not rely on the OS path separator.
        .csv("{}/{}".format(base_path, file))
    )

    # Convert every column whose name mentions "date" from "dd.MM.yy" text to
    # a real date, then move it back 100 calendar years. Month arithmetic is
    # used because a fixed 365 * 100 days ignores leap days and would land
    # roughly 25 days late.
    for column_name in df_data.columns:
        if "date" in column_name.lower():
            parsed_date = sf.to_date(column_name, "dd.MM.yy")
            df_data = df_data.withColumn(
                column_name, sf.add_months(parsed_date, -century_in_months)
            )

    # Show the resulting schema and a small sample for a visual sanity check.
    df_data.printSchema()
    display(df_data.limit(5).toPandas())

    # Expose the DataFrame to Spark SQL under the bare table name.
    df_data.createOrReplaceTempView(table_name)

In [None]:
# Print the tables/views again now that the temporary views are registered.
loaded_tables_df = spark.sql("SHOW TABLES")
loaded_tables_df.show()

# Benchmark

## 1. Group by, Order by

In [None]:
%%timeit -r 4

sql_stm = """
-- 1
select
	l_returnflag,
	l_linestatus,
	sum(l_quantity) as sum_qty,
	sum(l_extendedprice) as sum_base_price,
	sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
	sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
	avg(l_quantity) as avg_qty,
	avg(l_extendedprice) as avg_price,
	avg(l_discount) as avg_disc,
	count(*) as count_order
from
	lineitem
group by
	l_returnflag,
	l_linestatus
order by
	l_returnflag,
	l_linestatus;
"""
spark.sql(sql_stm).limit(20).toPandas()

## Compress and repartition files

In [None]:
# Rewrite lineitem as Parquet, split into six partitions. Overwrite mode keeps
# this cell re-runnable: the default save mode raises when the target path
# already exists from a previous run.
(
    spark.sql("select * from lineitem")
    .repartition(6)
    .write.mode("overwrite")
    .parquet("s3a://warehouse/outputs/lineitem")
)

In [None]:
# Read the Parquet copy of lineitem back and expose it to Spark SQL under a
# separate view name.
lineitem_parquet_path = "s3a://warehouse/outputs/lineitem"
df_data = spark.read.parquet(lineitem_parquet_path)
df_data.createOrReplaceTempView("lineitem_parts")

In [None]:
%%timeit -r 4

sql_stm = """
-- 1
select
	l_returnflag,
	l_linestatus,
	sum(l_quantity) as sum_qty,
	sum(l_extendedprice) as sum_base_price,
	sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
	sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
	avg(l_quantity) as avg_qty,
	avg(l_extendedprice) as avg_price,
	avg(l_discount) as avg_disc,
	count(*) as count_order
from
	lineitem_parts
group by
	l_returnflag,
	l_linestatus
order by
	l_returnflag,
	l_linestatus;
"""
spark.sql(sql_stm).limit(20).toPandas()

## 2. Join, Group by, Order by

In [None]:
%%timeit -r 4

sql_stm = """
--3
select
	l_orderkey,
	sum(l_extendedprice * (1 - l_discount)) as revenue,
	o_orderdate,
	o_shippriority
from
	customer,
	order,
	lineitem
where
	c_mktsegment = 'AUTOMOBILE'
	and c_custkey = o_custkey
	and l_orderkey = o_orderkey
group by
	l_orderkey,
	o_orderdate,
	o_shippriority
order by
	revenue desc,
	o_orderdate
"""
spark.sql(sql_stm).show()

## 3. Sub queries, Join, Group by, Order by

In [None]:
%%timeit -r 4

sql_stm = """
select
	supp_nation,
	cust_nation,
	l_year,
	sum(volume) as revenue
from
	(
		select
			n1.n_name as supp_nation,
			n2.n_name as cust_nation,
			extract(year from l_shipdate) as l_year,
			l_extendedprice * (1 - l_discount) as volume
		from
			supplier,
			lineitem,
			order,
			customer,
			nation n1,
			nation n2
		where
			s_suppkey = l_suppkey
			and o_orderkey = l_orderkey
			and c_custkey = o_custkey
			and s_nationkey = n1.n_nationkey
			and c_nationkey = n2.n_nationkey
			and l_shipdate between date '1995-01-01' and date '1996-12-31'
	) as shipping
group by
	supp_nation,
	cust_nation,
	l_year
order by
	supp_nation,
	cust_nation,
	l_year;
"""
spark.sql(sql_stm).show()