# Init SparkContext

In [2]:
import os
from datetime import datetime
from pyspark import SparkContext, HiveContext
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as sf

In [3]:
# Build the SparkSession used by every cell below. The application name
# carries the current timestamp so separate benchmark runs can be told apart.
spark = (
    SparkSession.builder
    .appName("tpch-benchmark-{}".format(datetime.today()))
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession, so hand it the session's underlying context and pass the
# session itself as the second argument.
sqlContext = SQLContext(spark.sparkContext, spark)



# Load data

In [4]:
def convert_dates(df_data):
    """Parse every date-like column of ``df_data`` and move it back one century.

    A column is treated as a date column when its name contains ``"date"``
    (case-insensitive). Each such column is parsed with the ``dd.MM.yy``
    pattern and then shifted back by exactly 100 calendar years.

    Parameters
    ----------
    df_data : pyspark.sql.DataFrame
        Frame whose date-like columns should be converted.

    Returns
    -------
    pyspark.sql.DataFrame
        A new frame with the matching columns replaced; all other columns are
        left untouched.
    """
    for name in df_data.columns:
        if "date" in name.lower():
            parsed = sf.to_date(sf.col(name), "dd.MM.yy")
            # NOTE(review): presumably the two-digit year is parsed into the
            # 2000s and has to be pulled back into the 1900s - confirm against
            # the raw files.
            # Shift by calendar months rather than by 365 * 100 days: a fixed
            # day count ignores leap days and lands ~25 days off target.
            df_data = df_data.withColumn(name, sf.add_months(parsed, -12 * 100))
    return df_data

In [None]:
%%time
base_path = "s3a://warehouse/tpch_data"
df_lineitem = (
    spark.read.option("delimiter", "|")
    .option("header", True)
    .option("inferSchema" , True)
    .csv(os.path.join(base_path, "h_lineitem.dsv"))
)
df_lineitem = convert_dates(df_lineitem)

df_customer = (
    spark.read.option("delimiter", "|")
    .option("header", True)
    .option("inferSchema" , True)
    .csv(os.path.join(base_path, "h_customer.dsv"))
)
df_customer = convert_dates(df_customer)

df_order = (
    spark.read.option("delimiter", "|")
    .option("header", True)
    .option("inferSchema" , True)
    .csv(os.path.join(base_path, "h_order.dsv"))
)
df_order = convert_dates(df_order)

df_supplier = (
    spark.read.option("delimiter", "|")
    .option("header", True)
    .option("inferSchema" , True)
    .csv(os.path.join(base_path, "h_supplier.dsv"))
)
df_supplier = convert_dates(df_supplier)

df_nation = (
    spark.read.option("delimiter", "|")
    .option("header", True)
    .option("inferSchema" , True)
    .csv(os.path.join(base_path, "h_nation.dsv"))
)
df_nation = convert_dates(df_nation)

In [None]:
# This step crashed my machine (the data could not be loaded), so the exercises below have not been run either.

# Benchmark

## 1. Group by, Order by

```sql
select
	l_returnflag,
	l_linestatus,
	sum(l_quantity) as sum_qty,
	sum(l_extendedprice) as sum_base_price,
	sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
	sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
	avg(l_quantity) as avg_qty,
	avg(l_extendedprice) as avg_price,
	avg(l_discount) as avg_disc,
	count(*) as count_order
from
	lineitem
group by
	l_returnflag,
	l_linestatus
order by
	l_returnflag,
	l_linestatus;
```    

In [None]:
# Query 1: pricing summary per (l_returnflag, l_linestatus).
# Row-level expressions are built first so that the aggregates sum the
# *product* (price * discount factor), exactly as the SQL does, instead of
# multiplying an already-aggregated sum.
disc_price = sf.col("l_extendedprice") * (1 - sf.col("l_discount"))
charge = disc_price * (1 + sf.col("l_tax"))

sub = (
    df_lineitem.groupBy("l_returnflag", "l_linestatus")
    .agg(
        # Use the Spark SQL aggregate functions (sf.*), not Python builtins.
        sf.sum("l_quantity").alias("sum_qty"),
        sf.sum("l_extendedprice").alias("sum_base_price"),
        sf.sum(disc_price).alias("sum_disc_price"),
        sf.sum(charge).alias("sum_charge"),
        sf.avg("l_quantity").alias("avg_qty"),
        sf.avg("l_extendedprice").alias("avg_price"),
        sf.avg("l_discount").alias("avg_disc"),
        sf.count("*").alias("count_order"),
    )
    .orderBy("l_returnflag", "l_linestatus")
)
sub.show()

## 2. Join, Group By, Order By

```sql
select
	l_orderkey,
	sum(l_extendedprice * (1 - l_discount)) as revenue,
	o_orderdate,
	o_shippriority
from
	customer,
	orders,
	lineitem
where
	c_mktsegment = 'AUTOMOBILE'
	and c_custkey = o_custkey
	and l_orderkey = o_orderkey
group by
	l_orderkey,
	o_orderdate,
	o_shippriority
order by
	revenue desc,
	o_orderdate
```    

In [None]:
# Query 2: revenue per order for customers in the AUTOMOBILE market segment.
# The orders table is loaded as `df_order` in the "Load data" section.
revenue = sf.sum(
    sf.col("l_extendedprice") * (1 - sf.col("l_discount"))
).alias("revenue")

df = (
    df_lineitem
    .join(df_order, sf.col("l_orderkey") == sf.col("o_orderkey"))
    .join(df_customer, sf.col("c_custkey") == sf.col("o_custkey"))
    .filter(sf.col("c_mktsegment") == "AUTOMOBILE")
    .groupBy("l_orderkey", "o_orderdate", "o_shippriority")
    .agg(revenue)
    # Present the columns in the same order as the SQL select list.
    .select("l_orderkey", "revenue", "o_orderdate", "o_shippriority")
    .orderBy(sf.col("revenue").desc(), sf.col("o_orderdate").asc())
)
df.show()

## 3. Sub queries, Join, Group by, Order by

```sql
select
	supp_nation,
	cust_nation,
	l_year,
	sum(volume) as revenue
from
	(
		select
			n1.n_name as supp_nation,
			n2.n_name as cust_nation,
			extract(year from l_shipdate) as l_year,
			l_extendedprice * (1 - l_discount) as volume
		from
			supplier,
			lineitem,
			orders,
			customer,
			nation n1,
			nation n2
		where
			s_suppkey = l_suppkey
			and o_orderkey = l_orderkey
			and c_custkey = o_custkey
			and s_nationkey = n1.n_nationkey
			and c_nationkey = n2.n_nationkey
			and l_shipdate between date '1995-01-01' and date '1996-12-31'
	) as shipping
group by
	supp_nation,
	cust_nation,
	l_year
order by
	supp_nation,
	cust_nation,
	l_year;
```    

In [None]:
# TODO: write your Pyspark DataFrame here...

In [None]:
# Query 3: shipping volume between supplier nation and customer nation per year.
# NOTE(review): column names follow the prefixed TPC-H convention
# (s_/l_/o_/c_/n_) used by the other queries in this notebook - confirm
# against the headers of the .dsv files.

# The nation table is joined twice, so each copy needs its own alias and the
# join conditions must reference the aliased columns to stay unambiguous.
n1 = df_nation.alias("n1")
n2 = df_nation.alias("n2")

# Inner sub-query ("shipping"): one row per line item shipped in 1995-1996.
df = (
    df_supplier
    .join(df_lineitem, sf.col("s_suppkey") == sf.col("l_suppkey"))
    .join(df_order, sf.col("o_orderkey") == sf.col("l_orderkey"))
    .join(df_customer, sf.col("c_custkey") == sf.col("o_custkey"))
    .join(n1, sf.col("s_nationkey") == sf.col("n1.n_nationkey"))
    .join(n2, sf.col("c_nationkey") == sf.col("n2.n_nationkey"))
    .filter(
        sf.col("l_shipdate").between(
            sf.to_date(sf.lit("1995-01-01")),
            sf.to_date(sf.lit("1996-12-31")),
        )
    )
    .select(
        sf.col("n1.n_name").alias("supp_nation"),
        sf.col("n2.n_name").alias("cust_nation"),
        sf.year(sf.col("l_shipdate")).alias("l_year"),
        (sf.col("l_extendedprice") * (1 - sf.col("l_discount"))).alias("volume"),
    )
)

# Outer query: total revenue per (supplier nation, customer nation, year).
df_revenue = (
    df.groupBy("supp_nation", "cust_nation", "l_year")
    .agg(sf.sum("volume").alias("revenue"))
    .orderBy("supp_nation", "cust_nation", "l_year")
)

df_revenue.show()