# Init SparkContext

In [2]:
import os
from datetime import datetime
from pyspark import SparkContext, HiveContext
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as sf

In [3]:
spark = (SparkSession.builder.appName("tpch-benchmark-{}".format(datetime.today()))
        .master("spark://spark-master:7077")
        .enableHiveSupport()
        .getOrCreate())

sqlContext = SQLContext(spark)



# Load and create Catalog table

In [4]:
ls_files = [
    "h_customer.dsv",
    "h_lineitem.dsv",
    "h_nation.dsv",
    "h_order.dsv",
    "h_part.dsv",
    "h_partsupp.dsv",
    "h_region.csv",
    "h_supplier.dsv",
]

In [None]:
# base_path = "s3a://warehouse/tpch_data"

# for file in ls_files:
#     file_name, ext = file.split(".")
#     prefix, table_name = file_name.split("_")
#     print("Loading", table_name)
    
#     delimeter = "," if ext == "csv" else "|"
#     df_data = (
#         spark.read.option("delimiter", delimeter)
#         .option("header", True)
#         .option("inferSchema" , True)
#         .csv(os.path.join(base_path, file))
#     )
    
#     # date convert
#     for col in df_data.columns:
#         if "date" in col.lower():
#             df_data = df_data.withColumn(col, sf.date_sub(sf.to_date(col, "dd.MM.yy"), 365 * 100))
    
#     # print info
#     df_data.printSchema()
#     display(df_data.limit(5).toPandas())
    
#     # register temporary view
#     df_data.createOrReplaceTempView(table_name)

In [16]:
TEMPLATE_DDL_EXT_PARQUET = \
"""
    CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}.{table_name} ({columns_spec})
    STORED AS PARQUET
    LOCATION '{table_location}'
"""

def get_column_spec(field):
    return f"{field.name} {field.dataType.simpleString()}"


base_path = "s3a://warehouse/tpch_data"

for file in ls_files:
    file_name, ext = file.split(".")
    prefix, table_name = file_name.split("_")
    print("Loading", table_name)
    
    delimeter = "," if ext == "csv" else "|"
    df_data = (
        spark.read.option("delimiter", delimeter)
        .option("header", True)
        .option("inferSchema" , True)
        .csv(os.path.join(base_path, file))
    )
    
    # date convert
    for col in df_data.columns:
        if "date" in col.lower():
            df_data = df_data.withColumn(col, sf.date_sub(sf.to_date(col, "dd.MM.yy"), 365 * 100))
    
    table_name = file.split(".")[0]
    columns_spec = ','.join([get_column_spec(f) for f in df_data.schema])
    table_location = os.path.join("s3a://warehouse/", "parquet", file)
    df_data.repartition(10).write.parquet(table_location, mode="overwrite")

    #
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    ddl = TEMPLATE_DDL_EXT_PARQUET.format(
        db_name = "default",
        table_name = table_name,
        columns_spec = columns_spec,
        table_location = table_location
    )
    
    #
    spark.sql(ddl)


Loading customer
Loading lineitem
Loading nation
Loading order
Loading part
Loading partsupp
Loading region
Loading supplier


In [5]:
spark.sql("SHOW TABLES").show()

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default|h_customer|      false|
|  default|h_lineitem|      false|
|  default|  h_nation|      false|
|  default|   h_order|      false|
|  default|    h_part|      false|
|  default|h_partsupp|      false|
|  default|  h_region|      false|
|  default|h_supplier|      false|
+---------+----------+-----------+



In [6]:
spark.sql("SELECT * FROM h_lineitem LIMIT 10").toPandas()

Unnamed: 0,L_ORDERKEY,L_PARTKEY,L_SUPPKEY,L_LINENUMBER,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT,L_TAX,L_RETURNFLAG,L_LINESTATUS,L_SHIPDATE,L_COMMITDATE,L_RECEIPTDATE,L_SHIPINSTRUCT,L_SHIPMODE,L_COMMENT
0,2863079,184387,9397,4,12,1765656,1,8,A,F,1992-05-22,1992-05-30,1992-06-20,NONE,FOB,ironic requests thrash fluf
1,11287590,19613,4614,2,32,4904352,0,3,A,F,1992-05-18,1992-03-20,1992-06-07,NONE,AIR,carefully silent foxes cajo
2,5499459,386009,6010,3,19,2080481,5,5,A,F,1992-07-02,1992-07-27,1992-07-13,TAKE BACK RETURN,FOB,packages about sleep furiou
3,6283910,214559,14560,2,19,2799726,9,3,A,F,1992-07-03,1992-05-29,1992-07-23,NONE,TRUCK,final requests haggle caref
4,8733858,168897,8898,1,40,786356,6,3,A,F,1992-07-19,1992-06-06,1992-08-04,DELIVER IN PERSON,SHIP,carefully even ideas affix
5,4850945,319185,4201,1,28,3371676,1,5,A,F,1992-06-25,1992-05-04,1992-07-24,DELIVER IN PERSON,FOB,"final, fluffy deposits hagg"
6,7341382,266304,1344,1,14,1778406,5,6,A,F,1992-04-30,1992-06-04,1992-05-18,NONE,AIR,blithely regular instructio
7,3124741,149444,4466,1,34,5077696,3,8,R,F,1992-07-21,1992-06-21,1992-08-16,TAKE BACK RETURN,REG AIR,ironically express realms n
8,3807427,251149,16162,3,12,1320156,1,6,R,F,1992-08-06,1992-10-14,1992-08-23,COLLECT COD,AIR,pending accounts according
9,465445,163058,3059,1,40,44842,6,2,R,F,1992-06-25,1992-05-14,1992-07-10,DELIVER IN PERSON,SHIP,silent requests affix quick


In [8]:
spark.sql("""
    SELECT 
        L_ORDERKEY
        , L_LINENUMBER
        , L_QUANTITY
        , L_SHIPMODE
        , L_COMMENT 
    FROM h_lineitem 
    LIMIT 10
""").show()

+----------+------------+----------+----------+--------------------+
|L_ORDERKEY|L_LINENUMBER|L_QUANTITY|L_SHIPMODE|           L_COMMENT|
+----------+------------+----------+----------+--------------------+
|     20036|           3|        13|RAIL      |blithely even the...|
|    106913|           2|        36|SHIP      |bold requests abo...|
|   8885987|           2|        11|MAIL      |final, bold depos...|
|   7493224|           2|        46|TRUCK     |quickly ironic ex...|
|   8968487|           3|        39|SHIP      |carefully pending...|
|   8206563|           2|        38|AIR       |fluffily silent a...|
|   9571494|           4|        50|REG AIR   |express requests ...|
|   9095781|           1|         7|TRUCK     |fluffily careful ...|
|   8077670|           5|         3|REG AIR   |busily regular sh...|
|   5041671|           2|         2|RAIL      |packages about ha...|
+----------+------------+----------+----------+--------------------+



# Benchmark

## 1. Group by, Order by

In [None]:
%%timeit -r 4

sql_stm = """
-- 1
select
	l_returnflag,
	l_linestatus,
	sum(l_quantity) as sum_qty,
	sum(l_extendedprice) as sum_base_price,
	sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
	sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
	avg(l_quantity) as avg_qty,
	avg(l_extendedprice) as avg_price,
	avg(l_discount) as avg_disc,
	count(*) as count_order
from
	lineitem
group by
	l_returnflag,
	l_linestatus
order by
	l_returnflag,
	l_linestatus;
"""
spark.sql(sql_stm).limit(20).toPandas()

## Compressed and repartition files

In [None]:
spark.sql("select * from lineitem").repartition(6).write.parquet("s3a://warehouse/outputs/lineitem")

In [None]:
# register temporary view
df_data = spark.read.parquet("s3a://warehouse/outputs/lineitem")
df_data.createOrReplaceTempView("lineitem_parts")

In [None]:
%%timeit -r 4

sql_stm = """
-- 1
select
	l_returnflag,
	l_linestatus,
	sum(l_quantity) as sum_qty,
	sum(l_extendedprice) as sum_base_price,
	sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
	sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
	avg(l_quantity) as avg_qty,
	avg(l_extendedprice) as avg_price,
	avg(l_discount) as avg_disc,
	count(*) as count_order
from
	lineitem_parts
group by
	l_returnflag,
	l_linestatus
order by
	l_returnflag,
	l_linestatus;
"""
spark.sql(sql_stm).limit(20).toPandas()

## 2. Join, Group By, Order By

In [None]:
%%timeit -r 4

sql_stm = """
--3
select
	l_orderkey,
	sum(l_extendedprice * (1 - l_discount)) as revenue,
	o_orderdate,
	o_shippriority
from
	customer,
	order,
	lineitem
where
	c_mktsegment = 'AUTOMOBILE'
	and c_custkey = o_custkey
	and l_orderkey = o_orderkey
group by
	l_orderkey,
	o_orderdate,
	o_shippriority
order by
	revenue desc,
	o_orderdate
"""
spark.sql(sql_stm).show()

## 3. Sub queries, Join, Group by, Order by

In [None]:
%%timeit -r 4

sql_stm = """
select
	supp_nation,
	cust_nation,
	l_year,
	sum(volume) as revenue
from
	(
		select
			n1.n_name as supp_nation,
			n2.n_name as cust_nation,
			extract(year from l_shipdate) as l_year,
			l_extendedprice * (1 - l_discount) as volume
		from
			supplier,
			lineitem,
			order,
			customer,
			nation n1,
			nation n2
		where
			s_suppkey = l_suppkey
			and o_orderkey = l_orderkey
			and c_custkey = o_custkey
			and s_nationkey = n1.n_nationkey
			and c_nationkey = n2.n_nationkey
			and l_shipdate between date '1995-01-01' and date '1996-12-31'
	) as shipping
group by
	supp_nation,
	cust_nation,
	l_year
order by
	supp_nation,
	cust_nation,
	l_year;
"""
spark.sql(sql_stm).show()