In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F


In [4]:
spark = SparkSession\
        .builder\
        .config("spark.driver.memory", "10g")\
        .appName("BucketingBy")\
        .master("local[*]")\
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 23:16:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
sc = spark.sparkContext
sc.setLogLevel("ERROR")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [6]:
orders_file = "/Users/sateeshreddypatlolla/Downloads/SQL/CSV/Mini_Project_2/Order_data.csv"
df_orders = spark.read.csv(orders_file, header=True, inferSchema=True)


[Stage 1:>                                                          (0 + 4) / 4]

                                                                                

In [7]:
df_orders.show(5, False)
df_orders.printSchema()

+-------+----------+---------+----------+---------------+
|OrderID|CustomerID|ProductID|OrderDate |QuantityOrdered|
+-------+----------+---------+----------+---------------+
|1      |1         |68       |2012-08-14|1              |
|2      |1         |22       |2012-08-14|6              |
|3      |1         |66       |2012-08-14|4              |
|4      |1         |42       |2012-08-15|6              |
|5      |1         |53       |2012-08-15|2              |
+-------+----------+---------+----------+---------------+
only showing top 5 rows

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- QuantityOrdered: integer (nullable = true)



In [8]:
products_file = "/Users/sateeshreddypatlolla/Downloads/SQL/CSV/Mini_Project_2/Product_data.csv"
df_products = spark.read.csv(products_file, header=True, inferSchema=True)

In [9]:
df_products.show(5, False)
df_products.printSchema()

+---------+-----------------+----------------+-----------------+
|ProductID|ProductName      |ProductUnitPrice|ProductCategoryID|
+---------+-----------------+----------------+-----------------+
|1        |Alice Mutton     |39.0            |6                |
|2        |Aniseed Syrup    |10.0            |2                |
|3        |Boston Crab Meat |18.4            |8                |
|4        |Camembert Pierrot|34.0            |4                |
|5        |Carnarvon Tigers |62.5            |8                |
+---------+-----------------+----------------+-----------------+
only showing top 5 rows

root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- ProductUnitPrice: double (nullable = true)
 |-- ProductCategoryID: integer (nullable = true)



In [10]:
df_products.select("ProductID").distinct().count()

77

In [11]:
df_orders.select("OrderID").distinct().count()

621806

In [12]:
df_orders_product_details = (
    df_orders.join(
        df_products,
        on="ProductID",
        how="inner"
    )
)

In [13]:
df_orders_product_details.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [ProductID#19, OrderID#17, CustomerID#18, OrderDate#20, QuantityOrdered#21, ProductName#72, ProductUnitPrice#73, ProductCategoryID#74]
   +- SortMergeJoin [ProductID#19], [ProductID#71], Inner
      :- Sort [ProductID#19 ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(ProductID#19, 200), ENSURE_REQUIREMENTS, [plan_id=259]
      :     +- Filter isnotnull(ProductID#19)
      :        +- FileScan csv [OrderID#17,CustomerID#18,ProductID#19,OrderDate#20,QuantityOrdered#21] Batched: false, DataFilters: [isnotnull(ProductID#19)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/sateeshreddypatlolla/Downloads/SQL/CSV/Mini_Project_2/Orde..., PartitionFilters: [], PushedFilters: [IsNotNull(ProductID)], ReadSchema: struct<OrderID:int,CustomerID:int,ProductID:int,OrderDate:date,QuantityOrdered:int>
      +- Sort [ProductID#71 ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(ProductID#71

In [14]:
df_orders_product_details.count()

621806

In [20]:
(
    df_orders
    .write.bucketBy(25, col="ProductID")
    .mode("overwrite")
    .saveAsTable("orders_bucketed")
)

                                                                                

In [21]:
(
    df_products
    .write.bucketBy(10, "ProductID")
    .mode("overwrite")
    .saveAsTable("products_bucketed")
)

In [22]:
df_orders_bucketed = spark.table("orders_bucketed")
df_products_bucketed = spark.table("products_bucketed")

In [24]:
df_orders_product_details_bucketed = (
    df_orders_bucketed.join(
        df_products_bucketed,
        on="ProductID",
        how="inner"
    )
)

In [25]:
df_orders_product_details_bucketed.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [ProductID#206, OrderID#204, CustomerID#205, OrderDate#207, QuantityOrdered#208, ProductName#215, ProductUnitPrice#216, ProductCategoryID#217]
   +- SortMergeJoin [ProductID#206], [ProductID#214], Inner
      :- Sort [ProductID#206 ASC NULLS FIRST], false, 0
      :  +- Filter isnotnull(ProductID#206)
      :     +- FileScan parquet spark_catalog.default.orders_bucketed[OrderID#204,CustomerID#205,ProductID#206,OrderDate#207,QuantityOrdered#208] Batched: true, Bucketed: true, DataFilters: [isnotnull(ProductID#206)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/sateeshreddypatlolla/Downloads/SparkCourse/spark-warehouse..., PartitionFilters: [], PushedFilters: [IsNotNull(ProductID)], ReadSchema: struct<OrderID:int,CustomerID:int,ProductID:int,OrderDate:date,QuantityOrdered:int>, SelectedBucketsCount: 25 out of 25
      +- Sort [ProductID#214 ASC NULLS FIRST], false, 0
         +- Exchange hashpartition

In [26]:
df_orders_product_details_bucketed.count()

621806