In [1]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark import SparkConf
from pyspark.storagelevel import StorageLevel

In [2]:
spark = SparkSession.builder\
        .appName('spark-joins')\
        .config("spark.sql.autoBroadcastJoinThreshold","-1")\
        .config("spark.dynamicAllocation.enabled",'true')\
        .config("spark.executor.memory",'2g')\
        .config("spark.executor.cores",'2')\
        .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/06 07:02:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
states = {0: "AZ", 1: "CO", 2: "CA", 3: "TX", 4: "NY", 5: "MI"}
items = {0: "SKU-0", 1: "SKU-1", 2: "SKU-2", 3: "SKU-3", 4: "SKU-4", 5: "SKU-5"}
np.random.seed(42)

In [4]:
users_data = [(id, f"user_{id}", f"user_{id}@databricks.com",
                   states[np.random.randint(0, 5)]) for id in range(100001)]
orders_data = [(r, r, np.random.randint(0, 100000), 10 * r * 0.2, 
                    states[np.random.randint(0, 5)], items[np.random.randint(0, 5)]) for r in range(100001)]


In [5]:
orders_data[:4]

[(0, 0, 75885, 0.0, 'AZ', 'SKU-2'),
 (1, 1, 69179, 2.0, 'TX', 'SKU-2'),
 (2, 2, 25773, 4.0, 'AZ', 'SKU-3'),
 (3, 3, 12681, 6.0, 'CA', 'SKU-3')]

In [6]:
users_data[:4]

[(0, 'user_0', 'user_0@databricks.com', 'TX'),
 (1, 'user_1', 'user_1@databricks.com', 'NY'),
 (2, 'user_2', 'user_2@databricks.com', 'CA'),
 (3, 'user_3', 'user_3@databricks.com', 'NY')]

In [7]:
schema_1=" uid int,login string,email string,user_state string   "
schema_2 = "transaction_id int,quantity int,user_id int,amount float,state string,items string"

In [8]:
usersDf = spark.createDataFrame(users_data,schema_1)

In [9]:
usersDf.show(3,False)

24/06/06 07:02:42 WARN TaskSetManager: Stage 0 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+---+------+---------------------+----------+
|uid|login |email                |user_state|
+---+------+---------------------+----------+
|0  |user_0|user_0@databricks.com|TX        |
|1  |user_1|user_1@databricks.com|NY        |
|2  |user_2|user_2@databricks.com|CA        |
+---+------+---------------------+----------+
only showing top 3 rows



In [10]:
usersDf.persist(StorageLevel.DISK_ONLY)

DataFrame[uid: int, login: string, email: string, user_state: string]

In [11]:
ordersDf = spark.createDataFrame(orders_data,schema_2)

In [12]:
ordersDf.show(2,False)

+--------------+--------+-------+------+-----+-----+
|transaction_id|quantity|user_id|amount|state|items|
+--------------+--------+-------+------+-----+-----+
|0             |0       |75885  |0.0   |AZ   |SKU-2|
|1             |1       |69179  |2.0   |TX   |SKU-2|
+--------------+--------+-------+------+-----+-----+
only showing top 2 rows



In [13]:
spark.conf.set('spark.sql.join.preferSortMergeJoin','true')

In [15]:
ordersDf.persist(StorageLevel.DISK_ONLY)

DataFrame[transaction_id: int, quantity: int, user_id: int, amount: float, state: string, items: string]

In [17]:
ordersDf.count()

                                                                                

100001

In [18]:
usersDf.count()

24/06/06 07:26:01 WARN TaskSetManager: Stage 6 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.
24/06/06 07:26:02 WARN TaskSetManager: Stage 7 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

100001

In [19]:
usersDf.select(count(col('uid'))).show()

24/06/06 07:27:29 WARN TaskSetManager: Stage 10 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.


+----------+
|count(uid)|
+----------+
|    100001|
+----------+



In [21]:
ordersDf.orderBy('user_id').show(3)

+--------------+--------+-------+--------+-----+-----+
|transaction_id|quantity|user_id|  amount|state|items|
+--------------+--------+-------+--------+-----+-----+
|         58100|   58100|      0|116200.0|   CA|SKU-1|
|         19229|   19229|      1| 38458.0|   NY|SKU-1|
|         81594|   81594|      1|163188.0|   TX|SKU-1|
+--------------+--------+-------+--------+-----+-----+
only showing top 3 rows



In [22]:
user_orderDf = usersDf.join(ordersDf,usersDf.uid==ordersDf.user_id)

In [23]:
usersDf.show(2)

+---+------+--------------------+----------+
|uid| login|               email|user_state|
+---+------+--------------------+----------+
|  0|user_0|user_0@databricks...|        TX|
|  1|user_1|user_1@databricks...|        NY|
+---+------+--------------------+----------+
only showing top 2 rows



24/06/06 07:36:03 WARN TaskSetManager: Stage 14 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.


In [24]:
user_orderDf.count()

24/06/06 07:36:23 WARN TaskSetManager: Stage 15 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

100001

In [26]:
user_orderDf.show(2,False)

24/06/06 07:36:59 WARN TaskSetManager: Stage 24 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.
[Stage 25:>                                                         (0 + 4) / 4]

+---+------+---------------------+----------+--------------+--------+-------+--------+-----+-----+
|uid|login |email                |user_state|transaction_id|quantity|user_id|amount  |state|items|
+---+------+---------------------+----------+--------------+--------+-------+--------+-----+-----+
|1  |user_1|user_1@databricks.com|NY        |19229         |19229   |1      |38458.0 |NY   |SKU-1|
|1  |user_1|user_1@databricks.com|NY        |81594         |81594   |1      |163188.0|TX   |SKU-1|
+---+------+---------------------+----------+--------------+--------+-------+--------+-----+-----+
only showing top 2 rows



                                                                                

In [27]:
user_orderDf.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [uid#0], [user_id#47], Inner
   :- Sort [uid#0 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(uid#0, 200), ENSURE_REQUIREMENTS, [plan_id=671]
   :     +- Filter isnotnull(uid#0)
   :        +- InMemoryTableScan [uid#0, login#1, email#2, user_state#3], [isnotnull(uid#0)]
   :              +- InMemoryRelation [uid#0, login#1, email#2, user_state#3], StorageLevel(disk, 1 replicas)
   :                    +- *(1) Scan ExistingRDD[uid#0,login#1,email#2,user_state#3]
   +- Sort [user_id#47 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(user_id#47, 200), ENSURE_REQUIREMENTS, [plan_id=672]
         +- Filter isnotnull(user_id#47)
            +- InMemoryTableScan [transaction_id#45, quantity#46, user_id#47, amount#48, state#49, items#50], [isnotnull(user_id#47)]
                  +- InMemoryRelation [transaction_id#45, quantity#46, user_id#47, amount#48, state#49, items#50], StorageLevel(disk,

In [28]:
ordersDf.orderBy('user_id').write\
        .format('parquet')\
        .bucketBy(8,'user_id')\
        .saveAsTable('order_tbl',mode='overwrite')

                                                                                

In [30]:
usersDf.orderBy('uid').write\
        .format('parquet')\
        .bucketBy(8,'uid')\
        .saveAsTable('user_tbl',mode='overwrite')

24/06/06 08:15:16 WARN TaskSetManager: Stage 33 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.
24/06/06 08:15:16 WARN TaskSetManager: Stage 34 contains a task of very large size (1162 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [31]:
spark.sql("CACHE TABLE User_Tbl")
spark.sql("CACHE TABLE Order_Tbl")

                                                                                

DataFrame[]

In [32]:
orders_df = spark.table('order_tbl')
users_df = spark.table('user_tbl')

In [33]:
joinur = orders_df.join(users_df,orders_df.user_id==users_df.uid)

In [34]:
joinur.show(3,False)

+--------------+--------+-------+--------+-----+-----+---+-------+----------------------+----------+
|transaction_id|quantity|user_id|amount  |state|items|uid|login  |email                 |user_state|
+--------------+--------+-------+--------+-----+-----+---+-------+----------------------+----------+
|57833         |57833   |13     |115666.0|NY   |SKU-2|13 |user_13|user_13@databricks.com|CO        |
|33546         |33546   |38     |67092.0 |TX   |SKU-3|38 |user_38|user_38@databricks.com|AZ        |
|81612         |81612   |70     |163224.0|AZ   |SKU-4|70 |user_70|user_70@databricks.com|AZ        |
+--------------+--------+-------+--------+-----+-----+---+-------+----------------------+----------+
only showing top 3 rows



In [35]:
joinur.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [user_id#1860], [uid#1721], Inner
   :- Sort [user_id#1860 ASC NULLS FIRST], false, 0
   :  +- Filter isnotnull(user_id#1860)
   :     +- Scan In-memory table Order_Tbl [transaction_id#1858, quantity#1859, user_id#1860, amount#1861, state#1862, items#1863], [isnotnull(user_id#1860)]
   :           +- InMemoryRelation [transaction_id#1858, quantity#1859, user_id#1860, amount#1861, state#1862, items#1863], StorageLevel(disk, memory, deserialized, 1 replicas)
   :                 +- *(1) ColumnarToRow
   :                    +- FileScan parquet spark_catalog.default.order_tbl[transaction_id#1858,quantity#1859,user_id#1860,amount#1861,state#1862,items#1863] Batched: true, Bucketed: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/xwyang/spark-warehouse/order_tbl], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<transaction_id:int,quantity:int,user_id:int,amount:float

In [36]:
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|order_tbl|      false|
|  default| user_tbl|      false|
+---------+---------+-----------+



In [37]:
spark.sql('describe table user_tbl')

DataFrame[col_name: string, data_type: string, comment: string]

In [42]:
spark.sql(""" create or replace temp view 
fly_tbl using csv options(path '/Users/xwyang/Desktop/data/flight_delays.csv')  """ )

DataFrame[]

In [41]:
dff = spark.read.csv('/Users/xwyang/Desktop/data/flight_delays.csv')

In [43]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|order_tbl|      false|
|  default| user_tbl|      false|
|         |  fly_tbl|       true|
+---------+---------+-----------+

