# 03 SQL Hints

In [0]:
from pyspark.sql.functions import *
# Switch Adaptive Query Execution off for this session.
# NOTE(review): presumably done so that the join plans and shuffle partition
# counts discussed below are not rewritten at runtime by AQE — confirm.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
# Transactions: plays the role of the big side of the join.
df_transactions = spark.createDataFrame(
    [
        (1, "US", 100),
        (2, "IN", 200),
        (3, "UK", 150),
        (4, "US", 80),
    ],
    schema=["id", "country_code", "amount"],
)

# Countries: the small lookup side, keyed by country_code.
df_countries = spark.createDataFrame(
    [
        ("US", "United States"),
        ("IN", "India"),
        ("UK", "United Kingdom"),
    ],
    schema=["country_code", "country_name"],
)

In [0]:
# Register both DataFrames as temporary views so they can be queried by name
# from spark.sql(...).
for view_name, frame in (
    ("transactions", df_transactions),
    ("countries", df_countries),
):
    frame.createOrReplaceTempView(view_name)

Let us first perform the usual join, without any hint.

In [0]:
# Baseline: inner join of the two temp views on country_code, with no hint.
usual_join_sql = """SELECT 
        *
        FROM transactions t
        INNER JOIN 
        countries c ON t.country_code = c.country_code"""

spark.sql(usual_join_sql).display()

id,country_code,amount,country_code.1,country_name
1,US,100,US,United States
4,US,80,US,United States
2,IN,200,IN,India
3,UK,150,UK,United Kingdom


Again we see the 200 shuffle partitions caused by the join (200 is the default value of `spark.sql.shuffle.partitions`). How do we perform a broadcast join using SQL?


## SQL hint to broadcast

A hint is only a suggestion: Spark still evaluates whether or not to apply the broadcast.

In [0]:
# Same join as before, but asking Spark to broadcast the `countries` side.
# A SQL hint must open with `/*+` — without the plus sign the text is an
# ordinary block comment and the optimizer never sees it.
# The hint names `c` because that is the alias the query gives to `countries`.
df_sql_opt = spark.sql(
    '''SELECT /*+ BROADCAST(c) */ 
        * 
FROM transactions t 
JOIN countries c 
ON t.country_code = c.country_code''')

In [0]:
# Render the rows returned by the query stored in df_sql_opt.
df_sql_opt.display()

id,country_code,amount,country_code.1,country_name
1,US,100,US,United States
4,US,80,US,United States
2,IN,200,IN,India
3,UK,150,UK,United Kingdom
