In [0]:
# Spark Session
from pyspark.sql import SparkSession
spark = (
        SparkSession
        .builder
        .appName("Spark SQL")
        .master("local[*]")
        .getOrCreate()
)
spark


What is Catalog? Catalog stores metadata and metadata (Columns, datatype, comments and other metadata related to the tables/views created)

In [0]:
# Spark Catalog (Metadata) - in-memory/hive
spark.conf.get("spark.sql.catalogImplementation") # hive is the default here

# for in-memory we have to mention it explicity like below:
spark.conf.set("spark.sql.catalogImplementation","in-memory")
spark.conf.get("spark.sql.catalogImplementation")


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-544566587118084>:5[0m
[1;32m      1[0m [38;5;66;03m# Spark Catalog (Metadata) - in-memory/hive[39;00m
[1;32m      2[0m [38;5;66;03m# spark.conf.get("spark.sql.catalogImplementation") # hive is the default here[39;00m
[1;32m      3[0m 
[1;32m      4[0m [38;5;66;03m# for in-memory we have to mention it explicity like below:[39;00m
[0;32m----> 5[0m spark[38;5;241m.[39mconf[38;5;241m.[39mset([38;5;124m"[39m[38;5;124mspark.sql.catalogImplementation[39m[38;5;124m"[39m,[38;5;124m"[39m[38;5;124min-memory[39m[38;5;124m"[39m)
[1;32m      6[0m spark[38;5;241m.[39mconf[38;5;241m.[39mget([38;5;124m"[39m[38;5;124mspark.sql.catalogImplementation[39m[38;5;124m"[39m)

File [0;32m/databricks/spark/python/pyspark/sql/conf.py:40[0m, in [0;36mRuntimeConfig.set[0;34m(

**You see the above error because:**
- Spark configurations, especially those related to catalog implementation, are immutable after SparkSession has been initialized. 
- spark.sql.catalogImplementation is a static configuration, meaning it must be set before SparkSession is created.
- Trying to change it after the SparkSession is initialized (via spark.conf.set()) will have no effect or may throw an error.

In [0]:
spark.conf.get("spark.sql.catalogImplementation")

Out[9]: 'hive'

> IMPORTANT:
- Using '*in-memory catalog implementation*' is not safe because if the Spark Session is lost or restarted, all the previous tables (views will be lost in any session because it is temporary) created will be lost!
- To avoid this, we should use '*hive catalog implementation*'
- Use '*enableHiveSupport()*' while creating Spark session itself, if it is not enabled by default 
- If a hive catalog is implemented, the metastore/datastore is persisted in a particular location.

In [0]:
# Show databases
db = spark.sql("SHOW databases")
db.show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [0]:
spark.sql("Show tables in default").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [0]:
# Read EMP CSV file with 10 million records
emp_schema = "first_name string, last_name string, job_title string, dob date, email string, phone string, salary double, department string, department_id integer"
emp = spark.read.schema(emp_schema).option("header",True).csv("/data/input/datasets/employee_recs.csv")

In [0]:
# Read DEPT CSV file with 10 records
dept_schema ="department_id int, department_name string, description string, city string, state string, country string "
dept = spark.read.schema(dept_schema).option("header",True).csv("/data/input/datasets/department_recs.csv")

In [0]:
# Register dataframes are temp views
emp.createOrReplaceTempView("empview")
dept.createOrReplaceTempView("deptview")


In [0]:
spark.sql("Show tables in default").show()  # Now you will see two views here

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        | deptview|       true|
|        |  empview|       true|
+--------+---------+-----------+



> NOTE:
- '*isTemporary= true'* --> means it is a view not a table
- That also implies that once that particular active session is lost those views will also be lost.

In [0]:
# View data from Table -- triple quotes for multi-line

empdf= spark.sql("""
          SELECT * 
          FROM empview
          WHERE department_id=1
""")

empdf.show(truncate=False, n=5)

+-------------+------------+--------------+----------+-----------------------+------------+---------+----------+-------------+
|first_name   |last_name   |job_title     |dob       |email                  |phone       |salary   |department|department_id|
+-------------+------------+--------------+----------+-----------------------+------------+---------+----------+-------------+
|FirstName_282|LastName_282|Data Analyst  |1996-09-02|user1249282@example.com|+18214812697|71801.39 |Data      |1            |
|FirstName_288|LastName_288|Senior Manager|1985-05-26|user1249288@example.com|+14644924775|86367.71 |Data      |1            |
|FirstName_306|LastName_306|Data Analyst  |1988-03-14|user1249306@example.com|+12338438014|95641.32 |Data      |1            |
|FirstName_346|LastName_346|HR Manager    |1963-11-03|user1249346@example.com|+14531761440|110552.52|Data      |1            |
|FirstName_375|LastName_375|HR Specialist |1976-09-03|user1249375@example.com|+14896803254|48903.01 |Data      

In [0]:
# Create a new column dob_year and register s temp view
emptemp = spark.sql("""
          SELECT e.*, date_format(dob, 'yyyy') AS dob_year
          FROM empview e
          WHERE department_id=1
""")
emptemp.show(n=5)


+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+
|   first_name|   last_name|     job_title|       dob|               email|       phone|   salary|department|department_id|dob_year|
+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+
|FirstName_282|LastName_282|  Data Analyst|1996-09-02|user1249282@examp...|+18214812697| 71801.39|      Data|            1|    1996|
|FirstName_288|LastName_288|Senior Manager|1985-05-26|user1249288@examp...|+14644924775| 86367.71|      Data|            1|    1985|
|FirstName_306|LastName_306|  Data Analyst|1988-03-14|user1249306@examp...|+12338438014| 95641.32|      Data|            1|    1988|
|FirstName_346|LastName_346|    HR Manager|1963-11-03|user1249346@examp...|+14531761440|110552.52|      Data|            1|    1963|
|FirstName_375|LastName_375| HR Specialist|1976-09-03|user1249375@exa

In [0]:
emptemp.createOrReplaceTempView("emptempview")
spark.sql("Show tables in default").show()

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
|        |   deptview|       true|
|        |emptempview|       true|
|        |    empview|       true|
+--------+-----------+-----------+



In [0]:
spark.sql("SELECT * FROM emptempview").show(n=5)

+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+
|   first_name|   last_name|     job_title|       dob|               email|       phone|   salary|department|department_id|dob_year|
+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+
|FirstName_282|LastName_282|  Data Analyst|1996-09-02|user1249282@examp...|+18214812697| 71801.39|      Data|            1|    1996|
|FirstName_288|LastName_288|Senior Manager|1985-05-26|user1249288@examp...|+14644924775| 86367.71|      Data|            1|    1985|
|FirstName_306|LastName_306|  Data Analyst|1988-03-14|user1249306@examp...|+12338438014| 95641.32|      Data|            1|    1988|
|FirstName_346|LastName_346|    HR Manager|1963-11-03|user1249346@examp...|+14531761440|110552.52|      Data|            1|    1963|
|FirstName_375|LastName_375| HR Specialist|1976-09-03|user1249375@exa

In [0]:
# JOIN emp and dept - HINTS
dfjoined = spark.sql("""
          SELECT e.*,d.department_name
          FROM emptempview e LEFT OUTER JOIN deptview d
          ON e.department_id=d.department_id""")
dfjoined.show(n=5)

+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|   first_name|   last_name|     job_title|       dob|               email|       phone|   salary|department|department_id|dob_year|department_name|
+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|FirstName_282|LastName_282|  Data Analyst|1996-09-02|user1249282@examp...|+18214812697| 71801.39|      Data|            1|    1996|           Data|
|FirstName_288|LastName_288|Senior Manager|1985-05-26|user1249288@examp...|+14644924775| 86367.71|      Data|            1|    1985|           Data|
|FirstName_306|LastName_306|  Data Analyst|1988-03-14|user1249306@examp...|+12338438014| 95641.32|      Data|            1|    1988|           Data|
|FirstName_346|LastName_346|    HR Manager|1963-11-03|user1249346@examp...|+14531761440|110552.52|      Da

If you see below, by default it is using Broadcasting(BroadcastHashJoin) on department view(smaller view), because the AQE is taking care of the Spark SQL.

In [0]:
dfjoined.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [first_name#104, last_name#105, job_title#106, dob#107, email#108, phone#109, salary#110, department#111, department_id#112, dob_year#833, department_name#123]
   +- BroadcastHashJoin [department_id#112], [department_id#122], LeftOuter, BuildRight, false, true
      :- Project [first_name#104, last_name#105, job_title#106, dob#107, email#108, phone#109, salary#110, department#111, department_id#112, date_format(cast(dob#107 as timestamp), yyyy, Some(Etc/UTC)) AS dob_year#833]
      :  +- Filter (isnotnull(department_id#112) AND (department_id#112 = 1))
      :     +- FileScan csv [first_name#104,last_name#105,job_title#106,dob#107,email#108,phone#109,salary#110,department#111,department_id#112] Batched: false, DataFilters: [isnotnull(department_id#112), (department_id#112 = 1)], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/data/input/datasets/employee_recs.csv], PartitionFilters: [], PushedFilters: [IsNotNull

> Now consider, if you dont want to use a BROADCAST JOIN but you want to use a SHUFFLE JOIN, then we need to put a HINT! See it below

In [0]:
dfjoined = spark.sql("""
          SELECT /*+ SHUFFLE_MERGE(e) */    -- HINT to use SortMergeJoin(Shuffle Join)
          e.*,d.department_name
          FROM emptempview e LEFT OUTER JOIN deptview d
          ON e.department_id=d.department_id""")
dfjoined.show(n=5)

+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|   first_name|   last_name|     job_title|       dob|               email|       phone|   salary|department|department_id|dob_year|department_name|
+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|FirstName_282|LastName_282|  Data Analyst|1996-09-02|user1249282@examp...|+18214812697| 71801.39|      Data|            1|    1996|           Data|
|FirstName_288|LastName_288|Senior Manager|1985-05-26|user1249288@examp...|+14644924775| 86367.71|      Data|            1|    1985|           Data|
|FirstName_306|LastName_306|  Data Analyst|1988-03-14|user1249306@examp...|+12338438014| 95641.32|      Data|            1|    1988|           Data|
|FirstName_346|LastName_346|    HR Manager|1963-11-03|user1249346@examp...|+14531761440|110552.52|      Da

In [0]:
dfjoined = spark.sql("""
          SELECT /*+ BROADCAST(d) */    -- HINT to use Broadcast, it will use BroadcastHashJoin now
          e.*,d.department_name
          FROM emptempview e LEFT OUTER JOIN deptview d
          ON e.department_id=d.department_id""")
dfjoined.show(n=5)

+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|   first_name|   last_name|     job_title|       dob|               email|       phone|   salary|department|department_id|dob_year|department_name|
+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|FirstName_282|LastName_282|  Data Analyst|1996-09-02|user1249282@examp...|+18214812697| 71801.39|      Data|            1|    1996|           Data|
|FirstName_288|LastName_288|Senior Manager|1985-05-26|user1249288@examp...|+14644924775| 86367.71|      Data|            1|    1985|           Data|
|FirstName_306|LastName_306|  Data Analyst|1988-03-14|user1249306@examp...|+12338438014| 95641.32|      Data|            1|    1988|           Data|
|FirstName_346|LastName_346|    HR Manager|1963-11-03|user1249346@examp...|+14531761440|110552.52|      Da

In [0]:
# Write the Data as table

dfjoined.write.format("parquet").saveAsTable("empfinaltable")

# Check Database Tables --> default --> empfinaltable created there

In [0]:
spark.sql("Show tables in default").show()

+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
| default|empfinaltable|      false|
|        |     deptview|       true|
|        |  emptempview|       true|
|        |      empview|       true|
+--------+-------------+-----------+



By default, for the above table (empfinaltable) created:
- Spark saves tables in Hive warehouse directory, typically at: /user/hive/warehouse/empfinaltable/ (You will find this folder there)
- You can also see that '*isTemporary=false*', which means it is a Tabl now and not a View

In [0]:
# READ the data from the table
empnew = spark.read.table("empfinaltable")
empnew.show(n=5)

+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|   first_name|   last_name|     job_title|       dob|               email|       phone|   salary|department|department_id|dob_year|department_name|
+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|FirstName_778|LastName_778|Senior Manager|1971-12-04|user7499778@examp...|+13461914433|137993.09|      Data|            1|    1971|           Data|
|FirstName_783|LastName_783|     Team Lead|2013-03-01|user7499783@examp...|+14660611852| 89633.42|      Data|            1|    2013|           Data|
|FirstName_787|LastName_787| HR Specialist|1974-01-16|user7499787@examp...|+14538342783| 90809.83|      Data|            1|    1974|           Data|
|FirstName_790|LastName_790|  Data Analyst|1993-02-26|user7499790@examp...|+13024777044| 62901.98|      Da

### OR

In [0]:
empnew = spark.sql("SELECT * FROM empfinaltable")
empnew.show(n=5)

+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|   first_name|   last_name|     job_title|       dob|               email|       phone|   salary|department|department_id|dob_year|department_name|
+-------------+------------+--------------+----------+--------------------+------------+---------+----------+-------------+--------+---------------+
|FirstName_778|LastName_778|Senior Manager|1971-12-04|user7499778@examp...|+13461914433|137993.09|      Data|            1|    1971|           Data|
|FirstName_783|LastName_783|     Team Lead|2013-03-01|user7499783@examp...|+14660611852| 89633.42|      Data|            1|    2013|           Data|
|FirstName_787|LastName_787| HR Specialist|1974-01-16|user7499787@examp...|+14538342783| 90809.83|      Data|            1|    1974|           Data|
|FirstName_790|LastName_790|  Data Analyst|1993-02-26|user7499790@examp...|+13024777044| 62901.98|      Da

In [0]:
# SHOW details of Metadata
spark.sql("DESCRIBE empfinaltable").show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|     first_name|   string|   null|
|      last_name|   string|   null|
|      job_title|   string|   null|
|            dob|     date|   null|
|          email|   string|   null|
|          phone|   string|   null|
|         salary|   double|   null|
|     department|   string|   null|
|  department_id|      int|   null|
|       dob_year|   string|   null|
|department_name|   string|   null|
+---------------+---------+-------+



In [0]:
# See all metadats information using 'extended'
spark.sql("DESCRIBE extended empfinaltable").show(truncate=False, n=50)


+----------------------------+---------------------------------------------------------+-------+
|col_name                    |data_type                                                |comment|
+----------------------------+---------------------------------------------------------+-------+
|first_name                  |string                                                   |null   |
|last_name                   |string                                                   |null   |
|job_title                   |string                                                   |null   |
|dob                         |date                                                     |null   |
|email                       |string                                                   |null   |
|phone                       |string                                                   |null   |
|salary                      |double                                                   |null   |
|department                  |