In [2]:
# Spark Session
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the SparkSession used by every cell below.
# Hive support is switched on, which is why the catalog-implementation check
# later in this notebook reports 'hive'.
# NOTE(review): the warehouse dir is an absolute path while the CSV inputs are
# loaded from relative paths -- confirm that this is intended.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark SQL")
    .config("spark.sql.warehouse.dir", "/data/output/spark-warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/26 21:37:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read Employee data
# The schema is given explicitly as a DDL string, and the first line of the
# CSV file is treated as a header row.
_schema = (
    "first_name string, last_name string, job_title string, "
    "dob string, email string, phone string, "
    "salary double, department_id int"
)

emp = (
    spark.read
    .format('csv')
    .schema(_schema)
    .option("header", True)
    .load("data/input/employee_records.csv")
)

# Preview the first five employee rows.
emp.show(n=5)

+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+
|first_name|last_name|           job_title|       dob|               email|             phone|  salary|department_id|
+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+
|   Richard| Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|512653.0|            8|
|     Bobby| Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|(750)846-1602x7458|999836.0|            7|
|    Dennis|   Norman|Land/geomatics su...|1990-06-24| jturner@example.net|  873.820.0518x825|131900.0|           10|
|      John|   Monroe|        Retail buyer|1968-06-16|  erik33@example.net|  820-813-0557x624|485506.0|            1|
|  Michelle|  Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...|     (705)900-5337|604738.0|            8|
+----------+---------+--------------------+----------+--

In [4]:
# Read DEPT CSV data
# NOTE(review): department_id is declared as string here but as int in the
# employee schema; the joins on department_id later in the notebook therefore
# depend on an implicit cast -- confirm whether this should be int as well.
_dept_schema = (
    "department_id string, department_name string, description string, "
    "city string, state string, country string"
)

dept = (
    spark.read
    .format("csv")
    .schema(_dept_schema)
    .option("header", True)
    .load("data/input/department_data.csv")
)

# Preview the first five department rows.
dept.show(n=5)

+-------------+--------------------+--------------------+-------------+-----+-------------------+
|department_id|     department_name|         description|         city|state|            country|
+-------------+--------------------+--------------------+-------------+-----+-------------------+
|            1|         Bryan-James|Optimized disinte...| Melissaburgh|   FM|Trinidad and Tobago|
|            2|Smith, Craig and ...|Digitized empower...|   Morrisside|   DE|          Sri Lanka|
|            3|Pittman, Hess and...|Multi-channeled c...|  North David|   SC|       Turkmenistan|
|            4|Smith, Snyder and...|Reactive neutral ...|Lake Jennifer|   TX|         Madagascar|
|            5|          Hardin Inc|Re-contextualized...|    Hayestown|   WA|               Fiji|
+-------------+--------------------+--------------------+-------------+-----+-------------------+
only showing top 5 rows



In [5]:
# Spark Catalog (Metadata) - in-memory/hive
# The session was built with .enableHiveSupport(); without that call this
# setting would come back as 'in-memory'.
catalog_implementation = spark.conf.get("spark.sql.catalogImplementation")
catalog_implementation

'hive'

In [6]:
# Show databases (namespaces) known to the catalog
db = spark.sql("show databases")
db.show()

# Show the tables and views currently visible in the `default` database
default_tables = spark.sql("show tables in default")
default_tables.show()

25/01/26 21:37:25 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/01/26 21:37:25 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/01/26 21:37:32 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/01/26 21:37:32 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore vrushabh.deokar@192.168.1.34


+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



25/01/26 21:37:32 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


In [7]:
# Register the dataframes as temp views so they can be queried with SQL
emp.createOrReplaceTempView("emp_view")
dept.createOrReplaceTempView("dept_view")

# The listing now includes both views, flagged isTemporary = true
tables_after_views = spark.sql("show tables in default")
tables_after_views.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |dept_view|       true|
|         | emp_view|       true|
+---------+---------+-----------+



In [7]:
# Show tables/view in catalog

In [8]:
# View data from the emp_view temp view
# (despite the variable name, the query applies no filter)
emp_filtered = spark.sql("select * from emp_view")
emp_filtered.show(n=5)

+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+
|first_name|last_name|           job_title|       dob|               email|             phone|  salary|department_id|
+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+
|   Richard| Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|512653.0|            8|
|     Bobby| Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|(750)846-1602x7458|999836.0|            7|
|    Dennis|   Norman|Land/geomatics su...|1990-06-24| jturner@example.net|  873.820.0518x825|131900.0|           10|
|      John|   Monroe|        Retail buyer|1968-06-16|  erik33@example.net|  820-813-0557x624|485506.0|            1|
|  Michelle|  Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...|     (705)900-5337|604738.0|            8|
+----------+---------+--------------------+----------+--

In [9]:
# View data from the dept_view temp view
# (despite the variable name, the query applies no filter)
dept_filtered = spark.sql("select * from dept_view")
dept_filtered.show(n=5)

+-------------+--------------------+--------------------+-------------+-----+-------------------+
|department_id|     department_name|         description|         city|state|            country|
+-------------+--------------------+--------------------+-------------+-----+-------------------+
|            1|         Bryan-James|Optimized disinte...| Melissaburgh|   FM|Trinidad and Tobago|
|            2|Smith, Craig and ...|Digitized empower...|   Morrisside|   DE|          Sri Lanka|
|            3|Pittman, Hess and...|Multi-channeled c...|  North David|   SC|       Turkmenistan|
|            4|Smith, Snyder and...|Reactive neutral ...|Lake Jennifer|   TX|         Madagascar|
|            5|          Hardin Inc|Re-contextualized...|    Hayestown|   WA|               Fiji|
+-------------+--------------------+--------------------+-------------+-----+-------------------+
only showing top 5 rows



In [10]:
# Keep only department 1, add a dob_year column (the year part of dob),
# and register the result as the temp view emp_temp_view
emp_temp = spark.sql(
    "select e.*, date_format(dob, 'yyyy') as dob_year "
    "from emp_view e "
    "where department_id = 1"
)
emp_temp.createOrReplaceTempView("emp_temp_view")

In [11]:
# Join emp and dept - HINTs
# Spark can choose a broadcast join by itself when one side is small enough
# (see spark.sql.autoBroadcastJoinThreshold); the BROADCAST(d) hint asks for
# it explicitly, with dept_view (alias d) as the broadcast side.
_broadcast_join_sql = """
    select /*+ BROADCAST(d) */
    e.* , d.department_name
    from emp_view e left outer join dept_view d
    on e.department_id = d.department_id
"""

emp_final = spark.sql(_broadcast_join_sql)
emp_final.show(n=5)

+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+---------------+
|first_name|last_name|           job_title|       dob|               email|             phone|  salary|department_id|department_name|
+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+---------------+
|   Richard| Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|512653.0|            8|     Parker PLC|
|     Bobby| Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|(750)846-1602x7458|999836.0|            7|    Ward-Gordon|
|    Dennis|   Norman|Land/geomatics su...|1990-06-24| jturner@example.net|  873.820.0518x825|131900.0|           10| Delgado-Keller|
|      John|   Monroe|        Retail buyer|1968-06-16|  erik33@example.net|  820-813-0557x624|485506.0|            1|    Bryan-James|
|  Michelle|  Elliott|      Air cabin crew|1975-03-31|tiffanyj

In [12]:
# Join emp and dept - HINTs
# Same left outer join as the broadcast example, but the SHUFFLE_MERGE(e)
# hint asks Spark to use a shuffle sort-merge join instead.
_shuffle_merge_join_sql = """
    select /*+ SHUFFLE_MERGE(e) */
    e.* , d.department_name
    from emp_view e left outer join dept_view d
    on e.department_id = d.department_id
"""

emp_final_1 = spark.sql(_shuffle_merge_join_sql)
emp_final_1.show(n=5)

+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+---------------+
|first_name|last_name|           job_title|       dob|               email|             phone|  salary|department_id|department_name|
+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+---------------+
|      John|   Monroe|        Retail buyer|1968-06-16|  erik33@example.net|  820-813-0557x624|485506.0|            1|    Bryan-James|
|     Jacob|    Stark|         Fine artist|1976-04-25|jasonortiz@exampl...|224-695-9516x02171|358889.0|            1|    Bryan-James|
|      Karl|     Kent|          Geochemist|1966-03-16|yjohnson@example.org|     (375)285-4892|717373.0|            1|    Bryan-James|
|   Phillip|     Sims|Trading standards...|1979-04-03|jasonmarquez@exam...|      697.201.2204|326537.0|            1|    Bryan-James|
|  Benjamin|   Norton| Art gallery manager|1976-12-06| debra86

In [11]:
# Show emp data

In [14]:
# Write the joined data as a Parquet-backed table named emp_final.
# mode("overwrite") keeps this cell re-runnable: with the writer's default
# save mode (error if the table exists) a second run of the notebook fails
# because emp_final is already registered in the catalog.
emp_final.write.format("parquet").mode("overwrite").saveAsTable("emp_final")

In [13]:
# Read the data back from the emp_final table through Spark SQL.
# show() is called without a row limit argument, unlike the show(5) previews above.

emp_new = spark.sql("select * from emp_final")
emp_new.show()

In [14]:
# Persist metadata

In [16]:
# Show column-level metadata (name, data type, comment) for dept_view
dept_view_details = spark.sql("describe extended dept_view")
dept_view_details.show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|  department_id|   string|   NULL|
|department_name|   string|   NULL|
|    description|   string|   NULL|
|           city|   string|   NULL|
|          state|   string|   NULL|
|        country|   string|   NULL|
+---------------+---------+-------+



In [18]:
# Stop the SparkSession now that the notebook is finished; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()