In [35]:
# Spark Session
from pyspark.sql import SparkSession

# Build the session step by step: app name and local master first, then the
# warehouse location, then Hive support so the catalog is backed by Hive.
# NOTE(review): the warehouse dir is a POSIX-style absolute path while the CSV
# inputs in this notebook are read from a Windows path - confirm it resolves
# to the intended location on the machine running this.
_session_builder = SparkSession.builder.appName("Spark SQL").master("local[*]")
_session_builder = _session_builder.config(
    "spark.sql.warehouse.dir", "/data/output/spark-warehouse"
)
spark = _session_builder.enableHiveSupport().getOrCreate()

# Last expression of the cell: display the session summary.
spark

In [2]:
# Load the employee CSV with an explicit DDL schema (no schema inference).
# NOTE(review): the recorded outputs further down show department_id as null
# on every row. Possible causes are a column-order mismatch between this
# schema and the file, or non-integer values in that column - confirm against
# the CSV.
_schema = (
    "first_name string, last_name string, job_title string, dob string, "
    "email string, phone string, salary double, department_id int"
)

_emp_csv_path = r"C:\Users\vivek\Downloads\pyspark_lecture_tutorial\data\employee_records.csv"

emp = (
    spark.read
    .format("csv")
    .schema(_schema)
    .option("header", True)  # first line of the file is a header row
    .load(_emp_csv_path)
)

In [3]:
# Load the department CSV with an explicit DDL schema (no schema inference).
_dept_schema = (
    "department_id int, department_name string, description string, "
    "city string, state string, country string"
)

_dept_csv_path = r"C:\Users\vivek\Downloads\pyspark_lecture_tutorial\data\department_data.csv"

dept = (
    spark.read
    .format("csv")
    .schema(_dept_schema)
    .option("header", True)  # first line of the file is a header row
    .load(_dept_csv_path)
)

In [4]:
# Which catalog (metadata store) backs this session: in-memory or hive?

catalog_impl = spark.conf.get("spark.sql.catalogImplementation")
catalog_impl

'hive'

In [5]:
# List the databases (namespaces) registered in the catalog
_show_databases_sql = "show databases"
db = spark.sql(_show_databases_sql)
db.show(n=20, truncate=True)

+---------+
|namespace|
+---------+
|  default|
+---------+



In [6]:
# List tables/views in the default database (nothing is registered yet)
tables_in_default = spark.sql("show tables in default")
tables_in_default.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [10]:
# List tables/views in the default database again.
# NOTE(review): this cell carries execution count 10 but sits above the cell
# with count 9 that registers emp_view / dept_view; the recorded output only
# contains the views because it was executed after that cell.
tables_in_default = spark.sql("show tables in default")
tables_in_default.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |dept_view|       true|
|         | emp_view|       true|
+---------+---------+-----------+



In [9]:
# Register the DataFrames as temp views so they can be queried with Spark SQL

emp.createOrReplaceTempView("emp_view")

dept.createOrReplaceTempView("dept_view")

## Show tables/views in catalog
# List the catalog right after registration so the views show up on a plain
# top-to-bottom run of the notebook. In the recorded listing they appear with
# an empty namespace and isTemporary = true.
spark.sql("show tables in default").show()

In [11]:
# Query the employee view with SQL: keep only rows of department 1

_dept_one_query = """
    select * from emp_view
    where department_id = 1
"""
emp_filtered = spark.sql(_dept_one_query)

In [12]:
# Print the filtered rows (show() defaults spelled out: 20 rows, truncated cells).
# NOTE(review): the recorded output is empty, consistent with department_id
# being null for every row in the later outputs.
emp_filtered.show(n=20, truncate=True)

+----------+---------+---------+---+-----+-----+------+-------------+
|first_name|last_name|job_title|dob|email|phone|salary|department_id|
+----------+---------+---------+---+-----+-----+------+-------------+
+----------+---------+---------+---+-----+-----+------+-------------+



In [13]:
# Derive dob_year (the year part of dob, which is read as a string) for every
# employee; the result is registered as a temp view in the next cell

_dob_year_query = """
    select e.*, date_format(dob, 'yyyy') as dob_year from emp_view e
"""
emp_temp = spark.sql(_dob_year_query)

In [14]:
# Expose the DataFrame with dob_year to SQL under the name emp_temp_view
emp_temp.createOrReplaceTempView(name="emp_temp_view")

In [15]:

# Read the new view back through SQL and print the first rows
emp_temp_rows = spark.sql("select * from emp_temp_view")
emp_temp_rows.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|dob_year|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|         null|    1973|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|         null|    1974|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|         null|    1990|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|         null|    1968|
|  Michelle|   Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...|       (705)90

In [16]:
# Join employees to departments using a SQL hint.
# BROADCAST(d) asks Spark to broadcast the department side of the join; the
# left outer join keeps every employee row even when no department matches.

_emp_dept_join_query = """
    select /*+ BROADCAST(d) */
    e.* , d.department_name
    from emp_view e left outer join dept_view d
    on e.department_id = d.department_id
"""
emp_final = spark.sql(_emp_dept_join_query)

In [17]:
# Print the joined employee rows (show() defaults spelled out)

emp_final.show(n=20, truncate=True)

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+---------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|department_name|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+---------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|         null|           null|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|         null|           null|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|         null|           null|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|         null|           null|
|  Michelle|   Elliott|      Air cabin cr

In [19]:
# Write the joined data as a Parquet-backed table named emp_final

# Without an explicit mode the writer errors out when the table already exists,
# so this cell would fail on every re-run once the table has been persisted
# in the Hive catalog. Overwrite keeps the cell re-runnable.
emp_final.write.format("parquet").mode("overwrite").saveAsTable("emp_final")

In [30]:
# Load the persisted emp_final table back through SQL

_emp_final_query = "select * from emp_final"
emp_new = spark.sql(_emp_final_query)

In [None]:
# Print the first rows read back from the table (show() defaults spelled out)
emp_new.show(n=20, truncate=True)

In [None]:
# Persist metadata

In [None]:
# Show details of the table metadata

# DESCRIBE EXTENDED returns one row per column followed by the detailed table
# information rows. The default show() stops at 20 rows and cuts every cell to
# 20 characters, which hides long values, so print more rows and disable
# truncation.
spark.sql("describe extended emp_final").show(n=100, truncate=False)