# **What is SparkSession?**

SparkSession is the unified entry point to Spark functionality. It replaces the older SQLContext and HiveContext as the way to work with DataFrames and SQL, and it wraps the underlying SparkContext (still reachable as `spark.sparkContext`), so a single object is enough for a Spark application.

**Accessing a Hive Table using SparkContext, HiveContext, and SQLContext**

The example code below demonstrates how to access a Hive table using SparkContext, HiveContext, and SQLContext:


In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan,when,count,col,lit
from pyspark.sql.types import *
import pandas as pd
import numpy as np
# Start (or reuse) a local-master Spark session for this notebook
spark = (
    SparkSession.builder
    .master("local")
    .appName("sql_learning")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Remote CSV sources for the three sample datasets
EMPLOYEES_CSV_URL = "https://gist.githubusercontent.com/kevin336/acbb2271e66c10a5b73aacf82ca82784/raw/e38afe62e088394d61ed30884dd50a6826eee0a8/employees.csv"
DEPARTMENTS_CSV_URL = "https://gist.githubusercontent.com/kevin336/5ea0e96813aa88871c20d315b5bf445c/raw/d8fcf5c2630ba12dd8802a2cdd5480621b6a0ea6/departments.csv"
BANKING_CSV_URL = "https://raw.githubusercontent.com/Saswat956/Data/main/banking.csv"

# Read each CSV into a pandas DataFrame
df_employee = pd.read_csv(EMPLOYEES_CSV_URL)
df_department = pd.read_csv(DEPARTMENTS_CSV_URL)
df_banking = pd.read_csv(BANKING_CSV_URL)
def create_hive_table(dataframe,Tablename='default_table'):
  """Register a pandas DataFrame as a Spark temporary view.

  Despite the name, nothing is written to a Hive metastore: the frame is
  converted to a Spark DataFrame with an explicit schema and exposed through
  ``createOrReplaceTempView``. The notebook-level ``spark`` session is used.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Source data. Every column becomes a nullable field of the Spark schema.
  Tablename : str, default 'default_table'
      Name of the temporary view; an existing view of that name is replaced.

  Raises
  ------
  TypeError
      If a column's dtype is not boolean, integer, float or object/string.
      (Previously such columns produced an invalid schema entry and failed
      later with an unrelated-looking error.)
  """
  def _spark_type_for(dtype):
    # Map one pandas dtype to a Spark SQL type instance; None means "no mapping".
    if pd.api.types.is_bool_dtype(dtype):
      return BooleanType()
    if pd.api.types.is_integer_dtype(dtype):
      # 64-bit LongType: a 32-bit IntegerType cannot hold every int64 value
      return LongType()
    if pd.api.types.is_float_dtype(dtype):
      # DoubleType keeps float64 precision; FloatType is single precision
      return DoubleType()
    if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
      return StringType()
    return None

  fields=[]
  for column_name,dtype in dataframe.dtypes.items():
    spark_type=_spark_type_for(dtype)
    if spark_type is None:
      raise TypeError(
        f"Column {column_name!r} has unsupported dtype {dtype}; "
        "convert it to bool, int, float or string before registering the table"
      )
    # Third argument marks the field as nullable, as in the original schema
    fields.append(StructField(str(column_name),spark_type,True))

  sparkdf=spark.createDataFrame(dataframe,schema=StructType(fields))
  sparkdf.createOrReplaceTempView(f'{Tablename}')

# Register each pandas frame under its table name: employee, department, banking
for view_name, source_frame in (
  ('employee', df_employee),
  ('department', df_department),
  ('banking', df_banking),
):
  create_hive_table(source_frame, Tablename=view_name)

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=ea0e2cbf23caf70cd5bdd77306c9e29593d5f1ddf64b6c27fd3c2e97f7a99971
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


**Using Spark SQL**

In [None]:

# Recommended route: run a SQL string against the `employee` table through the SparkSession
employee_sql_df = spark.sql("SELECT * FROM employee")
employee_sql_df.show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


In [None]:
# Look up the `employee` table by name through the SparkSession and print it
employee_table_df = spark.table('employee')
employee_table_df.show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


**Using SQLContext**

In [None]:
# Older approach: build a SQLContext on the session's SparkContext and query through it
from pyspark.sql import SQLContext

spark_context = spark.sparkContext
sql_context = SQLContext(spark_context)
employee_via_sqlcontext = sql_context.sql("SELECT * FROM employee")
employee_via_sqlcontext.show()



+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


**Using HiveContext**

In [None]:
from pyspark.sql import HiveContext

# Older approach: wrap the session's SparkContext in a HiveContext and query through it
spark_context = spark.sparkContext
hive_context = HiveContext(spark_context)
employee_via_hivecontext = hive_context.sql("SELECT * FROM employee")
employee_via_hivecontext.show()

# Shut the SparkSession down; cells run after this point need a new session
spark.stop()



+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|
