Spark SQL

In [2]:
import logging
import os

from pyspark.sql import SparkSession

In [3]:
def rdd_to_dataframe(data, schema):
    """
    Build a Spark DataFrame from local data by way of an RDD.

    The data is distributed as an RDD through the SparkContext and then
    converted into a DataFrame using the given schema.

    :param data: local collection of rows (e.g. a list of tuples) to parallelize
    :param schema: schema handed to ``createDataFrame`` (e.g. a list of column names)

    :return: Spark DataFrame; the SparkSession is left running so it stays usable
    :raises Exception: any error raised while building the RDD or the DataFrame
                       is logged and then re-raised to the caller
    """

    # Reuse the active SparkSession if there is one, otherwise create it
    spark = SparkSession.builder.appName("RDDToDataFrame").getOrCreate()

    try:
        # Create an RDD from the input data, using Spark Context not Session!
        rdd = spark.sparkContext.parallelize(data)

        # Convert the RDD to a DataFrame and return it without stopping the session
        return spark.createDataFrame(rdd, schema)

    except Exception as e:
        # Log and re-raise: silently returning None would only surface later as
        # an unrelated AttributeError on the caller's side.
        # The session comes from getOrCreate(), so it may be shared with
        # DataFrames created earlier; it is deliberately not stopped here.
        logging.error('Error while transforming RDD to DF: {}'.format(e))
        raise


In [4]:
# Sample departments, one row per tuple: (department_id, department_name)
dept_schema = ["department_id", "department_name"]
dept_data = [
    (1, "Big Data"),
    (2, "Finance"),
    (3, "Marketing"),
]

# Sample employees, one row per tuple: (department_id, employee_name, age)
emp_schema = ["department_id", "employee_name", "age"]
emp_data = [
    (1, "ginny", 17),
    (1, "bobby", 30),
    (2, "Jasmin", 26),
    (3, "Nishi", 36),
]

In [5]:
# Build one DataFrame per sample dataset (local data -> RDD -> DataFrame)
df_emp = rdd_to_dataframe(emp_data, emp_schema)
df_dept = rdd_to_dataframe(dept_data, dept_schema)

24/03/17 10:01:11 WARN Utils: Your hostname, sasa-1-2 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/03/17 10:01:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/17 10:01:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [8]:
# Get a handle on the SparkSession at notebook level. This is the same builder
# call used inside rdd_to_dataframe, so getOrCreate() hands back that session
# if it is still active, or creates a new one otherwise.
spark = SparkSession.builder.appName("RDDToDataFrame").getOrCreate()

In [9]:
# Register both DataFrames as temporary views so Spark SQL can reference them by name
df_emp.createOrReplaceTempView('employees')
df_dept.createOrReplaceTempView('departments')

In [10]:
# Query sample, using Spark SQL: inner join of employees and departments.
# Both emp.* and dept.* contain department_id, so that column appears twice in the result.
spark.sql('''
            select emp.*, dept.*
            from employees as emp
                inner join departments as dept on (emp.department_id = dept.department_id) 
            ''').show()

                                                                                

+-------------+-------------+---+-------------+---------------+
|department_id|employee_name|age|department_id|department_name|
+-------------+-------------+---+-------------+---------------+
|            1|        ginny| 17|            1|       Big Data|
|            1|        bobby| 30|            1|       Big Data|
|            2|       Jasmin| 26|            2|        Finance|
|            3|        Nishi| 36|            3|      Marketing|
+-------------+-------------+---+-------------+---------------+



In [11]:
# Save the joined result set as a new temporary view, 'dept_employees'.
# There is no WHERE clause, so every matching employee/department pair is kept.
# Columns are listed explicitly, so department_id appears only once in the view.
spark.sql('''
        select emp.employee_name, emp.age, emp.department_id, dept.department_name
        from employees as emp
            inner join departments as dept on (emp.department_id = dept.department_id)
        ''').createOrReplaceTempView('dept_employees')

Join a third dataset, but with a different format

In [23]:
# Location of the budgets JSON file. It defaults to the original local path and
# can be overridden with the BUDGETS_JSON_PATH environment variable, so the
# notebook is not tied to a single machine.
BUDGETS_JSON_PATH = os.environ.get(
    'BUDGETS_JSON_PATH',
    '/home/sasa/Downloads/json/department_budgets.json',
)

# Load the data first; the "multiline" option is enabled for this JSON file
df_budgets = spark.read.option("multiline", "true").json(BUDGETS_JSON_PATH)

In [24]:
# Show the schema of the loaded JSON, including its nested array/struct columns
df_budgets.printSchema()

root
 |-- budget: long (nullable = true)
 |-- budget_authorizer: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- cto: struct (nullable = true)
 |    |    |    |-- last_name: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- office: string (nullable = true)
 |-- budget_period: string (nullable = true)
 |-- department_id: long (nullable = true)
 |-- offices: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- cost_center: struct (nullable = true)
 |    |    |    |-- budget_status: string (nullable = true)
 |    |    |    |-- office: string (nullable = true)



In [25]:
# Nested data can still be queried as-is: select the raw `offices` array for
# department 1. truncate=False prints the full cell content instead of cutting it short.
df_budgets.select('offices').where('department_id == 1').show(truncate=False)

+-------------------------------------------------------------------------+
|offices                                                                  |
+-------------------------------------------------------------------------+
|[{{denied, new york}}, {{approved, mumbai}}, {{approved, san francisco}}]|
+-------------------------------------------------------------------------+



In [26]:
# Preview the raw budgets DataFrame; long nested values are truncated in this default view
df_budgets.show()

+------+--------------------+-------------+-------------+--------------------+
|budget|   budget_authorizer|budget_period|department_id|             offices|
+------+--------------------+-------------+-------------+--------------------+
| 16000|                NULL|         year|            1|[{{denied, new yo...|
| 23000|[{{doe, joe, new ...|         year|            2|                NULL|
| 12000|                NULL|         year|            3|                NULL|
+------+--------------------+-------------+-------------+--------------------+



Querying without flattening JSON

In [27]:
# Register the nested (not flattened) budgets DataFrame as the temporary view 'budgets_json'
df_budgets.createOrReplaceTempView('budgets_json')

This could lead to spaghetti code!! 🚨
For example:

What if bud.offices has a variable number of items? Every array position has to be hard-coded in the query.

In [28]:
# Join the third dataset, reading nested values by array index and struct path.
# Only positions 0, 1 and 2 of bud.offices and position 0 of bud.budget_authorizer
# are addressed, so any further array elements are not selected.
# nvl(...) substitutes "no CTO registered" when the CTO fields are NULL.
spark.sql('''
          select  emp.employee_name, 
                  emp.department_id, 
                  bud.budget, 
                  bud.budget_period, 
                  bud.offices[0].cost_center.office as office_1,
                  bud.offices[0].cost_center.budget_status as budget_status_1,
                  bud.offices[1].cost_center.office as office_2,
                  bud.offices[1].cost_center.budget_status as budget_status_2,
                  bud.offices[2].cost_center.office as office_3,
                  bud.offices[2].cost_center.budget_status as budget_status_3,
                  nvl(bud.budget_authorizer[0].cto.name,"no CTO registered")  as cto_name,
                  nvl(bud.budget_authorizer[0].cto.last_name,"no CTO registered") as cto_last_name
          from dept_employees as emp
            inner join budgets_json as bud on (emp.department_id = bud.department_id)
          ''').show()

                                                                                

+-------------+-------------+------+-------------+--------+---------------+--------+---------------+-------------+---------------+-----------------+-----------------+
|employee_name|department_id|budget|budget_period|office_1|budget_status_1|office_2|budget_status_2|     office_3|budget_status_3|         cto_name|    cto_last_name|
+-------------+-------------+------+-------------+--------+---------------+--------+---------------+-------------+---------------+-----------------+-----------------+
|        ginny|            1| 16000|         year|new york|         denied|  mumbai|       approved|san francisco|       approved|no CTO registered|no CTO registered|
|        bobby|            1| 16000|         year|new york|         denied|  mumbai|       approved|san francisco|       approved|no CTO registered|no CTO registered|
|       Jasmin|            2| 23000|         year|    NULL|           NULL|    NULL|           NULL|         NULL|           NULL|              joe|              doe

Flattening JSON into a columnar format is normally easier, cleaner and more scalable.
Suggestion: always test and benchmark performance to compare JSON path access vs. flattening.

In [29]:
# - Import required libs. These should be on top
import logging
from pyspark.sql.types import ArrayType, StructType
from pyspark.sql.functions import explode_outer, col

There are tons of different approaches to flatten/explode JSON. This is just an example
Credits: Function adapted from nmukerje/Pyspark Flatten json repo in GitHub

In [32]:
def _complex_fields(df):
    """Return a {column name: data type} dict of the array/struct columns of ``df``."""
    return {
        field.name: field.dataType
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType))
    }


def flatten_dataframe(df):
    """
    Spark function to flatten nested structs. Function adapted from GitHub: https://bit.ly/43ZegOL

    Struct columns are replaced by one column per sub-field, named
    ``<column>_<sub-field>``. Array columns are exploded into rows with
    ``explode_outer``. The process repeats until no array or struct column is left.

    :param df: Spark dataframe with semi-structured types, such as StructType or ArrayType

    :return: Spark dataframe without ArrayType/StructType columns
    :raises Exception: any error raised while flattening is logged and re-raised
    """
    try:
        # Complex fields (arrays and structs) currently present in the schema
        complex_fields = _complex_fields(df)

        while complex_fields:
            # Process one complex column per iteration: the first one found
            col_name, col_type = next(iter(complex_fields.items()))

            # Struct: promote every sub-field to a top-level column, then drop
            # the original struct column
            if isinstance(col_type, StructType):
                expanded = [
                    col(col_name + '.' + sub_field.name).alias(col_name + '_' + sub_field.name)
                    for sub_field in col_type
                ]
                df = df.select("*", *expanded).drop(col_name)

            # Array: turn the array elements into rows; the element type may
            # itself be complex and is picked up by the next scan
            elif isinstance(col_type, ArrayType):
                df = df.withColumn(col_name, explode_outer(col_name))

            # Re-scan the schema, since flattening may expose new nested columns
            complex_fields = _complex_fields(df)

        return df

    except Exception as e:
        # Log and re-raise instead of silently returning None, which would only
        # fail later on the caller's side with an unrelated AttributeError
        logging.error('Error while flattening JSON data: {}'.format(e))
        raise

In [33]:
# Flatten the nested budgets DataFrame: structs become columns, arrays become rows
df_budgets_flat = flatten_dataframe(df_budgets)

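Data duplication when unnesting data! Each exploded array element becomes its own row, so the parent columns (budget, budget_period, department_id) are repeated on every one of those rows.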

In [34]:
# Show a sample of the flattened data
df_budgets_flat.show()

+------+-------------+-------------+-------------------------------+--------------------------+----------------------------+---------------------------------+--------------------------+
|budget|budget_period|department_id|budget_authorizer_cto_last_name|budget_authorizer_cto_name|budget_authorizer_cto_office|offices_cost_center_budget_status|offices_cost_center_office|
+------+-------------+-------------+-------------------------------+--------------------------+----------------------------+---------------------------------+--------------------------+
| 16000|         year|            1|                           NULL|                      NULL|                        NULL|                           denied|                  new york|
| 16000|         year|            1|                           NULL|                      NULL|                        NULL|                         approved|                    mumbai|
| 16000|         year|            1|                           NULL|  

In [35]:
# Schema after flattening
df_budgets_flat.printSchema()

root
 |-- budget: long (nullable = true)
 |-- budget_period: string (nullable = true)
 |-- department_id: long (nullable = true)
 |-- budget_authorizer_cto_last_name: string (nullable = true)
 |-- budget_authorizer_cto_name: string (nullable = true)
 |-- budget_authorizer_cto_office: string (nullable = true)
 |-- offices_cost_center_budget_status: string (nullable = true)
 |-- offices_cost_center_office: string (nullable = true)



In [36]:
# Same process as before: register the flattened budgets DataFrame as the temporary view 'budgets_flat'
df_budgets_flat.createOrReplaceTempView('budgets_flat')

In [37]:
# Check the contents of the dept_employees view before joining it with the flattened budgets
spark.sql('''
          select *
          from dept_employees
          ''').show()

+-------------+---+-------------+---------------+
|employee_name|age|department_id|department_name|
+-------------+---+-------------+---------------+
|        ginny| 17|            1|       Big Data|
|        bobby| 30|            1|       Big Data|
|       Jasmin| 26|            2|        Finance|
|        Nishi| 36|            3|      Marketing|
+-------------+---+-------------+---------------+



In [38]:
# Join the third dataset, this time through the flattened view.
# budgets_flat has one row per exploded office, so each employee row is repeated
# once per office of their department. show(n=50) prints up to 50 rows.
spark.sql('''
          select emp.department_id, emp.employee_name, emp.department_name, bud.budget, bud.budget_period, bud.offices_cost_center_office, bud.budget_authorizer_cto_name
          from dept_employees as emp
            inner join budgets_flat as bud on (emp.department_id = bud.department_id)
          ''').show(n=50)

+-------------+-------------+---------------+------+-------------+--------------------------+--------------------------+
|department_id|employee_name|department_name|budget|budget_period|offices_cost_center_office|budget_authorizer_cto_name|
+-------------+-------------+---------------+------+-------------+--------------------------+--------------------------+
|            1|        ginny|       Big Data| 16000|         year|             san francisco|                      NULL|
|            1|        ginny|       Big Data| 16000|         year|                    mumbai|                      NULL|
|            1|        ginny|       Big Data| 16000|         year|                  new york|                      NULL|
|            1|        bobby|       Big Data| 16000|         year|             san francisco|                      NULL|
|            1|        bobby|       Big Data| 16000|         year|                    mumbai|                      NULL|
|            1|        bobby|   