In [0]:
# Create the Spark session for this notebook (getOrCreate reuses an existing one)
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("UserDefinedFunction").getOrCreate()
spark


In [0]:
# Print the master URL only when it points at a standalone master (spark://...)
master_url = spark.sparkContext.master
if not master_url.startswith("spark://"):
    print("Not connected to a Spark standalone master.")
else:
    print("Spark Master URL:", master_url)

Not connected to a Spark standalone master.


In [0]:
# Load the employee CSV using an explicit schema in which every column is a string
schema = "employee_id string, department_id string, name string, age string, gender string, salary string, hire_date string"
emp = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/data/output/1/emp.csv")
)
# Show how many partitions back the DataFrame
emp.rdd.getNumPartitions()

Out[4]: 7

In [0]:
# Function to generate 10% of the Salary as Bonus
def bonus(salary):
    """Return 10% of ``salary`` as the bonus amount.

    Args:
        salary: Salary as an integer or as a string that ``int`` can parse
            (the employee data is read with ``salary`` typed as string).

    Returns:
        float: ``int(salary) * 0.1``, or ``None`` when ``salary`` is ``None``
        so that a null salary yields a null bonus instead of an error.

    Raises:
        ValueError: If ``salary`` is a string that ``int`` cannot parse.
    """
    # A null cell reaches a Python UDF as None; int(None) would raise TypeError
    # and fail the whole job, so pass the null through instead.
    if salary is None:
        return None
    return int(salary) * 0.1

In [0]:
# Register a UDF
from pyspark.sql.functions import udf
# Declare the return type explicitly: udf() defaults to a string return type,
# while bonus() returns a float, so without it the column would not be numeric.
bonus_udf = udf(bonus, "double")  # This is only available for the DataFrame API

In [0]:
# Add a "bonus" column computed by the DataFrame-API UDF and display the result
from pyspark.sql.functions import expr

bonus_col = bonus_udf("salary")
emp.withColumn("bonus", bonus_col).show()

+-----------+-------------+-----------+---+------+------+----------+------+
|employee_id|department_id|       name|age|gender|salary| hire_date| bonus|
+-----------+-------------+-----------+---+------+------+----------+------+
|        019|          103|Steven Chen| 36|  Male| 62000|2015-08-01|6200.0|
|        020|          102|  Grace Kim| 32|Female| 53000|2018-11-01|5300.0|
|        008|          102|   Kate Kim| 29|Female| 51000|2019-10-01|5100.0|
|        009|          103|    Tom Tan| 33|  Male| 58000|2016-06-01|5800.0|
|        012|          105| Susan Chen| 31|Female| 54000|2017-02-15|5400.0|
|        006|          103|  Jill Wong| 32|Female| 52000|2018-07-01|5200.0|
+-----------+-------------+-----------+---+------+------+----------+------+



- What if we need to use the UDF from Spark SQL or from SQL expressions (e.g. inside `expr`)?
- For that, we need to register it as a SQL function on the Spark session with `spark.udf.register`, as shown below.
- Once the UDF is registered, it can be called by name from SQL in any language that uses this Spark session. You can write it inside expressions to get the work done.


In [0]:
# Register bonus() as the SQL function "bonus_sql_udf" with a double return type
spark.udf.register(name="bonus_sql_udf", f=bonus, returnType="double")

Out[13]: <function __main__.bonus(salary)>

In [0]:
# Add a "bonus" column by calling the registered SQL UDF from a SQL expression
from pyspark.sql.functions import expr

bonus_sql_expr = expr("bonus_sql_udf(salary)")
emp.withColumn("bonus", bonus_sql_expr).show()

+-----------+-------------+-----------+---+------+------+----------+------+
|employee_id|department_id|       name|age|gender|salary| hire_date| bonus|
+-----------+-------------+-----------+---+------+------+----------+------+
|        019|          103|Steven Chen| 36|  Male| 62000|2015-08-01|6200.0|
|        020|          102|  Grace Kim| 32|Female| 53000|2018-11-01|5300.0|
|        008|          102|   Kate Kim| 29|Female| 51000|2019-10-01|5100.0|
|        009|          103|    Tom Tan| 33|  Male| 58000|2016-06-01|5800.0|
|        012|          105| Susan Chen| 31|Female| 54000|2017-02-15|5400.0|
|        006|          103|  Jill Wong| 32|Female| 52000|2018-07-01|5200.0|
+-----------+-------------+-----------+---+------+------+----------+------+



> NOTE: Use a Python UDF only when it is absolutely necessary. It is an expensive process, because the data has to be passed between the JVM and a separate Python process.

In [0]:
# Add the same "bonus" column with a plain SQL expression, no UDF involved
native_bonus_expr = expr("salary * 0.1 ")
emp.withColumn("bonus", native_bonus_expr).show()

+-----------+-------------+-----------+---+------+------+----------+------+
|employee_id|department_id|       name|age|gender|salary| hire_date| bonus|
+-----------+-------------+-----------+---+------+------+----------+------+
|        019|          103|Steven Chen| 36|  Male| 62000|2015-08-01|6200.0|
|        020|          102|  Grace Kim| 32|Female| 53000|2018-11-01|5300.0|
|        008|          102|   Kate Kim| 29|Female| 51000|2019-10-01|5100.0|
|        009|          103|    Tom Tan| 33|  Male| 58000|2016-06-01|5800.0|
|        012|          105| Susan Chen| 31|Female| 54000|2017-02-15|5400.0|
|        006|          103|  Jill Wong| 32|Female| 52000|2018-07-01|5200.0|
+-----------+-------------+-----------+---+------+------+----------+------+



- You will see the same result as above, this time without a UDF.
- In this case, however, no Python process is created.
- The computation happens entirely within the JVM of the executor itself.