In [34]:
# Spark Session
from pyspark.sql import SparkSession

# Start from a builder that names the application and targets local mode.
session_builder = SparkSession.builder.appName("UDF").master("local[*]")

# Apply the resource settings one by one, in the same order as before.
for conf_key, conf_value in (
    ("spark.executor.cores", 2),
    ("spark.cores.max", 6),
    ("spark.executor.memory", "512M"),
):
    session_builder = session_builder.config(conf_key, conf_value)

# Reuse an already-running session if there is one, otherwise create it.
spark = session_builder.getOrCreate()

# Last expression of the cell: display the session summary.
spark

25/01/12 13:05:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [45]:
# Read employee data

# emp_schema = "employee_id string, department_id string, name string, age string, gender string, salary string, hire_date string"

# Location of the employee dataset written by an earlier step.
emp_path = "data/output/11/3/emp.parquet"

# Load the parquet data into a DataFrame.
emp = spark.read.load(emp_path, format="parquet")

# Last expression of the cell: number of partitions backing the DataFrame.
emp.rdd.getNumPartitions()

                                                                                

7

In [46]:
# Print a tabular preview of the employee data (explicit default arguments).
emp.show(n=20, truncate=True)

+-----------+-------------+---+------+------+----------+-------------+
|employee_id|         name|age|gender|salary| hire_date|department_id|
+-----------+-------------+---+------+------+----------+-------------+
|        003|    Bob Brown| 35|  Male| 55000|2014-05-01|          102|
|        004|    Alice Lee| 28|Female| 48000|2017-09-30|          102|
|        008|     Kate Kim| 29|Female| 51000|2019-10-01|          102|
|        020|    Grace Kim| 32|Female| 53000|2018-11-01|          102|
|        005|    Jack Chan| 40|  Male| 60000|2013-04-01|          103|
|        006|    Jill Wong| 32|Female| 52000|2018-07-01|          103|
|        009|      Tom Tan| 33|  Male| 58000|2016-06-01|          103|
|        019|  Steven Chen| 36|  Male| 62000|2015-08-01|          103|
|        001|     John Doe| 30|  Male| 50000|2015-01-01|          101|
|        002|   Jane Smith| 25|Female| 45000|2016-02-15|          101|
|        007|James Johnson| 42|  Male| 70000|2012-03-15|          101|
|     

In [47]:
# Create a function to generate 10% of Salary as Bonus
import time
def bonus(salary):
    """Return 10% of ``salary`` as the bonus amount.

    Args:
        salary: The salary as an int or as a string holding an integer
            (e.g. ``"55000"``). ``None`` is accepted and represents a
            missing salary.

    Returns:
        ``int(salary) * 0.1`` as a float, or ``None`` when ``salary`` is
        ``None``.

    Raises:
        ValueError: If ``salary`` is a string that ``int()`` cannot parse.
    """
    # Spark hands SQL NULLs to a Python UDF as None; int(None) would raise
    # TypeError and fail the whole job, so propagate the null instead.
    if salary is None:
        return None
    return int(salary) * 0.1

In [48]:
from pyspark.sql.functions import udf

# Wrap the Python function as a DataFrame UDF. The return type is declared
# explicitly as "double": without it udf() falls back to a string column,
# which would not match the "double" type used for the SQL registration of
# the same function later in this notebook.
bonus_udf = udf(bonus, "double")

In [49]:
# Build the bonus column expression from the salary column, then display
# the employee data with that column appended.
bonus_col = bonus_udf("salary")
emp.withColumn("bonus", bonus_col).show()

+-----------+-------------+---+------+------+----------+-------------+------+
|employee_id|         name|age|gender|salary| hire_date|department_id| bonus|
+-----------+-------------+---+------+------+----------+-------------+------+
|        003|    Bob Brown| 35|  Male| 55000|2014-05-01|          102|5500.0|
|        004|    Alice Lee| 28|Female| 48000|2017-09-30|          102|4800.0|
|        008|     Kate Kim| 29|Female| 51000|2019-10-01|          102|5100.0|
|        020|    Grace Kim| 32|Female| 53000|2018-11-01|          102|5300.0|
|        005|    Jack Chan| 40|  Male| 60000|2013-04-01|          103|6000.0|
|        006|    Jill Wong| 32|Female| 52000|2018-07-01|          103|5200.0|
|        009|      Tom Tan| 33|  Male| 58000|2016-06-01|          103|5800.0|
|        019|  Steven Chen| 36|  Male| 62000|2015-08-01|          103|6200.0|
|        001|     John Doe| 30|  Male| 50000|2015-01-01|          101|5000.0|
|        002|   Jane Smith| 25|Female| 45000|2016-02-15|        

In [50]:
# Python UDF with spark sql

# Expose the Python function to Spark SQL under the name "bonus_sql_udf",
# returning a double.
spark.udf.register(name="bonus_sql_udf", f=bonus, returnType="double")

from pyspark.sql.functions import expr

# Call the registered function through a SQL expression and show the result.
bonus_sql_col = expr("bonus_sql_udf(salary)")
emp.withColumn("bonus", bonus_sql_col).show()


25/01/12 13:18:44 WARN SimpleFunctionRegistry: The function bonus_sql_udf replaced a previously registered function.


+-----------+-------------+---+------+------+----------+-------------+------+
|employee_id|         name|age|gender|salary| hire_date|department_id| bonus|
+-----------+-------------+---+------+------+----------+-------------+------+
|        003|    Bob Brown| 35|  Male| 55000|2014-05-01|          102|5500.0|
|        004|    Alice Lee| 28|Female| 48000|2017-09-30|          102|4800.0|
|        008|     Kate Kim| 29|Female| 51000|2019-10-01|          102|5100.0|
|        020|    Grace Kim| 32|Female| 53000|2018-11-01|          102|5300.0|
|        005|    Jack Chan| 40|  Male| 60000|2013-04-01|          103|6000.0|
|        006|    Jill Wong| 32|Female| 52000|2018-07-01|          103|5200.0|
|        009|      Tom Tan| 33|  Male| 58000|2016-06-01|          103|5800.0|
|        019|  Steven Chen| 36|  Male| 62000|2015-08-01|          103|6200.0|
|        001|     John Doe| 30|  Male| 50000|2015-01-01|          101|5000.0|
|        002|   Jane Smith| 25|Female| 45000|2016-02-15|        

In [52]:
# Info: A UDF is a very expensive operation because it involves
# serialization/de-serialization of the data, so it should be avoided in production use cases.
# If a UDF is really necessary, define it in Scala,
# register it in spark.sql, and use it for downstream operations.

In [53]:
# Stop the Spark session that was created in the first cell of this notebook.
spark.stop()