In [2]:
from pyspark.sql import SparkSession

# Build a local Spark session (or reuse an already active one) named "udf".
spark = (
    SparkSession.builder
    .appName("udf")
    .master("local[*]")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [4]:
# Sample rows as (name, salary) tuples.
data = [
    ("Shreyas", 100),
    ("John", 200),
    ("Doe", 300),
]
# Schema given as a DDL-style string: one string column, one integer column.
schema_Str = "Name String, Salary Int"

df = spark.createDataFrame(data, schema=schema_Str)

In [5]:
# Plain Python function that is later wrapped as a Spark UDF
def increment(salary):
    """Return ``salary`` multiplied by 1.3 (a 30% raise).

    Args:
        salary: Numeric salary value, or ``None`` for a missing value.

    Returns:
        ``salary * 1.3``, or ``None`` when ``salary`` is ``None``.
    """
    # Spark passes SQL NULLs to Python UDFs as None; propagate the null
    # instead of failing the whole job with TypeError on `None * 1.3`.
    if salary is None:
        return None
    return salary * 1.3

In [6]:
# Wrap the Python function as a UDF for the DataFrame API.
# udf() returns a callable that produces Column expressions; it does not
# register anything with the SparkSession (that is only needed for Spark SQL).

from pyspark.sql.functions import udf

# Declare the return type explicitly: udf() defaults to StringType, which
# would make the new column a string ("130.0") instead of a double.
increment_udf = udf(increment, "double")

In [7]:
# Add a column holding the UDF result computed from Salary, then print the rows.
df_new = df.withColumn(
    "Incremented_salary",
    increment_udf(df["Salary"]),
)
df_new.show()

+-------+------+------------------+
|   Name|Salary|Incremented_salary|
+-------+------+------------------+
|Shreyas|   100|             130.0|
|   John|   200|             260.0|
|    Doe|   300|             390.0|
+-------+------+------------------+



In [15]:
# To call the function from Spark SQL (on a table/view) it has to be
# registered by name on the session, which is a separate step from udf():
# spark.udf.register(name, f, returnType)

spark.udf.register(name="increment_udf", f=increment, returnType="double")

<function __main__.increment(salary)>

In [16]:
# Expose the DataFrame to Spark SQL as a temporary view.
df.createOrReplaceTempView("df_view")

# Call the registered function by name inside a SQL query and print the result.
(
    spark
    .sql("select *, increment_udf(Salary) from df_view")
    .show()
)

+-------+------+---------------------+
|   Name|Salary|increment_udf(Salary)|
+-------+------+---------------------+
|Shreyas|   100|                130.0|
|   John|   200|                260.0|
|    Doe|   300|                390.0|
+-------+------+---------------------+

