In [0]:
# A PySpark UDF (User Defined Function) wraps a Python function so it can be reused in Spark. Once a UDF is created, it can be reused on multiple DataFrames and, after registering it, in SQL. The default return type of udf() is StringType.
# For more background, see:
# https://www.geeksforgeeks.org/how-to-write-spark-udf-user-defined-functions-in-python/

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit,udf
from pyspark.sql.types import IntegerType, DoubleType
# Entry point for all DataFrame work in this notebook; the application is named "UDF".
# NOTE(review): getOrCreate() presumably reuses an already-running session instead of
# starting a second one - confirm against the PySpark SparkSession.builder docs.
spark = SparkSession.builder.appName("UDF").getOrCreate()

In [0]:
# Read the employee CSV, treating the first row as the header and letting Spark
# infer column types, then print the rows.
df = (
    spark.read
    .options(header='True', inferSchema='True')
    .csv('/FileStore/tables/file_csv.txt')
)
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [0]:
# UDF -> User Defined Function (for custom requirements)
# Simple UDF: compute each employee's increment from their state.
#   NY -> 10% of salary plus 5% of bonus
#   CA -> 12% of salary plus 3% of bonus
def priyanshu(state, salary, bonus):
    """Return the total increment for one employee as a float.

    Args:
        state: State code; only "NY" and "CA" have rates defined.
        salary: Current salary (a number), or None for a missing value.
        bonus: Current bonus (a number), or None for a missing value.

    Returns:
        salary * salary_rate + bonus * bonus_rate for "NY"/"CA", 0.0 for any
        other state, or None when the state has rates but salary or bonus is
        missing.
    """
    if state == "NY":
        salary_rate, bonus_rate = 0.10, 0.05
    elif state == "CA":
        salary_rate, bonus_rate = 0.12, 0.03
    else:
        # Must be the float 0.0, not the int 0: when this function backs a UDF
        # declared with DoubleType, PySpark does not coerce a Python int and
        # the row would silently come back as null.
        return 0.0

    # Spark passes SQL nulls to Python as None; return null for the row
    # instead of raising TypeError inside the executor.
    if salary is None or bonus is None:
        return None

    return salary * salary_rate + bonus * bonus_rate
 
 

In [0]:
# Sanity-check the plain Python function before wrapping it in a UDF:
# for "CA" this is 1000 * 0.12 + 100 * 0.03.
print(priyanshu("CA",1000,100))

123.0


In [0]:
# A plain withColumn expression is built from Spark column operations; wrapping a
# Python function in a UDF lets us run arbitrary per-row Python logic (such as the
# state-dependent rule above) instead.
# udf() takes two inputs: the Python callable, and the Spark return type of the result.
Udf = udf(
    lambda state, salary, bonus: priyanshu(state, salary, bonus),
    returnType=DoubleType(),
)

In [0]:
# Add a new "increment" column by feeding the state, salary and bonus columns
# loaded from the CSV through the UDF, then print the result.
df.withColumn(
    "increment",
    Udf(df["state"], df["salary"], df["bonus"]),
).show()

+-------------+----------+-----+------+---+-----+---------+
|employee_name|department|state|salary|age|bonus|increment|
+-------------+----------+-----+------+---+-----+---------+
|        James|     Sales|   NY| 90000| 34|10000|   9500.0|
|      Michael|     Sales|   NY| 86000| 56|20000|   9600.0|
|       Robert|     Sales|   CA| 81000| 30|23000|  10410.0|
|        Maria|   Finance|   CA| 90000| 24|23000|  11490.0|
|        Raman|   Finance|   CA| 99000| 40|24000|  12600.0|
|        Scott|   Finance|   NY| 83000| 36|19000|   9250.0|
|          Jen|   Finance|   NY| 79000| 53|15000|   8650.0|
|         Jeff| Marketing|   CA| 80000| 25|18000|  10140.0|
|        Kumar| Marketing|   NY| 91000| 50|21000|  10150.0|
+-------------+----------+-----+------+---+-----+---------+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit,udf
from pyspark.sql.types import IntegerType, DoubleType,ArrayType
# Same builder call (same app name "UDF") as the session created at the top of the notebook.
# NOTE(review): presumably getOrCreate() just returns that existing session, making this
# line redundant - confirm and consider removing the duplicate.
spark = SparkSession.builder.appName("UDF").getOrCreate()

In [0]:
# UDF -> User Defined Function (for custom requirements)
# UDF returning two values: the salary increment and the bonus increment, per state.
#   NY -> 10% of salary and 5% of bonus
#   CA -> 12% of salary and 3% of bonus
def priyanshu(state, salary, bonus):
    """Return the (salary_increment, bonus_increment) pair for one employee.

    NOTE: this reuses the name of the single-value version defined earlier in
    the notebook, so running this cell replaces that function.

    Args:
        state: State code; only "NY" and "CA" have rates defined.
        salary: Current salary (a number), or None for a missing value.
        bonus: Current bonus (a number), or None for a missing value.

    Returns:
        A 2-tuple (salary * salary_rate, bonus * bonus_rate) for "NY"/"CA",
        where an element is None if its input is missing; (0.0, 0.0) for any
        other state.
    """
    if state == "NY":
        salary_rate, bonus_rate = 0.10, 0.05
    elif state == "CA":
        salary_rate, bonus_rate = 0.12, 0.03
    else:
        # Floats, not ints: under ArrayType(DoubleType()) PySpark does not
        # coerce Python ints, so int zeros would surface as null elements.
        return 0.0, 0.0

    # Spark passes SQL nulls to Python as None; propagate a null element
    # rather than raising TypeError inside the executor.
    salary_increment = None if salary is None else salary * salary_rate
    bonus_increment = None if bonus is None else bonus * bonus_rate
    return salary_increment, bonus_increment
 
 

In [0]:
# Wrap the pair-returning function as a UDF whose result is an array of doubles.
Udf = udf(
    lambda state, salary, bonus: priyanshu(state, salary, bonus),
    returnType=ArrayType(DoubleType()),
)

In [0]:
# Build a DataFrame with a new "increment" column computed by the UDF from the
# state, salary and bonus columns loaded from the CSV.
demo = df.withColumn(
    "increment",
    Udf(df["state"], df["salary"], df["bonus"]),
)

In [0]:
# Plain assignment: demo1 is just another name for the same DataFrame object as demo.
# Nothing is cached by this line; no cache()/persist() call is made here.
demo1 = demo
print(type(demo1))

<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:
# Plain assignment: demo2 is just another name for the same DataFrame object as demo1.
# Nothing is cached by this line; no cache()/persist() call is made here.
demo2 = demo1
print(type(demo2))

<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:
# Plain assignment: demo3 is just another name for the same DataFrame object as demo2.
# Nothing is cached by this line; no cache()/persist() call is made here.
demo3 = demo2
print(type(demo3))

<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:
# demo1, demo2 and demo3 were assigned from one another above, so all four calls
# below act on the same DataFrame object.
# NOTE(review): the repeated calls therefore look redundant - a single cache() (or a
# single persist() with an explicit storage level) should be enough; confirm against
# the PySpark DataFrame.cache / DataFrame.persist docs before trimming.
demo1.cache()
demo2.cache()
demo3.cache()
# Last expression in the cell, so its return value is what the cell displays.
demo3.persist()

Out[30]: DataFrame[employee_name: string, department: string, state: string, salary: int, age: int, bonus: int, increment: array<double>]

In [0]:
# Transformations are lazy by default: each action recomputes the whole lineage from the very beginning, and intermediate results are not saved between runs. For a chain like data -> demo1 -> demo2 -> demo3, an action on demo2 recomputes demo1, and an action on demo3 recomputes both demo1 and demo2, even if they have already been run.
# To avoid this recomputation, we can keep the data in RAM (cache) or at a storage level of our choice, which may include the local disk (persist).
# Using the cache() and persist() methods, Spark provides an optimization mechanism that stores the intermediate computation of an RDD, DataFrame, or Dataset so it can be reused in subsequent actions (reusing the RDD, DataFrame, or Dataset computation results).

# Both caching and persisting are used to save Spark RDDs, DataFrames, and Datasets. The difference is that the RDD cache() method saves to memory (MEMORY_ONLY) by default, whereas persist() lets you store the data at a user-defined storage level. (Note: for DataFrames and Datasets, cache() defaults to a memory-and-disk storage level rather than MEMORY_ONLY.)