In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit,udf
from pyspark.sql.types import IntegerType, DoubleType

spark = SparkSession.builder.appName('UDF').getOrCreate()
df = spark.read.options(header=True,inferSchema=True).csv('/FileStore/tables/OfficeData.csv')
df.printSchema()
df.show()


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- bonus: integer (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [0]:
def total_salary(salary,bonus):
    return salary+bonus

total_salary_UDF = udf(lambda x,y: total_salary(x,y),IntegerType())

df.withColumn('Total Salary',total_salary_UDF(col('salary'),col('bonus'))).show()

+-------------+----------+-----+------+---+-----+------------+
|employee_name|department|state|salary|age|bonus|Total Salary|
+-------------+----------+-----+------+---+-----+------------+
|        James|     Sales|   NY| 90000| 34|10000|      100000|
|      Michael|     Sales|   NY| 86000| 56|20000|      106000|
|       Robert|     Sales|   CA| 81000| 30|23000|      104000|
|        Maria|   Finance|   CA| 90000| 24|23000|      113000|
|        Raman|   Finance|   CA| 99000| 40|24000|      123000|
|        Scott|   Finance|   NY| 83000| 36|19000|      102000|
|          Jen|   Finance|   NY| 79000| 53|15000|       94000|
|         Jeff| Marketing|   CA| 80000| 25|18000|       98000|
|        Kumar| Marketing|   NY| 91000| 50|21000|      112000|
+-------------+----------+-----+------+---+-----+------------+



In [0]:
# Use OfficeData.csv
# read this in Df
# create a new column increment and provide the increment to the employee on the following criteria
#     if the employee is in the NY state, his increment would be 10% of salary plus 5% of bonus
#     if the employee is in CA state, his increment would be 12% of hte salary and 

In [0]:
df = spark.read.options(header=True,inferSchema=True).csv('/FileStore/tables/OfficeData.csv')
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [0]:
def salary_Increment(state, salary, bonus):
    sum = 0
    if state == 'NY':
        sum = salary * 0.10
        sum += bonus * 0.5
    if state == 'CA':
        sum = salary * 0.12
        sum += bonus * 0.3
    return sum

total_salary = udf(lambda x,y,z: salary_Increment(x,y,z), DoubleType())

df.withColumn('increment salary', total_salary(col('state'),col('salary'),col('bonus'))).show()

+-------------+----------+-----+------+---+-----+----------------+
|employee_name|department|state|salary|age|bonus|increment salary|
+-------------+----------+-----+------+---+-----+----------------+
|        James|     Sales|   NY| 90000| 34|10000|         14000.0|
|      Michael|     Sales|   NY| 86000| 56|20000|         18600.0|
|       Robert|     Sales|   CA| 81000| 30|23000|         16620.0|
|        Maria|   Finance|   CA| 90000| 24|23000|         17700.0|
|        Raman|   Finance|   CA| 99000| 40|24000|         19080.0|
|        Scott|   Finance|   NY| 83000| 36|19000|         17800.0|
|          Jen|   Finance|   NY| 79000| 53|15000|         15400.0|
|         Jeff| Marketing|   CA| 80000| 25|18000|         15000.0|
|        Kumar| Marketing|   NY| 91000| 50|21000|         19600.0|
+-------------+----------+-----+------+---+-----+----------------+

