### Create DataFrame

In [0]:
# Column names only: no types are given here, so Spark infers each column's
# type from the sample rows below.
student_schema = ['Name', 'Subject', 'Marks', 'Status', 'Attendance']

# One tuple per student, in the same order as student_schema.
# Jenny has no Marks value (None), which becomes a null in the DataFrame.
student_data = [
    ('Srishti', 'English', 85, 'P', 92),
    ('Aish', 'Science', 78, 'P', 88),
    ('Jenny', 'English', None, 'NA', 60),
    ('Rohan', 'Math', 55, 'P', 75),
    ('Aarvi', 'Science', 32, 'F', 58),
    ('Sid', 'Math', 90, 'P', 95),
]

# `df` is the base DataFrame that the following cells derive from.
df = spark.createDataFrame(data=student_data, schema=student_schema)
display(df)

Name,Subject,Marks,Status,Attendance
Srishti,English,85.0,P,92
Aish,Science,78.0,P,88
Jenny,English,,,60
Rohan,Math,55.0,P,75
Aarvi,Science,32.0,F,58
Sid,Math,90.0,P,95


### Update the Existing Column

In [0]:
from pyspark.sql.functions import when

# Overwrite the existing "Status" column: withColumn replaces a column when
# the given name already exists instead of appending a new one.
# A null Marks value satisfies neither comparison, so that row falls through
# to the otherwise() branch and is labelled "Absent".
df1 = df.withColumn(
    "Status",
    when(df["Marks"] >= 50, "Pass")
    .when(df["Marks"] < 50, "Fail")
    .otherwise("Absent"),
)
display(df1)

Name,Subject,Marks,Status,Attendance
Srishti,English,85.0,Pass,92
Aish,Science,78.0,Pass,88
Jenny,English,,Absent,60
Rohan,Math,55.0,Pass,75
Aarvi,Science,32.0,Fail,58
Sid,Math,90.0,Pass,95


### Create a New Column

In [0]:
# Add a separate "NewStatus" column and leave the original "Status" untouched.
# Branches are evaluated top to bottom; a null Marks value matches neither
# comparison and therefore ends up as "Absent".
df2 = df.withColumn(
    "NewStatus",
    when(df["Marks"] >= 50, "Pass")
    .when(df["Marks"] < 50, "Fail")
    .otherwise("Absent"),
)
display(df2)

Name,Subject,Marks,Status,Attendance,NewStatus
Srishti,English,85.0,P,92,Pass
Aish,Science,78.0,P,88,Pass
Jenny,English,,,60,Absent
Rohan,Math,55.0,P,75,Pass
Aarvi,Science,32.0,F,58,Fail
Sid,Math,90.0,P,95,Pass


### Alternative Syntax using `expr`

In [0]:
from pyspark.sql.functions import expr

In [0]:
# Same Pass / Fail / Absent rule, written as a Spark SQL CASE expression.
# The SQL is split across adjacent string literals for readability; each
# fragment ends with an explicit space so the joined text keeps its keywords
# separated ("'Pass' WHEN ..." rather than "'Pass'WHEN ...").
# A null Marks value matches neither WHEN and falls through to ELSE 'Absent'.
df3 = df.withColumn(
    "NewStatus",
    expr(
        "CASE WHEN Marks >= 50 THEN 'Pass' "
        "WHEN Marks < 50 THEN 'Fail' "
        "ELSE 'Absent' END"
    ),
)
display(df3)

Name,Subject,Marks,Status,Attendance,NewStatus
Srishti,English,85.0,P,92,Pass
Aish,Science,78.0,P,88,Pass
Jenny,English,,,60,Absent
Rohan,Math,55.0,P,75,Pass
Aarvi,Science,32.0,F,58,Fail
Sid,Math,90.0,P,95,Pass


### Multiple Conditions using the AND (`&`) and OR (`|`) Operators

In [0]:
# Grade based on two columns at once; `&` is the column-level logical AND,
# so both comparisons in a branch must hold for it to match.
# Each comparison is wrapped in parentheses because Python's `&` binds more
# tightly than `>=`.
df4 = df.withColumn(
    "Grade",
    when((df["Marks"] >= 80) & (df["Attendance"] >= 80), "Distinction")
    .when((df["Marks"] >= 50) & (df["Attendance"] >= 50), "Good")
    .otherwise("Average"),
)
display(df4)

Name,Subject,Marks,Status,Attendance,Grade
Srishti,English,85.0,P,92,Distinction
Aish,Science,78.0,P,88,Good
Jenny,English,,,60,Average
Rohan,Math,55.0,P,75,Good
Aarvi,Science,32.0,F,58,Average
Sid,Math,90.0,P,95,Distinction


In [0]:
# Variant of the grading rule where the first branch uses `|` (logical OR):
# either Marks >= 80 or Attendance >= 80 is enough for "Distinction".
# The second branch still requires both conditions (`&`).
display(
    df.withColumn(
        "Grade",
        when((df["Marks"] >= 80) | (df["Attendance"] >= 80), "Distinction")
        .when((df["Marks"] >= 50) & (df["Attendance"] >= 50), "Good")
        .otherwise("Average"),
    )
)

Name,Subject,Marks,Status,Attendance,Grade
Srishti,English,85.0,P,92,Distinction
Aish,Science,78.0,P,88,Distinction
Jenny,English,,,60,Average
Rohan,Math,55.0,P,75,Good
Aarvi,Science,32.0,F,58,Average
Sid,Math,90.0,P,95,Distinction
