In [2]:
# Testing pyspark Installation
import findspark

# Raw string: "\S" is not a recognised escape sequence, so the plain literal
# 'C:\Spark' triggers an invalid-escape warning on recent Python versions.
# r'C:\Spark' has the same runtime value without the warning.
findspark.init(r'C:\Spark')
findspark.find()

import pyspark
from pyspark.sql import SparkSession

# Create Spark Session: local master with one worker thread ("local[1]"),
# reusing an already-running session if one exists (getOrCreate).
spark = SparkSession.builder.master("local[1]").appName('SparkByExamples.com').getOrCreate()

# Last expression of the cell: display the session object.
spark

In [4]:
# Load the first sample CSV; the first line of the file is used as the header row.
csv_file = 'data/1.csv'

df = spark.read.csv(csv_file, header=True)

# Preview at most five rows.
df.show(n=5)

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|  null|  null|
+-------+------+------+



In [5]:
from pyspark.sql import functions as F

# 1 when salary is above 400000, otherwise 0. Rows where the comparison is not
# true (including a null salary) take the otherwise() branch.
# NOTE(review): the CSV was read without schema inference, so this comparison
# presumably relies on Spark's implicit string-to-number cast -- confirm.
high_salary_flag = F.when(df["salary"] > 400000, 1).otherwise(0)

# No alias is given, so the column header is the generated CASE WHEN text.
df.select(df["name"], high_salary_flag).show()

+-------+---------------------------------------------+
|   name|CASE WHEN (salary > 400000) THEN 1 ELSE 0 END|
+-------+---------------------------------------------+
|  James|                                            0|
|Michael|                                            0|
| Robert|                                            0|
|  Maria|                                            1|
|    Jen|                                            0|
+-------+---------------------------------------------+



In [7]:
# Load the second sample CSV (header row included). This rebinds `df`, so the
# cells that follow operate on this DataFrame rather than the one from data/1.csv.
csv_file = 'data/2.csv'

df = spark.read.csv(csv_file, header=True)

# Preview at most five rows.
df.show(n=5)

+------+------+---+----------+------+
|  Name|RollNo|Age|Percentage|Gender|
+------+------+---+----------+------+
|Candis|    27| 20|      89.2|     F|
|Robert|    60| 21|      85.1| Other|
| Nanki|    46| 21|      80.1|     F|
|  Deck|    30| 19|      90.7|     M|
+------+------+---+----------+------+



In [8]:
# Print the column names and data types of the current `df`. The reader call
# above sets only the "header" option, and every column is reported as string.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- RollNo: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Percentage: string (nullable = true)
 |-- Gender: string (nullable = true)



In [None]:
# withColumn: accepts two arguments
#     First argument: name of the existing column or of the new column
#     Second argument: the column expression, e.g. a when/otherwise condition

In [9]:
# Replace 'F' with 'Female' and 'M' with 'Male' in the existing Gender column.
from pyspark.sql.functions import col, when

# Branches are evaluated in order; anything that is neither "M" nor "F"
# falls through to "Other".
gender_label = (
    when(col("Gender") == "M", "Male")
    .when(col("Gender") == "F", "Female")
    .otherwise("Other")
)

# Reusing the existing column name overwrites that column in the result.
df_when = df.withColumn("Gender", gender_label)

df_when.show()

+------+------+---+----------+------+
|  Name|RollNo|Age|Percentage|Gender|
+------+------+---+----------+------+
|Candis|    27| 20|      89.2|Female|
|Robert|    60| 21|      85.1| Other|
| Nanki|    46| 21|      80.1|Female|
|  Deck|    30| 19|      90.7|  Male|
+------+------+---+----------+------+



In [10]:
# New column: keep every existing column and append a derived one.

# NOTE(review): the fallback label is spelled "Unknow" (sic); it is left
# unchanged here so the recorded output below stays valid.
new_gender = (
    when(col("Gender") == "M", "Male")
    .when(col("Gender") == "F", "Female")
    .otherwise("Unknow")
    .alias("New_gender")
)

# col("*") selects all original columns; the aliased expression is added last.
df_when2 = df.select(col("*"), new_gender)

df_when2.show()

+------+------+---+----------+------+----------+
|  Name|RollNo|Age|Percentage|Gender|New_gender|
+------+------+---+----------+------+----------+
|Candis|    27| 20|      89.2|     F|    Female|
|Robert|    60| 21|      85.1| Other|    Unknow|
| Nanki|    46| 21|      80.1|     F|    Female|
|  Deck|    30| 19|      90.7|     M|      Male|
+------+------+---+----------+------+----------+



In [11]:
# case when

# Question: create a "Performance" column derived from Percentage.

from pyspark.sql.functions import expr

# SQL CASE expression assembled from its clauses:
#   above 88.0 -> 'Excellent', below 83.0 -> 'Average', anything else -> 'Great'.
performance_sql = " ".join([
    "case",
    "when Percentage>88.0 then 'Excellent'",
    "when Percentage<83.0 then 'Average'",
    "else 'Great' end",
])

df_case = df.withColumn("Performance", expr(performance_sql))

# View the resulting DataFrame.
df_case.show()

+------+------+---+----------+------+-----------+
|  Name|RollNo|Age|Percentage|Gender|Performance|
+------+------+---+----------+------+-----------+
|Candis|    27| 20|      89.2|     F|  Excellent|
|Robert|    60| 21|      85.1| Other|      Great|
| Nanki|    46| 21|      80.1|     F|    Average|
|  Deck|    30| 19|      90.7|     M|  Excellent|
+------+------+---+----------+------+-----------+



In [12]:
# Using the & (and) and | (or) column operators.
# Each comparison is parenthesised before being combined.
pass_condition = (df["Age"] > 19) & (df["Percentage"] > 87.0)
fail_condition = (df["Age"] == 19) | (df["Percentage"] == 87.0)

# Branches are checked in order; rows matching neither condition get "Unknown".
results_column = (
    when(pass_condition, "Pass")
    .when(fail_condition, "Fail")
    .otherwise("Unknown")
)

data = df.withColumn("Results", results_column)

data.show()

+------+------+---+----------+------+-------+
|  Name|RollNo|Age|Percentage|Gender|Results|
+------+------+---+----------+------+-------+
|Candis|    27| 20|      89.2|     F|   Pass|
|Robert|    60| 21|      85.1| Other|Unknown|
| Nanki|    46| 21|      80.1|     F|Unknown|
|  Deck|    30| 19|      90.7|     M|   Fail|
+------+------+---+----------+------+-------+

