In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os

In [34]:
spark = SparkSession.builder.appName("Basics").master("local[*]").getOrCreate()

In [35]:
data=[(10 ,'Anil',50000, 18),
(11 ,'Vikas',75000,  16),
(12 ,'Nisha',40000,  18),
(13 ,'Nidhi',60000,  17),
(14 ,'Priya',80000,  18),
(15 ,'Mohit',45000,  18),
(16 ,'Rajesh',90000, 10),
(17 ,'Raman',55000, 16),
(18 ,'Sam',65000,   17),
(17 ,'Raman',55000, 16),
(17 ,'Raman',59000, 16)]


In [36]:
schema =['id','name','salary','manager_id']

In [37]:
df = spark.createDataFrame(data,schema)

In [38]:
df.withColumn("salary",when(col("salary") > 50000, col("salary") * 1.1).otherwise(col("salary"))).show()

+---+------+-----------------+----------+
| id|  name|           salary|manager_id|
+---+------+-----------------+----------+
| 10|  Anil|          50000.0|        18|
| 11| Vikas|          82500.0|        16|
| 12| Nisha|          40000.0|        18|
| 13| Nidhi|          66000.0|        17|
| 14| Priya|          88000.0|        18|
| 15| Mohit|          45000.0|        18|
| 16|Rajesh|99000.00000000001|        10|
| 17| Raman|60500.00000000001|        16|
| 18|   Sam|          71500.0|        17|
| 17| Raman|60500.00000000001|        16|
| 17| Raman|64900.00000000001|        16|
+---+------+-----------------+----------+



In [39]:
df.withColumn("salary_status",when(col("salary") > 50000,"yes")
                       .when(col("salary") < 50000,"no")
                          .otherwise("no value")).show()

+---+------+------+----------+-------------+
| id|  name|salary|manager_id|salary_status|
+---+------+------+----------+-------------+
| 10|  Anil| 50000|        18|     no value|
| 11| Vikas| 75000|        16|          yes|
| 12| Nisha| 40000|        18|           no|
| 13| Nidhi| 60000|        17|          yes|
| 14| Priya| 80000|        18|          yes|
| 15| Mohit| 45000|        18|           no|
| 16|Rajesh| 90000|        10|          yes|
| 17| Raman| 55000|        16|          yes|
| 18|   Sam| 65000|        17|          yes|
| 17| Raman| 55000|        16|          yes|
| 17| Raman| 59000|        16|          yes|
+---+------+------+----------+-------------+



In [40]:
df.withColumn("manager_id",when(col("manager_id").isNull(),lit("no manager"))
              .otherwise(col("manager_id")))\
  .withColumn("salary_status",when(col("salary") > 50000,"yes")
                       .when(col("salary") < 50000,"no")
                          .otherwise("no value")).show()

+---+------+------+----------+-------------+
| id|  name|salary|manager_id|salary_status|
+---+------+------+----------+-------------+
| 10|  Anil| 50000|        18|     no value|
| 11| Vikas| 75000|        16|          yes|
| 12| Nisha| 40000|        18|           no|
| 13| Nidhi| 60000|        17|          yes|
| 14| Priya| 80000|        18|          yes|
| 15| Mohit| 45000|        18|           no|
| 16|Rajesh| 90000|        10|          yes|
| 17| Raman| 55000|        16|          yes|
| 18|   Sam| 65000|        17|          yes|
| 17| Raman| 55000|        16|          yes|
| 17| Raman| 59000|        16|          yes|
+---+------+------+----------+-------------+



In [41]:
df.withColumn("comments",when(col("salary") > 50000,"high salary")
                       .when(col("salary") < 50000,"low salary")
                          .otherwise("average salary")).show()

+---+------+------+----------+--------------+
| id|  name|salary|manager_id|      comments|
+---+------+------+----------+--------------+
| 10|  Anil| 50000|        18|average salary|
| 11| Vikas| 75000|        16|   high salary|
| 12| Nisha| 40000|        18|    low salary|
| 13| Nidhi| 60000|        17|   high salary|
| 14| Priya| 80000|        18|   high salary|
| 15| Mohit| 45000|        18|    low salary|
| 16|Rajesh| 90000|        10|   high salary|
| 17| Raman| 55000|        16|   high salary|
| 18|   Sam| 65000|        17|   high salary|
| 17| Raman| 55000|        16|   high salary|
| 17| Raman| 59000|        16|   high salary|
+---+------+------+----------+--------------+



In [44]:
df.withColumn("comments",
    when((col("salary") > 50000) & (col("salary") < 60000), "mid salary")
    .otherwise("other")
).show()

+---+------+------+----------+----------+
| id|  name|salary|manager_id|  comments|
+---+------+------+----------+----------+
| 10|  Anil| 50000|        18|     other|
| 11| Vikas| 75000|        16|     other|
| 12| Nisha| 40000|        18|     other|
| 13| Nidhi| 60000|        17|     other|
| 14| Priya| 80000|        18|     other|
| 15| Mohit| 45000|        18|     other|
| 16|Rajesh| 90000|        10|     other|
| 17| Raman| 55000|        16|mid salary|
| 18|   Sam| 65000|        17|     other|
| 17| Raman| 55000|        16|mid salary|
| 17| Raman| 59000|        16|mid salary|
+---+------+------+----------+----------+



In [None]:
data1=[(19 ,'Sohan',50000, 18),
(20 ,'Sima',75000,  17)]

In [8]:
schema =['id','name','salary','manager_id']

In [9]:
df1 = spark.createDataFrame(data1,schema)

In [10]:
df1.show()

+---+-----+------+----------+
| id| name|salary|manager_id|
+---+-----+------+----------+
| 19|Sohan| 50000|        18|
| 20| Sima| 75000|        17|
+---+-----+------+----------+



In [11]:
df.show()

+---+------+------+----------+
| id|  name|salary|manager_id|
+---+------+------+----------+
| 10|  Anil| 50000|        18|
| 11| Vikas| 75000|        16|
| 12| Nisha| 40000|        18|
| 13| Nidhi| 60000|        17|
| 14| Priya| 80000|        18|
| 15| Mohit| 45000|        18|
| 16|Rajesh| 90000|        10|
| 17| Raman| 55000|        16|
| 18|   Sam| 65000|        17|
| 17| Raman| 55000|        16|
| 17| Raman| 59000|        16|
+---+------+------+----------+



In [12]:
df.union(df1).show()

+---+------+------+----------+
| id|  name|salary|manager_id|
+---+------+------+----------+
| 10|  Anil| 50000|        18|
| 11| Vikas| 75000|        16|
| 12| Nisha| 40000|        18|
| 13| Nidhi| 60000|        17|
| 14| Priya| 80000|        18|
| 15| Mohit| 45000|        18|
| 16|Rajesh| 90000|        10|
| 17| Raman| 55000|        16|
| 18|   Sam| 65000|        17|
| 17| Raman| 55000|        16|
| 17| Raman| 59000|        16|
| 19| Sohan| 50000|        18|
| 20|  Sima| 75000|        17|
+---+------+------+----------+



In [13]:
df.union(df1).count()

13

In [14]:
df.unionAll(df1).show() 

+---+------+------+----------+
| id|  name|salary|manager_id|
+---+------+------+----------+
| 10|  Anil| 50000|        18|
| 11| Vikas| 75000|        16|
| 12| Nisha| 40000|        18|
| 13| Nidhi| 60000|        17|
| 14| Priya| 80000|        18|
| 15| Mohit| 45000|        18|
| 16|Rajesh| 90000|        10|
| 17| Raman| 55000|        16|
| 18|   Sam| 65000|        17|
| 17| Raman| 55000|        16|
| 17| Raman| 59000|        16|
| 19| Sohan| 50000|        18|
| 20|  Sima| 75000|        17|
+---+------+------+----------+



In [15]:
df.union(df1).distinct().count()

12

In [16]:
df.unionAll(df1).count()

13

In [48]:
spark.stop()