In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
# Obtain the notebook's SparkSession through the builder; the application
# name is set to 'Test'.
spark = (
    SparkSession.builder
    .appName('Test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/19 07:34:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Sample rows, one tuple per employee. Positional fields follow the schema used
# when the DataFrame is created:
#   (id, first_name, last_name, salary, bonus, nationality, phone_number, ssn)
# Note that the bonus slot holds an int, None, or an empty string depending on
# the row.
employees = [
    (1, "Scott", "Tiger", 1000.0, 10, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, '', "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, 10, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [5]:
# Build the DataFrame from the in-memory rows using a DDL-style schema string.
# The schema declares salary as FLOAT and bonus as STRING.
employeesDF = spark.createDataFrame(
    data=employees,
    schema="""id INT,first_name STRING,last_name STRING,
                                   salary FLOAT,bonus STRING,nationality STRING,
                                   phone_number STRING,ssn STRING""",
)
            

In [6]:
# Print the rows with truncation disabled so long string values are shown in full.
employeesDF.show(truncate=False)

                                                                                

+---+----------+---------+------+-----+--------------+----------------+-----------+
|id |first_name|last_name|salary|bonus|nationality   |phone_number    |ssn        |
+---+----------+---------+------+-----+--------------+----------------+-----------+
|1  |Scott     |Tiger    |1000.0|10   |united states |+1 123 456 7890 |123 45 6789|
|2  |Henry     |Ford     |1250.0|null |India         |+91 234 567 8901|456 78 9123|
|3  |Nick      |Junior   |750.0 |     |united KINGDOM|+44 111 111 1111|222 33 4444|
|4  |Bill      |Gomes    |1500.0|10   |AUSTRALIA     |+61 987 654 3210|789 12 6118|
+---+----------+---------+------+-----+--------------+----------------+-----------+



In [7]:
# Print the column names, types and nullability of employeesDF.
employeesDF.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- bonus: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- ssn: string (nullable = true)



In [8]:
# Demonstration of a common mistake: coalesce() expects column names or Column
# objects, so a bare Python literal such as 0 is rejected with a TypeError
# (the message suggests wrapping literals with lit()).
# The error is caught and printed so the notebook still runs top to bottom
# under "Restart & Run All" instead of stopping at this cell.
try:
    (
        employeesDF
        .withColumn('bonus', coalesce('bonus', 0))
        .show()
    )
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: Invalid argument, not a string or column: 0 of type <class 'int'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [9]:
# Wrap the default in lit() so coalesce() receives a Column. As the output
# shows, only the null bonus is replaced by 0; the empty-string bonus is kept
# as-is in bonus1.
(
    employeesDF
    .withColumn('bonus1', coalesce('bonus', lit(0)))
    .show()
)

+---+----------+---------+------+-----+--------------+----------------+-----------+------+
| id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+---+----------+---------+------+-----+--------------+----------------+-----------+------+
|  1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|    10|
|  2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|     0|
|  3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|      |
|  4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|    10|
+---+----------+---------+------+-----+--------------+----------------+-----------+------+



In [14]:
# Cast the string bonus to int. As the output shows, both the null and the
# empty-string bonus come out as null in bonus1.
(
    employeesDF
    .withColumn('bonus1', col('bonus').cast('int'))
    .show()
)

+---+----------+---------+------+-----+--------------+----------------+-----------+------+
| id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+---+----------+---------+------+-----+--------------+----------------+-----------+------+
|  1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|    10|
|  2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|  null|
|  3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|  null|
|  4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|    10|
+---+----------+---------+------+-----+--------------+----------------+-----------+------+



In [15]:
# Cast first, then coalesce: the cast yields null for the null and empty-string
# bonuses, and coalesce() replaces those nulls with 0.
(
    employeesDF
    .withColumn(
        'bonus1',
        coalesce(col('bonus').cast('int'), lit(0)),
    )
    .show()
)

+---+----------+---------+------+-----+--------------+----------------+-----------+------+
| id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+---+----------+---------+------+-----+--------------+----------------+-----------+------+
|  1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789|    10|
|  2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123|     0|
|  3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|     0|
|  4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118|    10|
+---+----------+---------+------+-----+--------------+----------------+-----------+------+



In [16]:
# payment = salary + salary * bonus / 100, where bonus is cast to int and a
# null result of the cast is replaced with 0.
(
    employeesDF
    .withColumn(
        'payment',
        col('salary')
        + (col('salary') * coalesce(col('bonus').cast('int'), lit(0)) / 100),
    )
    .show()
)

+---+----------+---------+------+-----+--------------+----------------+-----------+-------+
| id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|payment|
+---+----------+---------+------+-----+--------------+----------------+-----------+-------+
|  1|     Scott|    Tiger|1000.0|   10| united states| +1 123 456 7890|123 45 6789| 1100.0|
|  2|     Henry|     Ford|1250.0| null|         India|+91 234 567 8901|456 78 9123| 1250.0|
|  3|      Nick|   Junior| 750.0|     |united KINGDOM|+44 111 111 1111|222 33 4444|  750.0|
|  4|      Bill|    Gomes|1500.0|   10|     AUSTRALIA|+61 987 654 3210|789 12 6118| 1650.0|
+---+----------+---------+------+-----+--------------+----------------+-----------+-------+

