In [0]:
from pyspark.sql.functions import *

In [0]:
employees = [
    (1, None, "Tiger", None,30,"united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0,None, "india", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior",500.0,"", "united kingdom", "+44 111 111 111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0,150, "australia", "+61 987 654 3210", "789 12 6113"),
]

In [0]:
empdf = spark.createDataFrame(employees, schema = """employee_id INT, first_name STRING,
                              last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                              phone_number STRING, ssn STRING                              
                               """)

In [0]:
empdf.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [0]:
help(coalesce)

Help on function coalesce in module pyspark.sql.functions:

coalesce(*cols: 'ColumnOrName') -> pyspark.sql.column.Column
    Returns the first column that is not null.
    
    .. versionadded:: 1.4.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    cols : :class:`~pyspark.sql.Column` or str
        list of columns to work on.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        value of the first column that is not null.
    
    Examples
    --------
    >>> cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
    >>> cDf.show()
    +----+----+
    |   a|   b|
    +----+----+
    |NULL|NULL|
    |   1|NULL|
    |NULL|   2|
    +----+----+
    
    >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
    +--------------+
    |coalesce(a, b)|
    +--------------+
    |          NULL|
    |             1|
    |             2|
    +--------------+
    
    >>> cDf.select('*', coalesce(cDf["a

In [0]:
# Fails because 0 is not passed as column object. But using lit it will look for a column with name 0
empdf. \
    withColumn('bonus1', coalesce('bonus', 0)). \
    show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPySparkTypeError[0m                          Traceback (most recent call last)
File [0;32m<command-1590524349081141>, line 3[0m
[1;32m      1[0m [38;5;66;03m# Fails because 0 is not passed as column object. But using lit it will look for a column with name 0[39;00m
[1;32m      2[0m empdf[38;5;241m.[39m \
[0;32m----> 3[0m     withColumn([38;5;124m'[39m[38;5;124mbonus1[39m[38;5;124m'[39m, [43mcoalesce[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mbonus[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[38;5;241;43m0[39;49m[43m)[49m)[38;5;241m.[39m \
[1;32m      4[0m     show()

File [0;32m/databricks/spark/python/pyspark/sql/utils.py:162[0m, in [0;36mtry_remote_functions.<locals>.wrapped[0;34m(*args, **kwargs)[0m
[1;32m    160[0m     [38;5;28;01mreturn[39;00m [38;5;28mgetattr[39m(functions, f[38;5;241m.[39m[38;5;18m__name__[39m)([38;5;241m*[39

In [0]:
empdf. \
    withColumn('bonus1', coalesce('bonus', lit(0))). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|    30|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|     0|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|      |
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|   150|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [0]:
empdf. \
    withColumn('bonus1', col('bonus').cast('int')). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|    30|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|  NULL|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|  NULL|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|   150|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [0]:
empdf. \
    withColumn('bonus1', coalesce(col('bonus').cast('int'), lit(0))). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|    30|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|     0|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|     0|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|   150|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [0]:
empdf. \
    withColumn('bonus1', expr("nvl(bonus,0)")). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|    30|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|     0|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|      |
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|   150|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [0]:
empdf. \
    withColumn('bonus1', expr("nvl(nullif(bonus,''),0)")). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|bonus1|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|    30|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|     0|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|     0|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|   150|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+------+



In [0]:
empdf. \
    withColumn('payment', col('salary') + ( col('salary') * coalesce(col('bonus'), lit(0)) / 100 )). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|payment|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|   NULL|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123| 1250.0|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|   NULL|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113| 3750.0|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+



In [0]:
empdf. \
    withColumn('payment', col('salary') + ( col('salary') * coalesce(col('bonus').cast('int'), lit(0)) / 100 )). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|payment|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|   NULL|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123| 1250.0|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|  500.0|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113| 3750.0|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+



In [0]:
empdf. \
    withColumn('payment', coalesce(col('salary'),lit(0)) + ( coalesce(col('salary'),lit(0)) * coalesce(col('bonus').cast('int'), lit(0)) / 100 )). \
    show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|payment|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+
|          1|      NULL|    Tiger|  NULL|   30| united states| +1 123 456 7890|123 45 6789|    0.0|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123| 1250.0|
|          3|      Nick|   Junior| 500.0|     |united kingdom| +44 111 111 111|222 33 4444|  500.0|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113| 3750.0|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+-------+



In [0]:
employees = [
    (1, "Scott", None, 1000.0,30,"united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, None, "india", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", None, None,"", "united kingdom", "+44 111 111 111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0,150, "australia", "+61 987 654 3210", "789 12 6113"),
]

In [0]:
empdf = spark.createDataFrame(employees, schema = """employee_id INT, first_name STRING,
                              last_name STRING, salary FLOAT, bonus STRING, nationality STRING,
                              phone_number STRING, ssn STRING                              
                               """)

In [0]:
empdf.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|     NULL|1000.0|   30| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|
|          3|      Nick|     NULL|  NULL|     |united kingdom| +44 111 111 111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [0]:
help(empdf.na)

Help on DataFrameNaFunctions in module pyspark.sql.dataframe object:

class DataFrameNaFunctions(builtins.object)
 |  DataFrameNaFunctions(df: pyspark.sql.dataframe.DataFrame)
 |  
 |  Functionality for working with missing data in :class:`DataFrame`.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  .. versionchanged:: 3.4.0
 |      Supports Spark Connect.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, df: pyspark.sql.dataframe.DataFrame)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  drop(self, how: str = 'any', thresh: Optional[int] = None, subset: Union[str, Tuple[str, ...], List[str], NoneType] = None) -> pyspark.sql.dataframe.DataFrame
 |      Returns a new :class:`DataFrame` omitting rows with null values.
 |      :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
 |      
 |      .. versionadded:: 1.3.1
 |      
 |      .. versionchanged:: 3.4.0
 |          Supports Spark Connect.
 |      
 |      Parameter

In [0]:
help(empdf.na.fill)

Help on method fill in module pyspark.sql.dataframe:

fill(value: Union[ForwardRef('LiteralType'), Dict[str, ForwardRef('LiteralType')]], subset: Optional[List[str]] = None) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.dataframe.DataFrameNaFunctions instance
    Replace null values, alias for ``na.fill()``.
    :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.
    
    .. versionadded:: 1.3.1
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    value : int, float, string, bool or dict
        Value to replace null values with.
        If the value is a dict, then `subset` is ignored and `value` must be a mapping
        from column name (string) to replacement value. The replacement value must be
        an int, float, boolean, or string.
    subset : str, tuple or list, optional
        optional list of column names to consider.
        Columns specified in subset that do not have 

In [0]:
help(empdf.fillna)

Help on method fillna in module pyspark.sql.dataframe:

fillna(value: Union[ForwardRef('LiteralType'), Dict[str, ForwardRef('LiteralType')]], subset: Union[str, Tuple[str, ...], List[str], NoneType] = None) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Replace null values, alias for ``na.fill()``.
    :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.
    
    .. versionadded:: 1.3.1
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    value : int, float, string, bool or dict
        Value to replace null values with.
        If the value is a dict, then `subset` is ignored and `value` must be a mapping
        from column name (string) to replacement value. The replacement value must be
        an int, float, boolean, or string.
    subset : str, tuple or list, optional
        optional list of column names to consider.
        Columns specified in subset that do not hav

In [0]:
empdf.show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|     NULL|1000.0|   30| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|
|          3|      Nick|     NULL|  NULL|     |united kingdom| +44 111 111 111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [0]:
empdf.fillna(0.0).show() # since 0.0 is float it fills with that value only in float columns

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|     NULL|1000.0|   30| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|
|          3|      Nick|     NULL|   0.0|     |united kingdom| +44 111 111 111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [0]:
empdf.fillna("na").show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|       na|1000.0|   30| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|   na|         india|+91 234 567 8901|456 78 9123|
|          3|      Nick|       na|  NULL|     |united kingdom| +44 111 111 111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [0]:
empdf.fillna(0.0).fillna("na").show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|       na|1000.0|   30| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|   na|         india|+91 234 567 8901|456 78 9123|
|          3|      Nick|       na|   0.0|     |united kingdom| +44 111 111 111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+



In [0]:
empdf.fillna(0.0, 'salary').fillna("na", 'last_name').show()

+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|bonus|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+
|          1|     Scott|       na|1000.0|   30| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0| NULL|         india|+91 234 567 8901|456 78 9123|
|          3|      Nick|       na|   0.0|     |united kingdom| +44 111 111 111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|  150|     australia|+61 987 654 3210|789 12 6113|
+-----------+----------+---------+------+-----+--------------+----------------+-----------+

