### Spark All Functions
Reference: https://github.com/DeltaOptimist/Spark-SQL-All-Functions-PySpark/blob/main/Spark_SQL_Functions_all_doc.ipynb

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
# NOTE: the wildcard import supplies the names that later cells use without
# importing them explicitly (lit, year, current_timestamp, ...). It also brings
# in Spark functions such as sum, max, min, round, abs and pow, which replace
# the Python built-ins of the same name in this notebook's namespace.
from pyspark.sql.functions import *
from datetime import datetime

In [None]:
# Obtain the SparkSession for this notebook (getOrCreate reuses an existing
# session if there is one) and name the application "SQL Functions".
spark = (
    SparkSession.builder
    .appName("SQL Functions")
    .getOrCreate()
)
# Bare expression: displays the session summary as the cell output.
spark

In [None]:
# Sample records, one tuple per person: (id, name, dob, age, salary).
# dob is kept as a "yyyy-MM-dd" string; salary is a float.
data = [
    (1, "John Doe", "2024-08-01", 23, 1000.50),
    (2, "Jane Smith", "2024-08-02", 34, 2000.75),
    (3, "Jake White", "2024-08-03", 18, 3000.10),
    (4, "Jill Black", "2024-08-04", 45, 4000.25),
    (5, "James Brown", "2024-08-05", 29, 1500.30),
    (6, "James Bond", "2024-08-06", 31, 2500.45),
]

In [None]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["id", "name", "dob", "age", "salary"]

# Build the base DataFrame that every example below starts from.
df = spark.createDataFrame(data, schema=columns)

In [None]:
# Print the rows of the freshly created DataFrame.
df.show()

+---+-----------+----------+---+-------+
| id|       name|       dob|age| salary|
+---+-----------+----------+---+-------+
|  1|   John Doe|2024-08-01| 23| 1000.5|
|  2| Jane Smith|2024-08-02| 34|2000.75|
|  3| Jake White|2024-08-03| 18| 3000.1|
|  4| Jill Black|2024-08-04| 45|4000.25|
|  5|James Brown|2024-08-05| 29| 1500.3|
|  6| James Bond|2024-08-06| 31|2500.45|
+---+-----------+----------+---+-------+



In [None]:
# 1. Project a single column: keep only "name" and print it.
from pyspark.sql.functions import col

df.select(
    col("name"),
).show()

+-----------+
|       name|
+-----------+
|   John Doe|
| Jane Smith|
| Jake White|
| Jill Black|
|James Brown|
| James Bond|
+-----------+



In [None]:
# 2. Add a column holding a literal value: lit() wraps the string "USA" as a
#    Column, so every row gets the same "country".
df_country = df.withColumn(
    "country",
    lit("USA"),
)
df_country.show()

+---+-----------+----------+---+-------+-------+
| id|       name|       dob|age| salary|country|
+---+-----------+----------+---+-------+-------+
|  1|   John Doe|2024-08-01| 23| 1000.5|    USA|
|  2| Jane Smith|2024-08-02| 34|2000.75|    USA|
|  3| Jake White|2024-08-03| 18| 3000.1|    USA|
|  4| Jill Black|2024-08-04| 45|4000.25|    USA|
|  5|James Brown|2024-08-05| 29| 1500.3|    USA|
|  6| James Bond|2024-08-06| 31|2500.45|    USA|
+---+-----------+----------+---+-------+-------+



In [None]:
# Show df again: withColumn() returned a new DataFrame (df_country), so df
# itself still has no "country" column.
df.show()

+---+-----------+----------+---+-------+
| id|       name|       dob|age| salary|
+---+-----------+----------+---+-------+
|  1|   John Doe|2024-08-01| 23| 1000.5|
|  2| Jane Smith|2024-08-02| 34|2000.75|
|  3| Jake White|2024-08-03| 18| 3000.1|
|  4| Jill Black|2024-08-04| 45|4000.25|
|  5|James Brown|2024-08-05| 29| 1500.3|
|  6| James Bond|2024-08-06| 31|2500.45|
+---+-----------+----------+---+-------+



In [None]:
# 3. Add 5 to the age of every record, written as a SQL expression string.
from pyspark.sql.functions import expr

df_age = df.withColumn(
    "added_age",
    expr("age+5"),
)
df_age.show()

+---+-----------+----------+---+-------+---------+
| id|       name|       dob|age| salary|added_age|
+---+-----------+----------+---+-------+---------+
|  1|   John Doe|2024-08-01| 23| 1000.5|       28|
|  2| Jane Smith|2024-08-02| 34|2000.75|       39|
|  3| Jake White|2024-08-03| 18| 3000.1|       23|
|  4| Jill Black|2024-08-04| 45|4000.25|       50|
|  5|James Brown|2024-08-05| 29| 1500.3|       34|
|  6| James Bond|2024-08-06| 31|2500.45|       36|
+---+-----------+----------+---+-------+---------+



In [None]:
# 4. Classify people by age. The conditions are evaluated in order:
#    age <= 18 -> "Minor", else age <= 30 -> "Adult", everything else -> "Middle Age".
from pyspark.sql.functions import when

df_class = df.withColumn(
    "age category",
    when(col("age") <= 18, "Minor")
    .when(col("age") <= 30, "Adult")
    .otherwise("Middle Age"),
)
df_class.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|age category|
+---+-----------+----------+---+-------+------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|       Adult|
|  2| Jane Smith|2024-08-02| 34|2000.75|  Middle Age|
|  3| Jake White|2024-08-03| 18| 3000.1|       Minor|
|  4| Jill Black|2024-08-04| 45|4000.25|  Middle Age|
|  5|James Brown|2024-08-05| 29| 1500.3|       Adult|
|  6| James Bond|2024-08-06| 31|2500.45|  Middle Age|
+---+-----------+----------+---+-------+------------+



In [None]:
# 5. concat(): prefix every name with the literal "Respected ".
from pyspark.sql.functions import concat

df_name = df.withColumn(
    "Salutation",
    concat(lit("Respected "), col("name")),
)
# show() shortens long cell values by default, hence "Respected James B..." in the output.
df_name.show()

+---+-----------+----------+---+-------+--------------------+
| id|       name|       dob|age| salary|          Salutation|
+---+-----------+----------+---+-------+--------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|  Respected John Doe|
|  2| Jane Smith|2024-08-02| 34|2000.75|Respected Jane Smith|
|  3| Jake White|2024-08-03| 18| 3000.1|Respected Jake White|
|  4| Jill Black|2024-08-04| 45|4000.25|Respected Jill Black|
|  5|James Brown|2024-08-05| 29| 1500.3|Respected James B...|
|  6| James Bond|2024-08-06| 31|2500.45|Respected James Bond|
+---+-----------+----------+---+-------+--------------------+



In [None]:
# 6. substring(): take 3 characters of name starting at position 1
#    (positions are 1-based: "John Doe" -> "Joh").
from pyspark.sql.functions import substring

df_subs = df.withColumn(
    "First_Three_Chars",
    substring(col("name"), 1, 3),
)
df_subs.show()

+---+-----------+----------+---+-------+-----------------+
| id|       name|       dob|age| salary|First_Three_Chars|
+---+-----------+----------+---+-------+-----------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|              Joh|
|  2| Jane Smith|2024-08-02| 34|2000.75|              Jan|
|  3| Jake White|2024-08-03| 18| 3000.1|              Jak|
|  4| Jill Black|2024-08-04| 45|4000.25|              Jil|
|  5|James Brown|2024-08-05| 29| 1500.3|              Jam|
|  6| James Bond|2024-08-06| 31|2500.45|              Jam|
+---+-----------+----------+---+-------+-----------------+



In [None]:
# 7. split(): break name on the space character into an array of parts,
#    e.g. "John Doe" -> [John, Doe].
from pyspark.sql.functions import split

df_split = df.withColumn(
    "name_array",
    split(col("name"), " "),
)
df_split.show()

+---+-----------+----------+---+-------+--------------+
| id|       name|       dob|age| salary|    name_array|
+---+-----------+----------+---+-------+--------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|   [John, Doe]|
|  2| Jane Smith|2024-08-02| 34|2000.75| [Jane, Smith]|
|  3| Jake White|2024-08-03| 18| 3000.1| [Jake, White]|
|  4| Jill Black|2024-08-04| 45|4000.25| [Jill, Black]|
|  5|James Brown|2024-08-05| 29| 1500.3|[James, Brown]|
|  6| James Bond|2024-08-06| 31|2500.45| [James, Bond]|
+---+-----------+----------+---+-------+--------------+



In [None]:
# 8. regexp_replace(): replace text matching the pattern "John" with "Jon".
#    Names without a match are left unchanged.
from pyspark.sql.functions import regexp_replace

df_replace = df.withColumn(
    "replaced_name",
    regexp_replace(col("name"), "John", "Jon"),
)
df_replace.show()

+---+-----------+----------+---+-------+-------------+
| id|       name|       dob|age| salary|replaced_name|
+---+-----------+----------+---+-------+-------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|      Jon Doe|
|  2| Jane Smith|2024-08-02| 34|2000.75|   Jane Smith|
|  3| Jake White|2024-08-03| 18| 3000.1|   Jake White|
|  4| Jill Black|2024-08-04| 45|4000.25|   Jill Black|
|  5|James Brown|2024-08-05| 29| 1500.3|  James Brown|
|  6| James Bond|2024-08-06| 31|2500.45|   James Bond|
+---+-----------+----------+---+-------+-------------+



In [None]:
# 9. Count the number of records: aggregate over the whole DataFrame with count("*").
from pyspark.sql.functions import count

df_count = df.agg(
    count("*").alias("total_count"),
)
df_count.show()

+-----------+
|total_count|
+-----------+
|          6|
+-----------+



In [None]:
# 10. sum(): total of the salary column.
# NOTE: this import binds the name `sum` to the Spark aggregate, so Python's
# built-in sum() is no longer reachable under that name in this notebook.
from pyspark.sql.functions import sum

df_sum = df.agg(
    sum("salary").alias("total_salary"),
)
df_sum.show()

+------------+
|total_salary|
+------------+
|    14002.35|
+------------+



In [None]:
# 11. avg(): mean of the age column.
from pyspark.sql.functions import avg

df_avg = df.agg(
    avg("age").alias("Avg_age"),
)
df_avg.show()

+-------+
|Avg_age|
+-------+
|   30.0|
+-------+



In [None]:
# 12. max(): highest salary.
# NOTE: this import binds the name `max` to the Spark aggregate, replacing the
# Python built-in of the same name in this notebook.
from pyspark.sql.functions import max

df_max = df.agg(
    max("salary").alias("Max Salary"),
)
df_max.show()

+----------+
|Max Salary|
+----------+
|   4000.25|
+----------+



In [None]:
# 13. min(): lowest salary.
# NOTE: this import binds the name `min` to the Spark aggregate, replacing the
# Python built-in of the same name in this notebook.
from pyspark.sql.functions import min

df_min = df.agg(
    min("salary").alias("Min Salary"),
)
df_min.show()

+----------+
|Min Salary|
+----------+
|    1000.5|
+----------+



In [None]:
# 14. round(): round salary to 0 decimal places. The result is still a
#     floating-point column (1000.5 -> 1001.0).
# NOTE: this import binds the name `round` to the Spark function, replacing the
# Python built-in of the same name in this notebook.
from pyspark.sql.functions import round

df_round = df.withColumn(
    "rounded_salary",
    round(col("salary"), 0),
)
df_round.show()

+---+-----------+----------+---+-------+--------------+
| id|       name|       dob|age| salary|rounded_salary|
+---+-----------+----------+---+-------+--------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|        1001.0|
|  2| Jane Smith|2024-08-02| 34|2000.75|        2001.0|
|  3| Jake White|2024-08-03| 18| 3000.1|        3000.0|
|  4| Jill Black|2024-08-04| 45|4000.25|        4000.0|
|  5|James Brown|2024-08-05| 29| 1500.3|        1500.0|
|  6| James Bond|2024-08-06| 31|2500.45|        2500.0|
+---+-----------+----------+---+-------+--------------+



In [None]:
# 15. date_format(): render dob (stored as a "yyyy-MM-dd" string) using the
#     pattern MM/dd/yyyy, e.g. 2024-08-01 -> 08/01/2024.
from pyspark.sql.functions import date_format

df_dateformat = df.withColumn(
    "formatted_date",
    date_format(col("dob"), "MM/dd/yyyy"),
)
df_dateformat.show()

+---+-----------+----------+---+-------+--------------+
| id|       name|       dob|age| salary|formatted_date|
+---+-----------+----------+---+-------+--------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|    08/01/2024|
|  2| Jane Smith|2024-08-02| 34|2000.75|    08/02/2024|
|  3| Jake White|2024-08-03| 18| 3000.1|    08/03/2024|
|  4| Jill Black|2024-08-04| 45|4000.25|    08/04/2024|
|  5|James Brown|2024-08-05| 29| 1500.3|    08/05/2024|
|  6| James Bond|2024-08-06| 31|2500.45|    08/06/2024|
+---+-----------+----------+---+-------+--------------+



In [None]:
# 16. current_date(): add today's date to every row. The value depends on when
#     the cell is run, so the captured output will differ on a re-run.
from pyspark.sql.functions import current_date

df_current_date = df.withColumn(
    "current_date",
    current_date(),
)
df_current_date.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|current_date|
+---+-----------+----------+---+-------+------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|  2025-08-09|
|  2| Jane Smith|2024-08-02| 34|2000.75|  2025-08-09|
|  3| Jake White|2024-08-03| 18| 3000.1|  2025-08-09|
|  4| Jill Black|2024-08-04| 45|4000.25|  2025-08-09|
|  5|James Brown|2024-08-05| 29| 1500.3|  2025-08-09|
|  6| James Bond|2024-08-06| 31|2500.45|  2025-08-09|
+---+-----------+----------+---+-------+------------+



In [None]:
# 17. current_timestamp(): add the current timestamp to every row (all rows
#     receive the same value, as the output shows).
df_current_timestamp = df.withColumn(
    "current_timestamp",
    current_timestamp(),
)
# truncate=False prints the full timestamp instead of shortening it.
df_current_timestamp.show(truncate=False)

+---+-----------+----------+---+-------+--------------------------+
|id |name       |dob       |age|salary |current_timestamp         |
+---+-----------+----------+---+-------+--------------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |2025-08-09 07:54:13.813148|
|2  |Jane Smith |2024-08-02|34 |2000.75|2025-08-09 07:54:13.813148|
|3  |Jake White |2024-08-03|18 |3000.1 |2025-08-09 07:54:13.813148|
|4  |Jill Black |2024-08-04|45 |4000.25|2025-08-09 07:54:13.813148|
|5  |James Brown|2024-08-05|29 |1500.3 |2025-08-09 07:54:13.813148|
|6  |James Bond |2024-08-06|31 |2500.45|2025-08-09 07:54:13.813148|
+---+-----------+----------+---+-------+--------------------------+



In [None]:
# 18. year(): extract the year component of dob.
df_year = df.withColumn(
    "year",
    year(col("dob")),
)
df_year.show()

+---+-----------+----------+---+-------+----+
| id|       name|       dob|age| salary|year|
+---+-----------+----------+---+-------+----+
|  1|   John Doe|2024-08-01| 23| 1000.5|2024|
|  2| Jane Smith|2024-08-02| 34|2000.75|2024|
|  3| Jake White|2024-08-03| 18| 3000.1|2024|
|  4| Jill Black|2024-08-04| 45|4000.25|2024|
|  5|James Brown|2024-08-05| 29| 1500.3|2024|
|  6| James Bond|2024-08-06| 31|2500.45|2024|
+---+-----------+----------+---+-------+----+



In [None]:
# 19. date_add(): the date 10 days after dob.
from pyspark.sql.functions import date_add

df_date_add = df.withColumn(
    "date_after_10_days",
    date_add(col("dob"), 10),
)
df_date_add.show()

+---+-----------+----------+---+-------+------------------+
| id|       name|       dob|age| salary|date_after_10_days|
+---+-----------+----------+---+-------+------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|        2024-08-11|
|  2| Jane Smith|2024-08-02| 34|2000.75|        2024-08-12|
|  3| Jake White|2024-08-03| 18| 3000.1|        2024-08-13|
|  4| Jill Black|2024-08-04| 45|4000.25|        2024-08-14|
|  5|James Brown|2024-08-05| 29| 1500.3|        2024-08-15|
|  6| James Bond|2024-08-06| 31|2500.45|        2024-08-16|
+---+-----------+----------+---+-------+------------------+



In [None]:
# 20. date_sub(): the date 10 days before dob.
df_date_sub = df.withColumn(
    "date_before_10_days",
    date_sub(col("dob"), 10),
)
df_date_sub.show()

+---+-----------+----------+---+-------+-------------------+
| id|       name|       dob|age| salary|date_before_10_days|
+---+-----------+----------+---+-------+-------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|         2024-07-22|
|  2| Jane Smith|2024-08-02| 34|2000.75|         2024-07-23|
|  3| Jake White|2024-08-03| 18| 3000.1|         2024-07-24|
|  4| Jill Black|2024-08-04| 45|4000.25|         2024-07-25|
|  5|James Brown|2024-08-05| 29| 1500.3|         2024-07-26|
|  6| James Bond|2024-08-06| 31|2500.45|         2024-07-27|
+---+-----------+----------+---+-------+-------------------+



In [None]:
# 21. date_diff(): number of days from dob to the current date.
#     The end date comes first and the start date second, so the result is
#     positive for dates in the past. The value depends on the run date.
df_date_diff = df.withColumn(
    "days_since_dob",
    date_diff(current_date(), col("dob")),
)
df_date_diff.show()

+---+-----------+----------+---+-------+--------------+
| id|       name|       dob|age| salary|days_since_dob|
+---+-----------+----------+---+-------+--------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|           373|
|  2| Jane Smith|2024-08-02| 34|2000.75|           372|
|  3| Jake White|2024-08-03| 18| 3000.1|           371|
|  4| Jill Black|2024-08-04| 45|4000.25|           370|
|  5|James Brown|2024-08-05| 29| 1500.3|           369|
|  6| James Bond|2024-08-06| 31|2500.45|           368|
+---+-----------+----------+---+-------+--------------+



In [None]:
# 22. to_date(): parse the string dob into a date using the pattern yyyy-MM-dd.
df_to_date = df.withColumn(
    "dob_as_date",
    to_date(col("dob"), "yyyy-MM-dd"),
)
df_to_date.show()

+---+-----------+----------+---+-------+-----------+
| id|       name|       dob|age| salary|dob_as_date|
+---+-----------+----------+---+-------+-----------+
|  1|   John Doe|2024-08-01| 23| 1000.5| 2024-08-01|
|  2| Jane Smith|2024-08-02| 34|2000.75| 2024-08-02|
|  3| Jake White|2024-08-03| 18| 3000.1| 2024-08-03|
|  4| Jill Black|2024-08-04| 45|4000.25| 2024-08-04|
|  5|James Brown|2024-08-05| 29| 1500.3| 2024-08-05|
|  6| James Bond|2024-08-06| 31|2500.45| 2024-08-06|
+---+-----------+----------+---+-------+-----------+



In [None]:
# 23. to_timestamp(): parse the string dob into a timestamp using the pattern
#     yyyy-MM-dd. The time part comes out as 00:00:00.
df_to_timestamp = df.withColumn(
    "asTimestamp",
    to_timestamp(col("dob"), "yyyy-MM-dd"),
)
df_to_timestamp.show(truncate=False)

+---+-----------+----------+---+-------+-------------------+
|id |name       |dob       |age|salary |asTimestamp        |
+---+-----------+----------+---+-------+-------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |2024-08-01 00:00:00|
|2  |Jane Smith |2024-08-02|34 |2000.75|2024-08-02 00:00:00|
|3  |Jake White |2024-08-03|18 |3000.1 |2024-08-03 00:00:00|
|4  |Jill Black |2024-08-04|45 |4000.25|2024-08-04 00:00:00|
|5  |James Brown|2024-08-05|29 |1500.3 |2024-08-05 00:00:00|
|6  |James Bond |2024-08-06|31 |2500.45|2024-08-06 00:00:00|
+---+-----------+----------+---+-------+-------------------+



In [None]:
# 24. window(): assign each row to a 1-day time bucket based on dob.
#     Only a window duration is passed (no slide duration), so the buckets do
#     not overlap: each row gets exactly one {start, end} struct, as the output
#     shows. This is a tumbling window rather than a sliding one.
#     The new column only holds the bucket; nothing is aggregated in this cell.
df_window = df.withColumn(
    "Agg",
    window(col("dob"), "1 day"),
)
df_window.show(truncate=False)

+---+-----------+----------+---+-------+------------------------------------------+
|id |name       |dob       |age|salary |Agg                                       |
+---+-----------+----------+---+-------+------------------------------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |{2024-08-01 00:00:00, 2024-08-02 00:00:00}|
|2  |Jane Smith |2024-08-02|34 |2000.75|{2024-08-02 00:00:00, 2024-08-03 00:00:00}|
|3  |Jake White |2024-08-03|18 |3000.1 |{2024-08-03 00:00:00, 2024-08-04 00:00:00}|
|4  |Jill Black |2024-08-04|45 |4000.25|{2024-08-04 00:00:00, 2024-08-05 00:00:00}|
|5  |James Brown|2024-08-05|29 |1500.3 |{2024-08-05 00:00:00, 2024-08-06 00:00:00}|
|6  |James Bond |2024-08-06|31 |2500.45|{2024-08-06 00:00:00, 2024-08-07 00:00:00}|
+---+-----------+----------+---+-------+------------------------------------------+



In [None]:
# 25. rank(), dense_rank() and row_number() over the salary column.
#     The window orders rows by salary, highest first. No partitionBy is given,
#     so the ordering spans the whole DataFrame.
#     All salaries in the sample are distinct, so the three functions return
#     the same numbers here.
from pyspark.sql.window import *

windowSpec = Window.orderBy(col("salary").desc())
(
    df.withColumn("rank", rank().over(windowSpec))
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .withColumn("row_number", row_number().over(windowSpec))
    .show()
)

+---+-----------+----------+---+-------+----+----------+----------+
| id|       name|       dob|age| salary|rank|dense_rank|row_number|
+---+-----------+----------+---+-------+----+----------+----------+
|  4| Jill Black|2024-08-04| 45|4000.25|   1|         1|         1|
|  3| Jake White|2024-08-03| 18| 3000.1|   2|         2|         2|
|  6| James Bond|2024-08-06| 31|2500.45|   3|         3|         3|
|  2| Jane Smith|2024-08-02| 34|2000.75|   4|         4|         4|
|  5|James Brown|2024-08-05| 29| 1500.3|   5|         5|         5|
|  1|   John Doe|2024-08-01| 23| 1000.5|   6|         6|         6|
+---+-----------+----------+---+-------+----+----------+----------+



In [None]:
# 26. array(): build an array column from id and age, e.g. [1, 23].
df_array = df.withColumn(
    "array_column",
    array(col("id"), col("age")),
)
df_array.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|array_column|
+---+-----------+----------+---+-------+------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|     [1, 23]|
|  2| Jane Smith|2024-08-02| 34|2000.75|     [2, 34]|
|  3| Jake White|2024-08-03| 18| 3000.1|     [3, 18]|
|  4| Jill Black|2024-08-04| 45|4000.25|     [4, 45]|
|  5|James Brown|2024-08-05| 29| 1500.3|     [5, 29]|
|  6| James Bond|2024-08-06| 31|2500.45|     [6, 31]|
+---+-----------+----------+---+-------+------------+



In [None]:
# 27. array_contains(): check whether an array column contains a value.
#     Here a one-element array is built from id and tested for the value 1,
#     so the result is true only for the row with id 1.
df_array_contains = df.withColumn(
    "contains_id_1",
    array_contains(array(col("id")), 1),
)
df_array_contains.show()

+---+-----------+----------+---+-------+-------------+
| id|       name|       dob|age| salary|contains_id_1|
+---+-----------+----------+---+-------+-------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|         true|
|  2| Jane Smith|2024-08-02| 34|2000.75|        false|
|  3| Jake White|2024-08-03| 18| 3000.1|        false|
|  4| Jill Black|2024-08-04| 45|4000.25|        false|
|  5|James Brown|2024-08-05| 29| 1500.3|        false|
|  6| James Bond|2024-08-06| 31|2500.45|        false|
+---+-----------+----------+---+-------+-------------+



In [None]:
# 28. explode(): emit one row per element of an array (or map) column.
#     The array [id, age] has two elements, so every person appears twice.
df_explode = df.withColumn(
    "exploded_Array",
    explode(array(col("id"), col("age"))),
)
df_explode.show()

+---+-----------+----------+---+-------+--------------+
| id|       name|       dob|age| salary|exploded_Array|
+---+-----------+----------+---+-------+--------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|             1|
|  1|   John Doe|2024-08-01| 23| 1000.5|            23|
|  2| Jane Smith|2024-08-02| 34|2000.75|             2|
|  2| Jane Smith|2024-08-02| 34|2000.75|            34|
|  3| Jake White|2024-08-03| 18| 3000.1|             3|
|  3| Jake White|2024-08-03| 18| 3000.1|            18|
|  4| Jill Black|2024-08-04| 45|4000.25|             4|
|  4| Jill Black|2024-08-04| 45|4000.25|            45|
|  5|James Brown|2024-08-05| 29| 1500.3|             5|
|  5|James Brown|2024-08-05| 29| 1500.3|            29|
|  6| James Bond|2024-08-06| 31|2500.45|             6|
|  6| James Bond|2024-08-06| 31|2500.45|            31|
+---+-----------+----------+---+-------+--------------+



In [None]:
# 29. create_map(): build a map column. Arguments alternate key, value:
#     "name" -> name column, "age" -> age column.
df_map = df.withColumn(
    "map_column",
    create_map(
        lit("name"), col("name"),
        lit("age"), col("age"),
    ),
)
df_map.show(truncate=False)

+---+-----------+----------+---+-------+--------------------------------+
|id |name       |dob       |age|salary |map_column                      |
+---+-----------+----------+---+-------+--------------------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |{name -> John Doe, age -> 23}   |
|2  |Jane Smith |2024-08-02|34 |2000.75|{name -> Jane Smith, age -> 34} |
|3  |Jake White |2024-08-03|18 |3000.1 |{name -> Jake White, age -> 18} |
|4  |Jill Black |2024-08-04|45 |4000.25|{name -> Jill Black, age -> 45} |
|5  |James Brown|2024-08-05|29 |1500.3 |{name -> James Brown, age -> 29}|
|6  |James Bond |2024-08-06|31 |2500.45|{name -> James Bond, age -> 31} |
+---+-----------+----------+---+-------+--------------------------------+



In [None]:
# 30. coalesce(): return the first non-null value among the given columns.
#     name is never null in the sample data, so coal_val always equals name.
df_coal = df.withColumn(
    "coal_val",
    coalesce(col("name"), col("age")),
)
df_coal.show()

+---+-----------+----------+---+-------+-----------+
| id|       name|       dob|age| salary|   coal_val|
+---+-----------+----------+---+-------+-----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|   John Doe|
|  2| Jane Smith|2024-08-02| 34|2000.75| Jane Smith|
|  3| Jake White|2024-08-03| 18| 3000.1| Jake White|
|  4| Jill Black|2024-08-04| 45|4000.25| Jill Black|
|  5|James Brown|2024-08-05| 29| 1500.3|James Brown|
|  6| James Bond|2024-08-06| 31|2500.45| James Bond|
+---+-----------+----------+---+-------+-----------+



In [None]:
# 31. isnull(): flag rows whose name is null (none in the sample data).
df_isnull = df.withColumn(
    "is_name_null",
    isnull(col("name")),
)
df_isnull.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|is_name_null|
+---+-----------+----------+---+-------+------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|       false|
|  2| Jane Smith|2024-08-02| 34|2000.75|       false|
|  3| Jake White|2024-08-03| 18| 3000.1|       false|
|  4| Jill Black|2024-08-04| 45|4000.25|       false|
|  5|James Brown|2024-08-05| 29| 1500.3|       false|
|  6| James Bond|2024-08-06| 31|2500.45|       false|
+---+-----------+----------+---+-------+------------+



In [None]:
# 32. isnan(): flag rows whose age is NaN. The sample ages are all integers,
#     so every row is false.
df_isnan = df.withColumn(
    "is_age_nan",
    isnan(col("age")),
)
df_isnan.show()

+---+-----------+----------+---+-------+----------+
| id|       name|       dob|age| salary|is_age_nan|
+---+-----------+----------+---+-------+----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|     false|
|  2| Jane Smith|2024-08-02| 34|2000.75|     false|
|  3| Jake White|2024-08-03| 18| 3000.1|     false|
|  4| Jill Black|2024-08-04| 45|4000.25|     false|
|  5|James Brown|2024-08-05| 29| 1500.3|     false|
|  6| James Bond|2024-08-06| 31|2500.45|     false|
+---+-----------+----------+---+-------+----------+



In [None]:
# 33. sha2(): SHA-2 hash of name with a 256-bit digest, shown as a hex string.
df_sha2 = df.withColumn(
    "sha2_hash",
    sha2(col("name"), 256),
)
# truncate=False so the full 64-character hash is printed.
df_sha2.show(truncate=False)

+---+-----------+----------+---+-------+----------------------------------------------------------------+
|id |name       |dob       |age|salary |sha2_hash                                                       |
+---+-----------+----------+---+-------+----------------------------------------------------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |6cea57c2fb6cbc2a40411135005760f241fffc3e5e67ab99882726431037f908|
|2  |Jane Smith |2024-08-02|34 |2000.75|a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288|
|3  |Jake White |2024-08-03|18 |3000.1 |46dae60ac51d1dfbb195b62ca2a6c4fd76f0d0ef5e32ee32a3b9987bc30251ef|
|4  |Jill Black |2024-08-04|45 |4000.25|7db233fa461a23bfe24e0c1b5f0cd82e8750969c1e9c4450761cf800425caba7|
|5  |James Brown|2024-08-05|29 |1500.3 |b3d60509901a9912f651bac21d52c7ddffc9bc8620f5052fbddd39e3c59f84b0|
|6  |James Bond |2024-08-06|31 |2500.45|80ae44e3fa55f4bb5a593e7406147b59a801feab279dc2154e0d5eb9f757dd4c|
+---+-----------+----------+---+-------+----------------------------------------------------------------+

In [None]:
# 34. md5(): MD5 hash of name, shown as a 32-character hex string.
df_md5 = df.withColumn(
    "md5_hash",
    md5(col("name")),
)
df_md5.show(truncate=False)

+---+-----------+----------+---+-------+--------------------------------+
|id |name       |dob       |age|salary |md5_hash                        |
+---+-----------+----------+---+-------+--------------------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |4c2a904bafba06591225113ad17b5cec|
|2  |Jane Smith |2024-08-02|34 |2000.75|71768b5e2a0b3697eb3c0c6d4ebbbaf8|
|3  |Jake White |2024-08-03|18 |3000.1 |3ee9984296bc94702c3fa0b750b928fb|
|4  |Jill Black |2024-08-04|45 |4000.25|78b82745b366583ae84dde2d90114901|
|5  |James Brown|2024-08-05|29 |1500.3 |8495e8e406d3d625719ae2a9fb8d2f9b|
|6  |James Bond |2024-08-06|31 |2500.45|0a363424ef5ceaa17d33dfe4c545d7f3|
+---+-----------+----------+---+-------+--------------------------------+



In [None]:
# 35. monotonically_increasing_id(): generate a unique, monotonically
#     increasing 64-bit integer for each row.
#     The ids are increasing but not consecutive: the output jumps from 2 to
#     8589934592 (presumably at a partition boundary), so do not treat them as
#     a row number.
df_mon = df.withColumn(
    "inc_id",
    monotonically_increasing_id(),
)
df_mon.show()

+---+-----------+----------+---+-------+----------+
| id|       name|       dob|age| salary|    inc_id|
+---+-----------+----------+---+-------+----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|         0|
|  2| Jane Smith|2024-08-02| 34|2000.75|         1|
|  3| Jake White|2024-08-03| 18| 3000.1|         2|
|  4| Jill Black|2024-08-04| 45|4000.25|8589934592|
|  5|James Brown|2024-08-05| 29| 1500.3|8589934593|
|  6| James Bond|2024-08-06| 31|2500.45|8589934594|
+---+-----------+----------+---+-------+----------+



In [None]:
# 36. length(): number of characters in name (the space counts:
#     "John Doe" -> 8).
df_len = df.withColumn(
    "name_length",
    length(col("name")),
)
df_len.show()

+---+-----------+----------+---+-------+-----------+
| id|       name|       dob|age| salary|name_length|
+---+-----------+----------+---+-------+-----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|          8|
|  2| Jane Smith|2024-08-02| 34|2000.75|         10|
|  3| Jake White|2024-08-03| 18| 3000.1|         10|
|  4| Jill Black|2024-08-04| 45|4000.25|         10|
|  5|James Brown|2024-08-05| 29| 1500.3|         11|
|  6| James Bond|2024-08-06| 31|2500.45|         10|
+---+-----------+----------+---+-------+-----------+



In [None]:
# 37. upper() and lower(): convert all characters of a string column to upper
#     or lower case. This cell shows upper(); the next one shows both.
df_upper = df.withColumn(
    "upper_name",
    upper(col("name")),
)
df_upper.show()

+---+-----------+----------+---+-------+-----------+
| id|       name|       dob|age| salary| upper_name|
+---+-----------+----------+---+-------+-----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|   JOHN DOE|
|  2| Jane Smith|2024-08-02| 34|2000.75| JANE SMITH|
|  3| Jake White|2024-08-03| 18| 3000.1| JAKE WHITE|
|  4| Jill Black|2024-08-04| 45|4000.25| JILL BLACK|
|  5|James Brown|2024-08-05| 29| 1500.3|JAMES BROWN|
|  6| James Bond|2024-08-06| 31|2500.45| JAMES BOND|
+---+-----------+----------+---+-------+-----------+



In [None]:
# Lower- and upper-case versions of name, side by side.
df_lower = (
    df.withColumn("lower_name", lower(col("name")))
    .withColumn("upper_name", upper(col("name")))
)
df_lower.show()

+---+-----------+----------+---+-------+-----------+-----------+
| id|       name|       dob|age| salary| lower_name| upper_name|
+---+-----------+----------+---+-------+-----------+-----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|   john doe|   JOHN DOE|
|  2| Jane Smith|2024-08-02| 34|2000.75| jane smith| JANE SMITH|
|  3| Jake White|2024-08-03| 18| 3000.1| jake white| JAKE WHITE|
|  4| Jill Black|2024-08-04| 45|4000.25| jill black| JILL BLACK|
|  5|James Brown|2024-08-05| 29| 1500.3|james brown|JAMES BROWN|
|  6| James Bond|2024-08-06| 31|2500.45| james bond| JAMES BOND|
+---+-----------+----------+---+-------+-----------+-----------+



In [None]:
# 38. trim(), ltrim() and rtrim(): strip whitespace from both ends, the left
#     end, or the right end of name.
#     None of the sample names has leading or trailing spaces, so all three
#     columns equal name here.
df_all = (
    df.withColumn("name_trimmed", trim(col("name")))
    .withColumn("name_ltrim", ltrim(col("name")))
    .withColumn("name_rtrim", rtrim(col("name")))
)
df_all.show()

+---+-----------+----------+---+-------+------------+-----------+-----------+
| id|       name|       dob|age| salary|name_trimmed| name_ltrim| name_rtrim|
+---+-----------+----------+---+-------+------------+-----------+-----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|    John Doe|   John Doe|   John Doe|
|  2| Jane Smith|2024-08-02| 34|2000.75|  Jane Smith| Jane Smith| Jane Smith|
|  3| Jake White|2024-08-03| 18| 3000.1|  Jake White| Jake White| Jake White|
|  4| Jill Black|2024-08-04| 45|4000.25|  Jill Black| Jill Black| Jill Black|
|  5|James Brown|2024-08-05| 29| 1500.3| James Brown|James Brown|James Brown|
|  6| James Bond|2024-08-06| 31|2500.45|  James Bond| James Bond| James Bond|
+---+-----------+----------+---+-------+------------+-----------+-----------+



In [None]:
# 39. abs(): absolute difference between salary and 3000.
#     salary is a float, so floating-point error shows up in the output
#     (e.g. 0.09999999999990905 for 3000.1 - 3000).
df_abs = df.withColumn(
    "abs_salary",
    abs(col("salary") - 3000),
)
df_abs.show()

+---+-----------+----------+---+-------+-------------------+
| id|       name|       dob|age| salary|         abs_salary|
+---+-----------+----------+---+-------+-------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|             1999.5|
|  2| Jane Smith|2024-08-02| 34|2000.75|             999.25|
|  3| Jake White|2024-08-03| 18| 3000.1|0.09999999999990905|
|  4| Jill Black|2024-08-04| 45|4000.25|            1000.25|
|  5|James Brown|2024-08-05| 29| 1500.3|             1499.7|
|  6| James Bond|2024-08-06| 31|2500.45|  499.5500000000002|
+---+-----------+----------+---+-------+-------------------+



In [None]:
# 40. sqrt(): square root of age.
df_sqrt = df.withColumn(
    "sqrt_age",
    sqrt(col("age")),
)
df_sqrt.show()

+---+-----------+----------+---+-------+------------------+
| id|       name|       dob|age| salary|          sqrt_age|
+---+-----------+----------+---+-------+------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5| 4.795831523312719|
|  2| Jane Smith|2024-08-02| 34|2000.75| 5.830951894845301|
|  3| Jake White|2024-08-03| 18| 3000.1| 4.242640687119285|
|  4| Jill Black|2024-08-04| 45|4000.25| 6.708203932499369|
|  5|James Brown|2024-08-05| 29| 1500.3| 5.385164807134504|
|  6| James Bond|2024-08-06| 31|2500.45|5.5677643628300215|
+---+-----------+----------+---+-------+------------------+



In [None]:
# 41. exp(): e raised to the power of age.
df_exp = df.withColumn(
    "exp_age",
    exp(col("age")),
)
# Long values are shortened by show()'s default truncation (the "..." in the output).
df_exp.show()

+---+-----------+----------+---+-------+--------------------+
| id|       name|       dob|age| salary|             exp_age|
+---+-----------+----------+---+-------+--------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5| 9.744803446248903E9|
|  2| Jane Smith|2024-08-02| 34|2000.75|5.834617425274549E14|
|  3| Jake White|2024-08-03| 18| 3000.1| 6.565996913733051E7|
|  4| Jill Black|2024-08-04| 45|4000.25|3.493427105748509...|
|  5|James Brown|2024-08-05| 29| 1500.3|3.931334297144042E12|
|  6| James Bond|2024-08-06| 31|2500.45|2.904884966524742...|
+---+-----------+----------+---+-------+--------------------+



In [None]:
# 42. log(), log10() and log2(): natural, base-10 and base-2 logarithm of age.
#     Note: this rebinds df_all, which the trim example (#38) also assigned.
df_all = (
    df.withColumn("log_age", log(col("age")))
    .withColumn("log10_age", log10(col("age")))
    .withColumn("log2_age", log2(col("age")))
)
df_all.show()

+---+-----------+----------+---+-------+------------------+------------------+-----------------+
| id|       name|       dob|age| salary|           log_age|         log10_age|         log2_age|
+---+-----------+----------+---+-------+------------------+------------------+-----------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|3.1354942159291497|1.3617278360175928|4.523561956057013|
|  2| Jane Smith|2024-08-02| 34|2000.75|3.5263605246161616|1.5314789170422551| 5.08746284125034|
|  3| Jake White|2024-08-03| 18| 3000.1|2.8903717578961645| 1.255272505103306|4.169925001442312|
|  4| Jill Black|2024-08-04| 45|4000.25|3.8066624897703196|1.6532125137753437|5.491853096329675|
|  5|James Brown|2024-08-05| 29| 1500.3| 3.367295829986474| 1.462397997898956|4.857980995127573|
|  6| James Bond|2024-08-06| 31|2500.45|3.4339872044851463|1.4913616938342726|4.954196310386876|
+---+-----------+----------+---+-------+------------------+------------------+-----------------+



In [None]:
#43. Greatest and least
# These compare values across columns within the same row (id vs age);
# they are not aggregates over the whole DataFrame.
df_greatest = (
    df.withColumn("greatest value", greatest(col("id"), col("age")))
    .withColumn("least_value", least(col("id"), col("age")))
)
df_greatest.show()

+---+-----------+----------+---+-------+--------------+-----------+
| id|       name|       dob|age| salary|greatest value|least_value|
+---+-----------+----------+---+-------+--------------+-----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|            23|          1|
|  2| Jane Smith|2024-08-02| 34|2000.75|            34|          2|
|  3| Jake White|2024-08-03| 18| 3000.1|            18|          3|
|  4| Jill Black|2024-08-04| 45|4000.25|            45|          4|
|  5|James Brown|2024-08-05| 29| 1500.3|            29|          5|
|  6| James Bond|2024-08-06| 31|2500.45|            31|          6|
+---+-----------+----------+---+-------+--------------+-----------+



In [None]:
#44. Power - raise the first column (id) to the power of the second (age)
# truncate=False keeps the long double values from being cut off in the output.
df_pow = df.withColumn(
    "power result",
    pow(col("id"), col("age")),
)
df_pow.show(truncate=False)

+---+-----------+----------+---+-------+---------------------+
|id |name       |dob       |age|salary |power result         |
+---+-----------+----------+---+-------+---------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |1.0                  |
|2  |Jane Smith |2024-08-02|34 |2000.75|1.7179869184E10      |
|3  |Jake White |2024-08-03|18 |3000.1 |3.87420489E8         |
|4  |Jill Black |2024-08-04|45 |4000.25|1.2379400392853803E27|
|5  |James Brown|2024-08-05|29 |1500.3 |1.8626451492309572E20|
|6  |James Bond |2024-08-06|31 |2500.45|1.3264435183244001E24|
+---+-----------+----------+---+-------+---------------------+



In [None]:
#45. Round and Bround - round salary to 0 decimal places with two tie-breaking rules
#    round  -> HALF_UP:   a tie (x.5) moves away from zero,       e.g. 1000.5 -> 1001.0
#    bround -> HALF_EVEN: a tie (x.5) goes to the nearest even,   e.g. 1000.5 -> 1000.0
# Values that are not exact ties (2000.75, 3000.1, ...) round identically under both.
df_round = (
    df.withColumn("round_sal", round(col("salary"), 0))
    .withColumn("bround_sal", bround(col("salary"), 0))
)
df_round.show()

+---+-----------+----------+---+-------+---------+----------+
| id|       name|       dob|age| salary|round_sal|bround_sal|
+---+-----------+----------+---+-------+---------+----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|   1001.0|    1000.0|
|  2| Jane Smith|2024-08-02| 34|2000.75|   2001.0|    2001.0|
|  3| Jake White|2024-08-03| 18| 3000.1|   3000.0|    3000.0|
|  4| Jill Black|2024-08-04| 45|4000.25|   4000.0|    4000.0|
|  5|James Brown|2024-08-05| 29| 1500.3|   1500.0|    1500.0|
|  6| James Bond|2024-08-06| 31|2500.45|   2500.0|    2500.0|
+---+-----------+----------+---+-------+---------+----------+



In [None]:
#46. Degrees and radians
#    degrees(age) treats age as an angle in radians and converts it to degrees
#    radians(age) treats age as an angle in degrees and converts it to radians
df_deg_rad = (
    df.withColumn("degrees_val", degrees(col("age")))
    .withColumn("radians_value", radians(col("age")))
)
df_deg_rad.show()

+---+-----------+----------+---+-------+------------------+------------------+
| id|       name|       dob|age| salary|       degrees_val|     radians_value|
+---+-----------+----------+---+-------+------------------+------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|1317.8029288008934|0.4014257279586958|
|  2| Jane Smith|2024-08-02| 34|2000.75| 1948.056503444799|0.5934119456780721|
|  3| Jake White|2024-08-03| 18| 3000.1| 1031.324031235482|0.3141592653589793|
|  4| Jill Black|2024-08-04| 45|4000.25|2578.3100780887044|0.7853981633974483|
|  5|James Brown|2024-08-05| 29| 1500.3|1661.5776058793874|0.5061454830783556|
|  6| James Bond|2024-08-06| 31|2500.45| 1776.169164905552|0.5410520681182421|
+---+-----------+----------+---+-------+------------------+------------------+



In [None]:
#47. Signum - computes the sign of a number:
#    -1 for a negative value, 0 for zero, 1 for a positive value.
# Every age in this dataset is positive, so each row shows 1.0.
df_signum = df.withColumn(
    "signum_val",
    signum(col("age")),
)
df_signum.show()

+---+-----------+----------+---+-------+----------+
| id|       name|       dob|age| salary|signum_val|
+---+-----------+----------+---+-------+----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|       1.0|
|  2| Jane Smith|2024-08-02| 34|2000.75|       1.0|
|  3| Jake White|2024-08-03| 18| 3000.1|       1.0|
|  4| Jill Black|2024-08-04| 45|4000.25|       1.0|
|  5|James Brown|2024-08-05| 29| 1500.3|       1.0|
|  6| James Bond|2024-08-06| 31|2500.45|       1.0|
+---+-----------+----------+---+-------+----------+



In [None]:
#48. Hex and unhex
#    hex(id)          -> hexadecimal string representation of id
#    unhex(hex(id))   -> decodes that hex string back into raw bytes (shown as [01], [02], ...)
# NOTE: the second column is labelled "unhex_name" but is derived from id, not from name.
df_all = (
    df.withColumn("hex", hex(col("id")))
    .withColumn("unhex_name", unhex(hex(col("id"))))
)
df_all.show()

+---+-----------+----------+---+-------+---+----------+
| id|       name|       dob|age| salary|hex|unhex_name|
+---+-----------+----------+---+-------+---+----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|  1|      [01]|
|  2| Jane Smith|2024-08-02| 34|2000.75|  2|      [02]|
|  3| Jake White|2024-08-03| 18| 3000.1|  3|      [03]|
|  4| Jill Black|2024-08-04| 45|4000.25|  4|      [04]|
|  5|James Brown|2024-08-05| 29| 1500.3|  5|      [05]|
|  6| James Bond|2024-08-06| 31|2500.45|  6|      [06]|
+---+-----------+----------+---+-------+---+----------+



In [None]:
#49. NVL & NVL2 - null handling
#    nvl(age, id)           -> age when age is not null, otherwise id
#    nvl2(age, id, salary)  -> id when age is NOT null, otherwise salary
# No age is null in this dataset, so nvl_age equals age and nvl2_age equals id
# (displayed as a double because the alternative branch, salary, is a double).
df_all = (
    df.withColumn("nvl_age", nvl(col("age"), col("id")))
    .withColumn("nvl2_age", nvl2(col("age"), col("id"), col("salary")))
)
df_all.show()

+---+-----------+----------+---+-------+-------+--------+
| id|       name|       dob|age| salary|nvl_age|nvl2_age|
+---+-----------+----------+---+-------+-------+--------+
|  1|   John Doe|2024-08-01| 23| 1000.5|     23|     1.0|
|  2| Jane Smith|2024-08-02| 34|2000.75|     34|     2.0|
|  3| Jake White|2024-08-03| 18| 3000.1|     18|     3.0|
|  4| Jill Black|2024-08-04| 45|4000.25|     45|     4.0|
|  5|James Brown|2024-08-05| 29| 1500.3|     29|     5.0|
|  6| James Bond|2024-08-06| 31|2500.45|     31|     6.0|
+---+-----------+----------+---+-------+-------+--------+



In [None]:
#50. Reverse - reverse the characters of the name string
df_reverse = df.withColumn(
    "reverse_name",
    reverse(col("name")),
)
df_reverse.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|reverse_name|
+---+-----------+----------+---+-------+------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|    eoD nhoJ|
|  2| Jane Smith|2024-08-02| 34|2000.75|  htimS enaJ|
|  3| Jake White|2024-08-03| 18| 3000.1|  etihW ekaJ|
|  4| Jill Black|2024-08-04| 45|4000.25|  kcalB lliJ|
|  5|James Brown|2024-08-05| 29| 1500.3| nworB semaJ|
|  6| James Bond|2024-08-06| 31|2500.45|  dnoB semaJ|
+---+-----------+----------+---+-------+------------+



In [None]:
#51. Initcap - convert the first letter of each word to uppercase
# The names here are already capitalized, so the output matches the input.
df_init_cap = df.withColumn(
    "capitalized_name",
    initcap(col("name")),
)
df_init_cap.show()

+---+-----------+----------+---+-------+----------------+
| id|       name|       dob|age| salary|capitalized_name|
+---+-----------+----------+---+-------+----------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|        John Doe|
|  2| Jane Smith|2024-08-02| 34|2000.75|      Jane Smith|
|  3| Jake White|2024-08-03| 18| 3000.1|      Jake White|
|  4| Jill Black|2024-08-04| 45|4000.25|      Jill Black|
|  5|James Brown|2024-08-05| 29| 1500.3|     James Brown|
|  6| James Bond|2024-08-06| 31|2500.45|      James Bond|
+---+-----------+----------+---+-------+----------------+



In [None]:
#52. Instr - returns the 1-based position of the first occurrence of a substring
# (0 means the substring was not found, e.g. "John Doe" contains no "a").
df_instr = df.withColumn("position of a", instr(col("name"), "a"))
# show() and printSchema() print their output themselves and return None,
# so they must not be wrapped in print() (that would emit a stray "None").
df_instr.show()
df_instr.printSchema()

+---+-----------+----------+---+-------+-------------+
| id|       name|       dob|age| salary|position of a|
+---+-----------+----------+---+-------+-------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|            0|
|  2| Jane Smith|2024-08-02| 34|2000.75|            2|
|  3| Jake White|2024-08-03| 18| 3000.1|            2|
|  4| Jill Black|2024-08-04| 45|4000.25|            8|
|  5|James Brown|2024-08-05| 29| 1500.3|            2|
|  6| James Bond|2024-08-06| 31|2500.45|            2|
+---+-----------+----------+---+-------+-------------+

None
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)
 |-- position of a: integer (nullable = true)

None


In [None]:
#53. Locate - like instr, but the substring comes FIRST and the column second.
# It also accepts an optional start position as a third argument (not used here),
# so the search starts from the beginning of the string. 0 means "not found".
df_locate = df.withColumn(
    "locate_doe",
    locate("Doe", col("name")),
)
df_locate.show()

+---+-----------+----------+---+-------+----------+
| id|       name|       dob|age| salary|locate_doe|
+---+-----------+----------+---+-------+----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|         6|
|  2| Jane Smith|2024-08-02| 34|2000.75|         0|
|  3| Jake White|2024-08-03| 18| 3000.1|         0|
|  4| Jill Black|2024-08-04| 45|4000.25|         0|
|  5|James Brown|2024-08-05| 29| 1500.3|         0|
|  6| James Bond|2024-08-06| 31|2500.45|         0|
+---+-----------+----------+---+-------+----------+



In [None]:
#54. Soundex - encode a string as its Soundex code, useful for phonetic matching
# (e.g. "James Brown" and "James Bond" both map to J521 below).
df_soundex = df.withColumn(
    "soundex_name",
    soundex(col("name")),
)
df_soundex.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|soundex_name|
+---+-----------+----------+---+-------+------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|        J530|
|  2| Jane Smith|2024-08-02| 34|2000.75|        J525|
|  3| Jake White|2024-08-03| 18| 3000.1|        J230|
|  4| Jill Black|2024-08-04| 45|4000.25|        J414|
|  5|James Brown|2024-08-05| 29| 1500.3|        J521|
|  6| James Bond|2024-08-06| 31|2500.45|        J521|
+---+-----------+----------+---+-------+------------+



In [None]:
#55. Levenshtein - edit distance between two strings
# Each name is compared against the literal "Jon Doe"; "John Doe" is one edit away.
df_levenshtein = df.withColumn(
    "levenshtein_distance",
    levenshtein(col("name"), lit("Jon Doe")),
)
df_levenshtein.show()

+---+-----------+----------+---+-------+--------------------+
| id|       name|       dob|age| salary|levenshtein_distance|
+---+-----------+----------+---+-------+--------------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|                   1|
|  2| Jane Smith|2024-08-02| 34|2000.75|                   7|
|  3| Jake White|2024-08-03| 18| 3000.1|                   7|
|  4| Jill Black|2024-08-04| 45|4000.25|                   8|
|  5|James Brown|2024-08-05| 29| 1500.3|                   8|
|  6| James Bond|2024-08-06| 31|2500.45|                   7|
+---+-----------+----------+---+-------+--------------------+



In [None]:
#56. Conv - convert a number from one base to another
# Here id is converted from base 10 (decimal) to base 2 (binary).
df_conv = df.withColumn(
    "binary_value",
    conv(col("id"), 10, 2),
)
df_conv.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|binary_value|
+---+-----------+----------+---+-------+------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|           1|
|  2| Jane Smith|2024-08-02| 34|2000.75|          10|
|  3| Jake White|2024-08-03| 18| 3000.1|          11|
|  4| Jill Black|2024-08-04| 45|4000.25|         100|
|  5|James Brown|2024-08-05| 29| 1500.3|         101|
|  6| James Bond|2024-08-06| 31|2500.45|         110|
+---+-----------+----------+---+-------+------------+



In [None]:
#57. Translate - replace individual characters in a string with other characters
# Here every "o" in name becomes "a" (e.g. "John Doe" -> "Jahn Dae").
df_translate = df.withColumn(
    "translated_name",
    translate(col("name"), "o", "a"),
)
df_translate.show()

+---+-----------+----------+---+-------+---------------+
| id|       name|       dob|age| salary|translated_name|
+---+-----------+----------+---+-------+---------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|       Jahn Dae|
|  2| Jane Smith|2024-08-02| 34|2000.75|     Jane Smith|
|  3| Jake White|2024-08-03| 18| 3000.1|     Jake White|
|  4| Jill Black|2024-08-04| 45|4000.25|     Jill Black|
|  5|James Brown|2024-08-05| 29| 1500.3|    James Brawn|
|  6| James Bond|2024-08-06| 31|2500.45|     James Band|
+---+-----------+----------+---+-------+---------------+



In [None]:
#58. CRC32 - compute the cyclic redundancy check (CRC32) value of the name column
df_crc32 = df.withColumn(
    "crc32_name",
    crc32(col("name")),
)
df_crc32.show()

+---+-----------+----------+---+-------+----------+
| id|       name|       dob|age| salary|crc32_name|
+---+-----------+----------+---+-------+----------+
|  1|   John Doe|2024-08-01| 23| 1000.5|1782059462|
|  2| Jane Smith|2024-08-02| 34|2000.75|3280634359|
|  3| Jake White|2024-08-03| 18| 3000.1| 931505628|
|  4| Jill Black|2024-08-04| 45|4000.25|3628743810|
|  5|James Brown|2024-08-05| 29| 1500.3|3837056040|
|  6| James Bond|2024-08-06| 31|2500.45|1009390456|
+---+-----------+----------+---+-------+----------+



In [None]:
#59. UUID - generate a column of UUIDs
# uuid() is invoked through a SQL expression; the values are random,
# so they will differ every time this cell is run.
df_uuid = df.withColumn(
    "uuid",
    expr("uuid()"),
)
df_uuid.show(truncate=False)

+---+-----------+----------+---+-------+------------------------------------+
|id |name       |dob       |age|salary |uuid                                |
+---+-----------+----------+---+-------+------------------------------------+
|1  |John Doe   |2024-08-01|23 |1000.5 |3fce8c42-e9f5-4a63-9b19-a0c3573a2270|
|2  |Jane Smith |2024-08-02|34 |2000.75|5b419152-0751-47a3-b5e6-56eebd4c4555|
|3  |Jake White |2024-08-03|18 |3000.1 |ac2dbd52-5241-4c94-a9d4-c789a02e31c7|
|4  |Jill Black |2024-08-04|45 |4000.25|7b905475-4306-4b19-9385-2b22de311314|
|5  |James Brown|2024-08-05|29 |1500.3 |b8f9424d-2bef-4e4e-8e7a-9ee283d8bcd2|
|6  |James Bond |2024-08-06|31 |2500.45|daf0db39-ee57-44de-80d4-a16acb02a99a|
+---+-----------+----------+---+-------+------------------------------------+



In [None]:
#60. Percent rank - relative rank of a row within its window partition,
# from 0.0 (first row) to 1.0 (last row).
# windowSpec orders all rows by salary, highest first; it has no partitionBy,
# so the whole DataFrame is treated as a single partition.
# The same windowSpec is reused by the cume_dist and ntile cells below.
windowSpec = Window.orderBy(col("salary").desc())
df_percent_rank = df.withColumn(
    "percent_rank",
    percent_rank().over(windowSpec),
)
df_percent_rank.show()

+---+-----------+----------+---+-------+------------+
| id|       name|       dob|age| salary|percent_rank|
+---+-----------+----------+---+-------+------------+
|  4| Jill Black|2024-08-04| 45|4000.25|         0.0|
|  3| Jake White|2024-08-03| 18| 3000.1|         0.2|
|  6| James Bond|2024-08-06| 31|2500.45|         0.4|
|  2| Jane Smith|2024-08-02| 34|2000.75|         0.6|
|  5|James Brown|2024-08-05| 29| 1500.3|         0.8|
|  1|   John Doe|2024-08-01| 23| 1000.5|         1.0|
+---+-----------+----------+---+-------+------------+



In [None]:
#61. Cumulative distribution of a value within a group of values:
# the fraction of rows that come at or before the current row in window order.
# Uses windowSpec (salary descending) defined in the percent rank cell (#60).
df_cume = df.withColumn(
    "cume_dist",
    cume_dist().over(windowSpec),
)
df_cume.show()

+---+-----------+----------+---+-------+-------------------+
| id|       name|       dob|age| salary|          cume_dist|
+---+-----------+----------+---+-------+-------------------+
|  4| Jill Black|2024-08-04| 45|4000.25|0.16666666666666666|
|  3| Jake White|2024-08-03| 18| 3000.1| 0.3333333333333333|
|  6| James Bond|2024-08-06| 31|2500.45|                0.5|
|  2| Jane Smith|2024-08-02| 34|2000.75| 0.6666666666666666|
|  5|James Brown|2024-08-05| 29| 1500.3| 0.8333333333333334|
|  1|   John Doe|2024-08-01| 23| 1000.5|                1.0|
+---+-----------+----------+---+-------+-------------------+



In [None]:
#62. NTile - distribute the rows of an ordered partition into a specified number of buckets
# Here the 6 rows (ordered by windowSpec from cell #60) are split into 3 buckets of 2 rows each.
(
    df.withColumn("ntile", ntile(3).over(windowSpec))
    .show()
)

+---+-----------+----------+---+-------+-----+
| id|       name|       dob|age| salary|ntile|
+---+-----------+----------+---+-------+-----+
|  4| Jill Black|2024-08-04| 45|4000.25|    1|
|  3| Jake White|2024-08-03| 18| 3000.1|    1|
|  6| James Bond|2024-08-06| 31|2500.45|    2|
|  2| Jane Smith|2024-08-02| 34|2000.75|    2|
|  5|James Brown|2024-08-05| 29| 1500.3|    3|
|  1|   John Doe|2024-08-01| 23| 1000.5|    3|
+---+-----------+----------+---+-------+-----+



In [None]:
#63. Flatten - collapse an array of arrays into a single array
# The nested literal [[1, 2], [3]] becomes [1, 2, 3] on every row.
(
    df.withColumn(
        "flattened_array",
        flatten(
            array(
                array(lit(1), lit(2)),
                array(lit(3)),
            )
        ),
    )
    .show()
)

+---+-----------+----------+---+-------+---------------+
| id|       name|       dob|age| salary|flattened_array|
+---+-----------+----------+---+-------+---------------+
|  1|   John Doe|2024-08-01| 23| 1000.5|      [1, 2, 3]|
|  2| Jane Smith|2024-08-02| 34|2000.75|      [1, 2, 3]|
|  3| Jake White|2024-08-03| 18| 3000.1|      [1, 2, 3]|
|  4| Jill Black|2024-08-04| 45|4000.25|      [1, 2, 3]|
|  5|James Brown|2024-08-05| 29| 1500.3|      [1, 2, 3]|
|  6| James Bond|2024-08-06| 31|2500.45|      [1, 2, 3]|
+---+-----------+----------+---+-------+---------------+



In [None]:
#64. Grouping Id - identifies which grouping columns were aggregated away in each cube row
# For cube("age", "salary") the values seen in the output are:
#    0 -> grouped by both age and salary
#    1 -> salary aggregated away (salary shows NULL)
#    2 -> age aggregated away (age shows NULL)
#    3 -> grand total (both show NULL)
# F.sum is used explicitly so the Spark aggregate is unambiguous.
from pyspark.sql import functions as F
df_grouping_sets = (
    df.cube("age", "salary")
    .agg(
        grouping_id().alias("grouping_id"),
        F.sum("salary"),
    )
)
df_grouping_sets.show()

+----+-------+-----------+-----------+
| age| salary|grouping_id|sum(salary)|
+----+-------+-----------+-----------+
|NULL| 1000.5|          2|     1000.5|
|  34|   NULL|          1|    2000.75|
|NULL| 3000.1|          2|     3000.1|
|NULL|   NULL|          3|   14002.35|
|  23|   NULL|          1|     1000.5|
|  34|2000.75|          0|    2000.75|
|NULL|2000.75|          2|    2000.75|
|  23| 1000.5|          0|     1000.5|
|  18| 3000.1|          0|     3000.1|
|  18|   NULL|          1|     3000.1|
|  29| 1500.3|          0|     1500.3|
|NULL| 1500.3|          2|     1500.3|
|  45|   NULL|          1|    4000.25|
|NULL|4000.25|          2|    4000.25|
|  29|   NULL|          1|     1500.3|
|  45|4000.25|          0|    4000.25|
|NULL|2500.45|          2|    2500.45|
|  31|   NULL|          1|    2500.45|
|  31|2500.45|          0|    2500.45|
+----+-------+-----------+-----------+



In [None]:
#65. Rollup - hierarchical multi-dimensional aggregation; produces a subset of what cube does.
# For rollup("age", "salary") the levels are (age, salary), (age) and the grand total;
# unlike cube, there is no level grouped by salary alone.
(
    df.rollup("age", "salary")
    .count()
    .show()
)

+----+-------+-----+
| age| salary|count|
+----+-------+-----+
|  34|   NULL|    1|
|NULL|   NULL|    6|
|  23|   NULL|    1|
|  34|2000.75|    1|
|  23| 1000.5|    1|
|  18| 3000.1|    1|
|  18|   NULL|    1|
|  29| 1500.3|    1|
|  45|   NULL|    1|
|  29|   NULL|    1|
|  45|4000.25|    1|
|  31|   NULL|    1|
|  31|2500.45|    1|
+----+-------+-----+



In [None]:
#66. Corr - Pearson correlation coefficient between two columns (age and salary)
# The result is a single-row, single-column DataFrame.
df_corr = df.select(
    corr(col("age"), col("salary")),
)
df_corr.show()

+------------------+
| corr(age, salary)|
+------------------+
|0.5156484761493896|
+------------------+



In [None]:
#67. Collect_list - returns all values from an aggregated group as a list
# Build the aggregated DataFrame first, then display it. show() returns None,
# so chaining it onto the assignment would leave df_all bound to None
# instead of the grouped result.
df_all = df.groupBy("age").agg(collect_list("name").alias("names"))
df_all.show()

+---+-------------+
|age|        names|
+---+-------------+
| 34| [Jane Smith]|
| 18| [Jake White]|
| 23|   [John Doe]|
| 29|[James Brown]|
| 31| [James Bond]|
| 45| [Jill Black]|
+---+-------------+

