In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._


In [19]:
// Sample address records; each tuple is (ID, Address, State), matching the
// column names given when the DataFrame is built.
val address: Seq[(Int, String, String)] = List(
  (1, "14851 Washington RD", "DE"),
  (2, "21821 Margarita ST", "NY"),
  (3, "31311 Siemon Ave", "CA")
)

address: Seq[(Int, String, String)] = List((1,14851 Washington RD,DE), (2,21821 Margarita ST,NY), (3,31311 Siemon Ave,CA))


In [20]:
// Convert the address tuples into a DataFrame, assigning column names by position,
// then print it.
val df_address = address.toDF(Seq("ID", "Address", "State"): _*)
df_address.show()

+---+-------------------+-----+
| ID|            Address|State|
+---+-------------------+-----+
|  1|14851 Washington RD|   DE|
|  2| 21821 Margarita ST|   NY|
|  3|   31311 Siemon Ave|   CA|
+---+-------------------+-----+



df_address: org.apache.spark.sql.DataFrame = [ID: int, Address: string ... 1 more field]


# String Formatting

In [21]:
// Expand the street-type abbreviation "RD" to "Road" in the Address column.
// The pattern is wrapped in \b word boundaries so only a standalone "RD" token is
// rewritten; a bare "RD" pattern would also rewrite those letters inside a longer
// upper-case word (e.g. "BIRD" would become "BIRoad").
// Note: show() truncates long cell values by default, which is why the expanded row
// prints as "14851 Washington ..."; call show(false) to print the full value.
df_address.withColumn("Address", regexp_replace($"Address", "\\bRD\\b", "Road")).show()

+---+--------------------+-----+
| ID|             Address|State|
+---+--------------------+-----+
|  1|14851 Washington ...|   DE|
|  2|  21821 Margarita ST|   NY|
|  3|    31311 Siemon Ave|   CA|
+---+--------------------+-----+



In [22]:
// Replace any of the street-type tokens "RD", "ST" or "Ave" with "Road", using a
// regex alternation. The alternatives are grouped and wrapped in \b word boundaries
// so only standalone tokens are rewritten, not the same letters inside a longer
// word (e.g. "STONE" or "Avenue" are left untouched).
// Note: show() truncates long cell values by default, which is why the first row
// prints as "14851 Washington ..."; call show(false) to print the full value.
df_address.withColumn("Address", regexp_replace($"Address", "\\b(?:RD|ST|Ave)\\b", "Road")).show()

+---+--------------------+-----+
| ID|             Address|State|
+---+--------------------+-----+
|  1|14851 Washington ...|   DE|
|  2|21821 Margarita Road|   NY|
|  3|   31311 Siemon Road|   CA|
+---+--------------------+-----+



In [23]:
// NOTE(review): the recorded output for this cell is "error: invalid escape character",
// which the empty pattern shown here would not produce. Presumably the original
// pattern contained a backslash escape that was lost when the notebook was exported;
// confirm against the original notebook.
// If a regex escape such as \d or \s was intended, write it as "\\d" or inside a
// triple-quoted string ("""\d""") so the Scala compiler accepts the literal.
df_address.withColumn("Address",regexp_replace($"Address","","Road")).show()

<console>: 2: error: invalid escape character

# Change Value by Condition

In [26]:
// Expand the street-type suffix of each address based on how the address ends:
//   "...RD"  -> "...Road"
//   "...ST"  -> "...Street"
//   "...Ave" -> "...Avenue"
// Addresses with any other ending are replaced by the literal "Unknown".
// Each pattern is anchored with `$` so that only the suffix tested by endsWith is
// rewritten; an unanchored pattern would also replace earlier occurrences of the
// same letters elsewhere in the address.
// Note: show() truncates long cell values by default; call show(false) to see the
// full expanded addresses.
df_address.withColumn("Address",
  when($"Address".endsWith("RD"), regexp_replace($"Address", "RD$", "Road"))
    .when($"Address".endsWith("ST"), regexp_replace($"Address", "ST$", "Street"))
    .when($"Address".endsWith("Ave"), regexp_replace($"Address", "Ave$", "Avenue"))
    .otherwise("Unknown")
).show()

+---+--------------------+-----+
| ID|             Address|State|
+---+--------------------+-----+
|  1|14851 Washington ...|   DE|
|  2|21821 Margarita S...|   NY|
|  3| 31311 Siemon Avenue|   CA|
+---+--------------------+-----+



In [31]:
// Sample person records; tuple fields line up positionally with the names in `cols`.
val data: List[(String, String, String, String, String, Int)] = List(
  ("Reyner", "", "Wongso", "0312", "M", 12000),
  ("Robert", "", "Williams", "0412", "", 13000),
  ("Sekar", "Alisha", "Firdaus", "0505", "F", 14000),
  ("Richard", "Mary", "B", "0404", "X", 10000)
)

// Column names, in the same order as the tuple fields of `data`.
val cols: Seq[String] = List("firstName", "middleName", "lastName", "DOB", "gender", "salary")

data: List[(String, String, String, String, String, Int)] = List((Reyner,"",Wongso,0312,M,12000), (Robert,"",Williams,0412,"",13000), (Sekar,Alisha,Firdaus,0505,F,14000), (Richard,Mary,B,0404,X,10000))
cols: Seq[String] = List(firstName, middleName, lastName, DOB, gender, salary)


In [32]:
// Build a DataFrame from the person tuples and apply the names in `cols` to its
// columns, then print it.
val df = {
  val unnamed = spark.createDataFrame(data) // tuple fields arrive as _1, _2, ...
  unnamed.toDF(cols: _*)
}
df.show()

+---------+----------+--------+----+------+------+
|firstName|middleName|lastName| DOB|gender|salary|
+---------+----------+--------+----+------+------+
|   Reyner|          |  Wongso|0312|     M| 12000|
|   Robert|          |Williams|0412|      | 13000|
|    Sekar|    Alisha| Firdaus|0505|     F| 14000|
|  Richard|      Mary|       B|0404|     X| 10000|
+---------+----------+--------+----+------+------+



df: org.apache.spark.sql.DataFrame = [firstName: string, middleName: string ... 4 more fields]


In [33]:
// Add a `newGender` column: "M" is mapped to "Male"; every other value falls
// through to "Unknown".
val df1 = df.withColumn(
  "newGender",
  when(col("gender") === "M", lit("Male")).otherwise(lit("Unknown"))
)

df1.show()

+---------+----------+--------+----+------+------+---------+
|firstName|middleName|lastName| DOB|gender|salary|newGender|
+---------+----------+--------+----+------+------+---------+
|   Reyner|          |  Wongso|0312|     M| 12000|     Male|
|   Robert|          |Williams|0412|      | 13000|  Unknown|
|    Sekar|    Alisha| Firdaus|0505|     F| 14000|  Unknown|
|  Richard|      Mary|       B|0404|     X| 10000|  Unknown|
+---------+----------+--------+----+------+------+---------+



df1: org.apache.spark.sql.DataFrame = [firstName: string, middleName: string ... 5 more fields]


In [34]:
// Add a `newGender` column with two explicit mappings ("M" -> "Male",
// "F" -> "Female"); anything else falls through to "Unknown".
val df2 = {
  val genderLabel = (
    when(col("gender") === "M", "Male")
      .when(col("gender") === "F", "Female")
      .otherwise("Unknown")
  )
  df.withColumn("newGender", genderLabel)
}

df2.show()

+---------+----------+--------+----+------+------+---------+
|firstName|middleName|lastName| DOB|gender|salary|newGender|
+---------+----------+--------+----+------+------+---------+
|   Reyner|          |  Wongso|0312|     M| 12000|     Male|
|   Robert|          |Williams|0412|      | 13000|  Unknown|
|    Sekar|    Alisha| Firdaus|0505|     F| 14000|   Female|
|  Richard|      Mary|       B|0404|     X| 10000|  Unknown|
+---------+----------+--------+----+------+------+---------+



df2: org.apache.spark.sql.DataFrame = [firstName: string, middleName: string ... 5 more fields]


# Trim

In [36]:
// Sample strings padded with trailing, surrounding and leading spaces, used by the
// trim examples that follow.
val data: Seq[(Int, String)] = List(
  (1, "ABC  "),
  (2, "  ABC  "),
  (3, "   ABC")
)
val df = data.toDF(Seq("col1", "col2"): _*)
df.show()

+----+-------+
|col1|   col2|
+----+-------+
|   1|  ABC  |
|   2|  ABC  |
|   3|    ABC|
+----+-------+



data: Seq[(Int, String)] = List((1,"ABC  "), (2,"  ABC  "), (3,"   ABC"))
df: org.apache.spark.sql.DataFrame = [col1: int, col2: string]


In [37]:
// trim: strip spaces from both ends of col2, so every row becomes "ABC".
df.withColumn(
  "col2",
  trim(df("col2"))
).show()

+----+----+
|col1|col2|
+----+----+
|   1| ABC|
|   2| ABC|
|   3| ABC|
+----+----+



In [38]:
// rtrim: strip spaces from the right-hand end of col2 only; leading spaces are kept.
// The printed table right-aligns values, so the remaining leading spaces are hard
// to tell apart from padding in the output.
df.withColumn(
  "col2",
  rtrim(df("col2"))
).show()

+----+------+
|col1|  col2|
+----+------+
|   1|   ABC|
|   2|   ABC|
|   3|   ABC|
+----+------+



In [40]:
// ltrim: strip spaces from the left-hand end of col2 only; trailing spaces are kept
// (visible as the gap before the closing "|" in the printed table).
df.withColumn(
  "col2",
  ltrim(df("col2"))
).show()

+----+-----+
|col1| col2|
+----+-----+
|   1|ABC  |
|   2|ABC  |
|   3|  ABC|
+----+-----+

