In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._


In [19]:
// Sample rows as (ID, Address, State) tuples; converted to a DataFrame in the next cell.
val address: Seq[(Int, String, String)] = Seq(
  (1, "14851 Washington RD", "DE"),
  (2, "21821 Margarita ST", "NY"),
  (3, "31311 Siemon Ave", "CA")
)

address: Seq[(Int, String, String)] = List((1,14851 Washington RD,DE), (2,21821 Margarita ST,NY), (3,31311 Siemon Ave,CA))


In [20]:
// Turn the tuples into a DataFrame with named columns, then print it.
val df_address = address.toDF(Seq("ID", "Address", "State"): _*)
df_address.show()

+---+-------------------+-----+
| ID|            Address|State|
+---+-------------------+-----+
|  1|14851 Washington RD|   DE|
|  2| 21821 Margarita ST|   NY|
|  3|   31311 Siemon Ave|   CA|
+---+-------------------+-----+



df_address: org.apache.spark.sql.DataFrame = [ID: int, Address: string ... 1 more field]


# String Replace

In [21]:
// Overwrite Address with a copy in which each match of the regex "RD" becomes "Road".
df_address.withColumn("Address", regexp_replace(col("Address"), "RD", "Road")).show()

+---+--------------------+-----+
| ID|             Address|State|
+---+--------------------+-----+
|  1|14851 Washington ...|   DE|
|  2|  21821 Margarita ST|   NY|
|  3|    31311 Siemon Ave|   CA|
+---+--------------------+-----+



In [22]:
// The pattern is a regex alternation: any of "RD", "ST" or "Ave" is replaced by "Road".
df_address.withColumn("Address", regexp_replace(col("Address"), "RD|ST|Ave", "Road")).show()

+---+--------------------+-----+
| ID|             Address|State|
+---+--------------------+-----+
|  1|14851 Washington ...|   DE|
|  2|21821 Margarita Road|   NY|
|  3|   31311 Siemon Road|   CA|
+---+--------------------+-----+



In [23]:
// NOTE(review): the output recorded for this cell is the compile error "invalid escape character",
// which the empty pattern literal shown here would not produce. The original pattern text may have
// been lost when the notebook was exported; confirm the intended regex before re-running.
df_address.withColumn("Address",regexp_replace($"Address","","Road")).show()

<console>: 2: error: invalid escape character

# When Otherwise

In [26]:
// Expand the street-type suffix of each address (RD -> Road, ST -> Street, Ave -> Avenue).
// Rows whose address ends with none of the known suffixes become the literal "Unknown".
// Each pattern is anchored with `$` so that only the trailing suffix matched by endsWith is
// rewritten; an unanchored pattern would also rewrite earlier occurrences inside the address
// (for example "12 ST Marys ST" would become "12 Street Marys Street").
df_address.withColumn(
  "Address",
  when($"Address".endsWith("RD"), regexp_replace($"Address", "RD$", "Road"))
    .when($"Address".endsWith("ST"), regexp_replace($"Address", "ST$", "Street"))
    .when($"Address".endsWith("Ave"), regexp_replace($"Address", "Ave$", "Avenue"))
    .otherwise("Unknown")
).show()

+---+--------------------+-----+
| ID|             Address|State|
+---+--------------------+-----+
|  1|14851 Washington ...|   DE|
|  2|21821 Margarita S...|   NY|
|  3| 31311 Siemon Avenue|   CA|
+---+--------------------+-----+



In [31]:
// Sample rows; the tuple fields line up positionally with the names in `cols` below.
val data: List[(String, String, String, String, String, Int)] = List(
  ("Reyner", "", "Wongso", "0312", "M", 12000),
  ("Robert", "", "Williams", "0412", "", 13000),
  ("Sekar", "Alisha", "Firdaus", "0505", "F", 14000),
  ("Richard", "Mary", "B", "0404", "X", 10000)
)

// Column names, in tuple-field order.
val cols: Seq[String] =
  Seq("firstName", "middleName", "lastName", "DOB", "gender", "salary")

data: List[(String, String, String, String, String, Int)] = List((Reyner,"",Wongso,0312,M,12000), (Robert,"",Williams,0412,"",13000), (Sekar,Alisha,Firdaus,0505,F,14000), (Richard,Mary,B,0404,X,10000))
cols: Seq[String] = List(firstName, middleName, lastName, DOB, gender, salary)


In [32]:
// Create a DataFrame from the tuples, then rename its columns using `cols`, and print it.
val df = {
  val unnamed = spark.createDataFrame(data)
  unnamed.toDF(cols: _*)
}
df.show()

+---------+----------+--------+----+------+------+
|firstName|middleName|lastName| DOB|gender|salary|
+---------+----------+--------+----+------+------+
|   Reyner|          |  Wongso|0312|     M| 12000|
|   Robert|          |Williams|0412|      | 13000|
|    Sekar|    Alisha| Firdaus|0505|     F| 14000|
|  Richard|      Mary|       B|0404|     X| 10000|
+---------+----------+--------+----+------+------+



df: org.apache.spark.sql.DataFrame = [firstName: string, middleName: string ... 4 more fields]


In [33]:
// Add newGender: "Male" when gender is "M", and "Unknown" for every other value.
val df1 = df.withColumn(
  "newGender",
  when($"gender" === "M", "Male").otherwise("Unknown")
)

df1.show()

+---------+----------+--------+----+------+------+---------+
|firstName|middleName|lastName| DOB|gender|salary|newGender|
+---------+----------+--------+----+------+------+---------+
|   Reyner|          |  Wongso|0312|     M| 12000|     Male|
|   Robert|          |Williams|0412|      | 13000|  Unknown|
|    Sekar|    Alisha| Firdaus|0505|     F| 14000|  Unknown|
|  Richard|      Mary|       B|0404|     X| 10000|  Unknown|
+---------+----------+--------+----+------+------+---------+



df1: org.apache.spark.sql.DataFrame = [firstName: string, middleName: string ... 5 more fields]


In [34]:
// Add newGender with two explicit branches ("M" -> "Male", "F" -> "Female");
// anything else, including the empty string, falls through to "Unknown".
val df2 = df.withColumn(
  "newGender",
  when($"gender" === "M", "Male")
    .when($"gender" === "F", "Female")
    .otherwise("Unknown")
)

df2.show()

+---------+----------+--------+----+------+------+---------+
|firstName|middleName|lastName| DOB|gender|salary|newGender|
+---------+----------+--------+----+------+------+---------+
|   Reyner|          |  Wongso|0312|     M| 12000|     Male|
|   Robert|          |Williams|0412|      | 13000|  Unknown|
|    Sekar|    Alisha| Firdaus|0505|     F| 14000|   Female|
|  Richard|      Mary|       B|0404|     X| 10000|  Unknown|
+---------+----------+--------+----+------+------+---------+



df2: org.apache.spark.sql.DataFrame = [firstName: string, middleName: string ... 5 more fields]


# Trim

In [36]:
// Strings padded on the right, on both sides, and on the left, for the trim examples below.
val data: Seq[(Int, String)] = Seq(
  (1, "ABC  "),
  (2, "  ABC  "),
  (3, "   ABC")
)
val df = data.toDF(Seq("col1", "col2"): _*)
df.show()

+----+-------+
|col1|   col2|
+----+-------+
|   1|  ABC  |
|   2|  ABC  |
|   3|    ABC|
+----+-------+



data: Seq[(Int, String)] = List((1,"ABC  "), (2,"  ABC  "), (3,"   ABC"))
df: org.apache.spark.sql.DataFrame = [col1: int, col2: string]


In [37]:
// trim: per the recorded output, padding is removed on both sides of col2.
df.withColumn("col2", trim($"col2")).show()

+----+----+
|col1|col2|
+----+----+
|   1| ABC|
|   2| ABC|
|   3| ABC|
+----+----+



In [38]:
// rtrim: only trailing (right-hand) padding is removed; leading spaces are kept.
df.withColumn("col2", rtrim($"col2")).show()

+----+------+
|col1|  col2|
+----+------+
|   1|   ABC|
|   2|   ABC|
|   3|   ABC|
+----+------+



In [40]:
// ltrim: only leading (left-hand) padding is removed; trailing spaces are kept.
df.withColumn("col2", ltrim($"col2")).show()

+----+-----+
|col1| col2|
+----+-----+
|   1|ABC  |
|   2|ABC  |
|   3|  ABC|
+----+-----+



# Split

In [41]:
// Sample rows whose first field holds three name parts separated by ", ".
val data: Seq[(String, String, String, Int)] = Seq(
  ("Sharon, Zefanya, Setiawan", "2003", "F", 10000),
  ("Lintang, Diah, Ayuningtyas", "2003", "F", 50000),
  ("Matthew, Aaron, Sugiyarto", "2003", "M", 20000)
)

data: Seq[(String, String, String, Int)] = List((Sharon, Zefanya, Setiawan,2003,F,10000), (Lintang, Diah, Ayuningtyas,2003,F,50000), (Matthew, Aaron, Sugiyarto,2003,M,20000))


In [42]:
// Name the four tuple fields; note that "DOB Year" contains a space.
val df = data.toDF(Seq("Name", "DOB Year", "Gender", "Salary"): _*)

df: org.apache.spark.sql.DataFrame = [Name: string, DOB Year: string ... 2 more fields]


In [44]:
// Break the comma-separated Name column into firstName / middleName / lastName.
// The sample values use ", " (comma + space) as the separator, so the split pattern is the
// regex `,\s*`; splitting on a bare "," would leave a leading space in the middle and last names.
val splitDf = {
  // Build the split expression once and reuse it for all three derived columns.
  val nameParts = split(col("Name"), ",\\s*")
  df.withColumn("firstName", nameParts.getItem(0)) // getItem(i) picks the i-th element (0-based)
    .withColumn("middleName", nameParts.getItem(1))
    .withColumn("lastName", nameParts.getItem(2))
}

splitDf.show()

+--------------------+--------+------+------+---------+----------+------------+
|                Name|DOB Year|Gender|Salary|firstName|middleName|    lastName|
+--------------------+--------+------+------+---------+----------+------------+
|Sharon, Zefanya, ...|    2003|     F| 10000|   Sharon|   Zefanya|    Setiawan|
|Lintang, Diah, Ay...|    2003|     F| 50000|  Lintang|      Diah| Ayuningtyas|
|Matthew, Aaron, S...|    2003|     M| 20000|  Matthew|     Aaron|   Sugiyarto|
+--------------------+--------+------+------+---------+----------+------------+



splitDf: org.apache.spark.sql.DataFrame = [Name: string, DOB Year: string ... 5 more fields]


# Round

In [55]:
// Sample doubles for the rounding examples; 13 is written as 13.0 so every literal is a Double.
val number: Seq[Double] = Seq(1.12, 13.0, 9.87, 2.34, 2.5, 3.5)
// Single-column DataFrame named "value".
val numberDF = number.toDF(Seq("value"): _*)

number: Seq[Double] = List(1.12, 13.0, 9.87, 2.34, 2.5, 3.5)
numberDF: org.apache.spark.sql.DataFrame = [value: double]


In [57]:
// Round each value to zero decimal places; the recorded output shows 2.5 -> 3.0 and 3.5 -> 4.0.
numberDF.withColumn("Round", round($"value")).show()

+-----+-----+
|value|Round|
+-----+-----+
| 1.12|  1.0|
| 13.0| 13.0|
| 9.87| 10.0|
| 2.34|  2.0|
|  2.5|  3.0|
|  3.5|  4.0|
+-----+-----+



In [62]:
// Plain-Scala counterpart of the cell above: Double.round on each element yields a Seq[Long].
number.map(_.round)

res38: Seq[Long] = List(1, 13, 10, 2, 3, 4)


In [63]:
// Same as the default round, but keeping one decimal place (second argument = 1).
numberDF.withColumn("Round", round($"value", 1)).show()

+-----+-----+
|value|Round|
+-----+-----+
| 1.12|  1.1|
| 13.0| 13.0|
| 9.87|  9.9|
| 2.34|  2.3|
|  2.5|  2.5|
|  3.5|  3.5|
+-----+-----+



# to_date Vs date_format
- to_date = mengubah date yang belum standar untuk mengikuti standar (yyyy-MM-dd)
- date_format = bisa mengubah format, jika formatnya sudah mengikuti standar (yyyy-MM-dd)


https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html

In [67]:
// Date is an Int written as yyyyMMdd digits; Time is a string in "dd-MMM-yyyy HH:mm:ss.SSS" form.
val datetimes: Seq[(Int, String)] = Seq(
  (20201111, "11-Nov-2020 11:53:01.123"),
  (20211212, "12-Dec-2021 10:52:05.131"),
  (20220202, "02-Feb-2022 00:00:00.000")
)

val df = datetimes.toDF(Seq("Date", "Time"): _*)
// truncate = false prints the full cell contents instead of shortening long strings.
df.show(truncate = false)

+--------+------------------------+
|Date    |Time                    |
+--------+------------------------+
|20201111|11-Nov-2020 11:53:01.123|
|20211212|12-Dec-2021 10:52:05.131|
|20220202|02-Feb-2022 00:00:00.000|
+--------+------------------------+



datetimes: Seq[(Int, String)] = List((20201111,11-Nov-2020 11:53:01.123), (20211212,12-Dec-2021 10:52:05.131), (20220202,02-Feb-2022 00:00:00.000))
df: org.apache.spark.sql.DataFrame = [Date: int, Time: string]


# to_date

In [71]:
// Parse the non-standard inputs into proper date / timestamp columns:
//  - Date is an Int, so it is cast to a string first and parsed with the pattern "yyyyMMdd";
//  - Time is parsed with the pattern "dd-MMM-yyyy HH:mm:ss.SSS".
// Fix: the second withColumn(...) call was missing its closing parenthesis, so the cell
// did not compile as written.
df.withColumn("toDate", to_date(col("Date").cast("String"), "yyyyMMdd"))
  .withColumn("toTime", to_timestamp(col("Time"), "dd-MMM-yyyy HH:mm:ss.SSS"))
  .show(false)

+--------+------------------------+----------+-----------------------+
|Date    |Time                    |toDate    |toTime                 |
+--------+------------------------+----------+-----------------------+
|20201111|11-Nov-2020 11:53:01.123|2020-11-11|2020-11-11 11:53:01.123|
|20211212|12-Dec-2021 10:52:05.131|2021-12-12|2021-12-12 10:52:05.131|
|20220202|02-Feb-2022 00:00:00.000|2022-02-02|2022-02-02 00:00:00    |
+--------+------------------------+----------+-----------------------+



dfNew: org.apache.spark.sql.DataFrame = [Date: int, Time: string ... 2 more fields]


# date_format

In [95]:
// Date is already in year-month-day form (the last row has no zero padding);
// Time is still in "dd-MMM-yyyy ..." form.
val datetimes: Seq[(String, String)] = Seq(
  ("2020-11-11", "11-Nov-2020 11:53:01.123"),
  ("2021-12-12", "12-Dec-2021 10:52:05.131"),
  ("2022-2-2", "2-Feb-2022 00:00:00.000")
)

val df = datetimes.toDF(Seq("Date", "Time"): _*)
df.show(truncate = false)

+----------+------------------------+
|Date      |Time                    |
+----------+------------------------+
|2020-11-11|11-Nov-2020 11:53:01.123|
|2021-12-12|12-Dec-2021 10:52:05.131|
|2022-2-2  |2-Feb-2022 00:00:00.000 |
+----------+------------------------+



datetimes: Seq[(String, String)] = List((2020-11-11,11-Nov-2020 11:53:01.123), (2021-12-12,12-Dec-2021 10:52:05.131), (2022-2-2,2-Feb-2022 00:00:00.000))
df: org.apache.spark.sql.DataFrame = [Date: string, Time: string]


In [96]:
// Extract just the year from the Date column.
// The equivalent call on the Time column was commented out in the original cell and is kept disabled:
//   .withColumn("toTime", date_format(col("Time"), "yyyy"))
df.withColumn("toDate", date_format($"Date", "yyyy"))
  .show(truncate = false)

+----------+------------------------+------+
|Date      |Time                    |toDate|
+----------+------------------------+------+
|2020-11-11|11-Nov-2020 11:53:01.123|2020  |
|2021-12-12|12-Dec-2021 10:52:05.131|2021  |
|2022-2-2  |2-Feb-2022 00:00:00.000 |2022  |
+----------+------------------------+------+



In [87]:
// Both columns are now in year-month-day form (the last row has no zero padding).
val datetimes: Seq[(String, String)] = Seq(
  ("2020-11-11", "2020-11-11 11:53:01.123"),
  ("2021-12-12", "2021-12-12 10:52:05.131"),
  ("2022-2-2", "2022-2-2 00:00:00.000")
)

val df = datetimes.toDF(Seq("Date", "Time"): _*)
df.show(truncate = false)

+----------+-----------------------+
|Date      |Time                   |
+----------+-----------------------+
|2020-11-11|2020-11-11 11:53:01.123|
|2021-12-12|2021-12-12 10:52:05.131|
|2022-2-2  |2022-2-2 00:00:00.000  |
+----------+-----------------------+



datetimes: Seq[(String, String)] = List((2020-11-11,2020-11-11 11:53:01.123), (2021-12-12,2021-12-12 10:52:05.131), (2022-2-2,2022-2-2 00:00:00.000))
df: org.apache.spark.sql.DataFrame = [Date: string, Time: string]


In [88]:
// Re-format both columns as compact yyyyMMddHHmmss digits.
// In the recorded output, the date-only column gets 000000 for the time part.
df.withColumn("toDate", date_format($"Date", "yyyyMMddHHmmss"))
  .withColumn("toTime", date_format($"Time", "yyyyMMddHHmmss"))
  .show(truncate = false)

+----------+-----------------------+--------------+--------------+
|Date      |Time                   |toDate        |toTime        |
+----------+-----------------------+--------------+--------------+
|2020-11-11|2020-11-11 11:53:01.123|20201111000000|20201111115301|
|2021-12-12|2021-12-12 10:52:05.131|20211212000000|20211212105205|
|2022-2-2  |2022-2-2 00:00:00.000  |20220202000000|20220202000000|
+----------+-----------------------+--------------+--------------+



In [89]:
// Long form: full month name, day without zero padding, four-digit year.
df.withColumn("toDate", date_format($"Date", "MMMM d, yyyy"))
  .show(truncate = false)

+----------+-----------------------+-----------------+
|Date      |Time                   |toDate           |
+----------+-----------------------+-----------------+
|2020-11-11|2020-11-11 11:53:01.123|November 11, 2020|
|2021-12-12|2021-12-12 10:52:05.131|December 12, 2021|
|2022-2-2  |2022-2-2 00:00:00.000  |February 2, 2022 |
+----------+-----------------------+-----------------+



In [97]:
// "E" yields the abbreviated day of week (Wed, Sun, ...).
// NOTE(review): the recorded output shows Time values in "dd-MMM-yyyy" form, so this cell appears
// to have been executed against the earlier `df` (cell 95), not the one defined just above it.
df.withColumn("toDate", date_format($"Date", "E"))
  .show(truncate = false)

+----------+------------------------+------+
|Date      |Time                    |toDate|
+----------+------------------------+------+
|2020-11-11|11-Nov-2020 11:53:01.123|Wed   |
|2021-12-12|12-Dec-2021 10:52:05.131|Sun   |
|2022-2-2  |2-Feb-2022 00:00:00.000 |Wed   |
+----------+------------------------+------+



# date_add / date_sub

In [92]:
// Shift both columns five days forward (date_add) and five days back (date_sub).
// Per the recorded output, the results for the Time column carry no time-of-day part.
df.withColumn("addDate", date_add($"Date", 5))
  .withColumn("addTime", date_add($"Time", 5))
  .withColumn("subDate", date_sub($"Date", 5))
  .withColumn("subTime", date_sub($"Time", 5))
  .show(truncate = false)

+----------+-----------------------+----------+----------+----------+----------+
|Date      |Time                   |addDate   |addTime   |subDate   |subTime   |
+----------+-----------------------+----------+----------+----------+----------+
|2020-11-11|2020-11-11 11:53:01.123|2020-11-16|2020-11-16|2020-11-06|2020-11-06|
|2021-12-12|2021-12-12 10:52:05.131|2021-12-17|2021-12-17|2021-12-07|2021-12-07|
|2022-2-2  |2022-2-2 00:00:00.000  |2022-02-07|2022-02-07|2022-01-28|2022-01-28|
+----------+-----------------------+----------+----------+----------+----------+



# datediff

In [94]:
// Number of days between "now" and each stored value.
// Both columns are computed from the current date/time, so the figures change from run to run.
df.withColumn("dateDifference", datediff(current_date(), $"Date"))
  .withColumn("timeDifference", datediff(current_timestamp(), $"Time"))
  .show(truncate = false)

+----------+-----------------------+--------------+--------------+
|Date      |Time                   |dateDifference|timeDifference|
+----------+-----------------------+--------------+--------------+
|2020-11-11|2020-11-11 11:53:01.123|729           |729           |
|2021-12-12|2021-12-12 10:52:05.131|333           |333           |
|2022-2-2  |2022-2-2 00:00:00.000  |281           |281           |
+----------+-----------------------+--------------+--------------+

