<a href="https://colab.research.google.com/github/VictoriaUsman/Big-Data/blob/main/Pyspark_Handling_DataFrame_Dates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, trim, to_date

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DateConversion")
    .getOrCreate()
)

In [6]:
# Sample date strings covering well-formed, messy, and invalid inputs.
# Row ids are assigned sequentially (starting at 1) in list order.
raw_dates = [
    "2023-01-15",        # id 1:  ISO format
    "15/01/2023",        # id 2:  dd/MM/yyyy
    "01-15-2023",        # id 3:  MM-dd-yyyy
    "January 15 2023",   # id 4:  Full month name
    "15 Jan 23",         # id 5:  Short month + 2-digit year
    " 15 Jan 23 ",       # id 6:  Leading & trailing spaces
    "15 Jan 23|",        # id 7:  Trailing junk character
    "2023/01/15",        # id 8:  yyyy/MM/dd
    "15.01.2023",        # id 9:  dd.MM.yyyy
    "2023-13-20",        # id 10: Invalid month
    "32 Jan 2023",       # id 11: Invalid day
    "Feb 30 2023",       # id 12: Impossible date
    "",                  # id 13: Empty string
    None,                # id 14: Null value
    "20230115",          # id 15: yyyyMMdd
]

# Pair every raw string with its 1-based id: [(1, "2023-01-15"), (2, ...), ...]
data = list(enumerate(raw_dates, start=1))

schema = ["id", "raw_date"]

df = spark.createDataFrame(data, schema)

# truncate=False keeps the full strings visible, including padding and junk.
df.show(truncate=False)


+---+---------------+
|id |raw_date       |
+---+---------------+
|1  |2023-01-15     |
|2  |15/01/2023     |
|3  |01-15-2023     |
|4  |January 15 2023|
|5  |15 Jan 23      |
|6  | 15 Jan 23     |
|7  |15 Jan 23|     |
|8  |2023/01/15     |
|9  |15.01.2023     |
|10 |2023-13-20     |
|11 |32 Jan 2023    |
|12 |Feb 30 2023    |
|13 |               |
|14 |NULL           |
|15 |20230115       |
+---+---------------+



In [9]:
from pyspark.sql.functions import (
    trim, regexp_replace, coalesce,
    try_to_timestamp, to_date, col, lit
)

# Candidate date formats, tried in this order; coalesce() keeps the first
# non-null parse, so order matters if a string could match more than one.
DATE_FORMATS = [
    "yyyy-MM-dd",
    "dd/MM/yyyy",
    "MM-dd-yyyy",
    "MMMM dd yyyy",
    "dd MMM yy",
    "yyyy/MM/dd",
    "dd.MM.yyyy",
    "yyyyMMdd",
]

# Normalise the raw string once instead of repeating the expression per format.
# The "|" junk is removed BEFORE trimming so that spaces left between the value
# and the junk character (e.g. "15 Jan 23 |") are stripped as well.
raw_date_normalized = trim(regexp_replace(col("raw_date"), r"\|", ""))

# Add "clean_date": the first successful parse across DATE_FORMATS, cast to a
# date. try_to_timestamp yields NULL for a format that does not match, so rows
# that match no format (invalid, empty, or null input) end up as NULL.
df_clean = df.withColumn(
    "clean_date",
    to_date(
        coalesce(
            *[try_to_timestamp(raw_date_normalized, lit(fmt)) for fmt in DATE_FORMATS]
        )
    ),
)


In [10]:
# Print the raw strings next to their parsed dates (show() defaults spelled out).
df_clean.show(n=20, truncate=True)

+---+---------------+----------+
| id|       raw_date|clean_date|
+---+---------------+----------+
|  1|     2023-01-15|2023-01-15|
|  2|     15/01/2023|2023-01-15|
|  3|     01-15-2023|2023-01-15|
|  4|January 15 2023|2023-01-15|
|  5|      15 Jan 23|2023-01-15|
|  6|     15 Jan 23 |2023-01-15|
|  7|     15 Jan 23||2023-01-15|
|  8|     2023/01/15|2023-01-15|
|  9|     15.01.2023|2023-01-15|
| 10|     2023-13-20|      NULL|
| 11|    32 Jan 2023|      NULL|
| 12|    Feb 30 2023|      NULL|
| 13|               |      NULL|
| 14|           NULL|      NULL|
| 15|       20230115|2023-01-15|
+---+---------------+----------+



In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()