In [3]:
# Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Working with Strings & Dates")
    .master("local[*]")
    .getOrCreate()
)

spark

# Emp Data & Schema

emp_data = [
    ["001","101","John Doe","30","Male","50000","2015-01-01"],
    ["002","101","Jane Smith","25","Female","45000","2016-02-15"],
    ["003","102","Bob Brown","35","Male","55000","2014-05-01"],
    ["004","102","Alice Lee","28","Female","48000","2017-09-30"],
    ["005","103","Jack Chan","40","Male","60000","2013-04-01"],
    ["006","103","Jill Wong","32","Female","52000","2018-07-01"],
    ["007","101","James Johnson","42","Male","70000","2012-03-15"],
    ["008","102","Kate Kim","29","Female","51000","2019-10-01"],
    ["009","103","Tom Tan","33","Male","58000","2016-06-01"],
    ["010","104","Lisa Lee","27","Female","47000","2018-08-01"],
    ["011","104","David Park","38","Male","65000","2015-11-01"],
    ["012","105","Susan Chen","31","Female","54000","2017-02-15"],
    ["013","106","Brian Kim","45","Male","75000","2011-07-01"],
    ["014","107","Emily Lee","26","Female","46000","2019-01-01"],
    ["015","106","Michael Lee","37","Male","63000","2014-09-30"],
    ["016","107","Kelly Zhang","30","Female","49000","2018-04-01"],
    ["017","105","George Wang","34","Male","57000","2016-03-15"],
    ["018","104","Nancy Liu","29","","50000","2017-06-01"],
    ["019","103","Steven Chen","36","Male","62000","2015-08-01"],
    ["020","102","Grace Kim","32","Female","53000","2018-11-01"]
]

emp_schema = "employee_id string, department_id string, name string, age string, gender string, salary string, hire_date string"

# Create emp DataFrame

emp = spark.createDataFrame(data=emp_data, schema=emp_schema)

# Show emp dataframe (ACTION)

emp.show()

# Print Schema

emp.printSchema()


+-----------+-------------+-------------+---+------+------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|
+-----------+-------------+-------------+---+------+------+----------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|
|        010|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|
|        011|          104|   David Park| 38|  Male| 65000|2015-11-01|
|     

In [12]:
# Case When
# select employee_id, name, age, salary, gender,
# case when gender = 'Male' then 'M' when gender = 'Female' then 'F' else null end as new_gender, hire_date from emp

from pyspark.sql.functions import col, expr, when

emp_gender_fixed = emp.withColumn("new_gender", when(col("gender") == 'Male', 'M')
                                      .when(col("gender") == 'Female', 'F')
                                      .otherwise(None))

emp_gender_fixed_1 = emp.withColumn("new_gender", expr("CASE when gender == 'Male' then 'M' when gender == 'Female' then 'F' else null end"))

emp_gender_fixed_1.show()

+-----------+-------------+-------------+---+------+------+----------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|
+-----------+-------------+-------------+---+------+------+----------+----------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|         F|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|         M|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|         F|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|         M|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|         F|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|         M|
|        008|          102|     Kate Kim| 29|Female| 51000|2019-10-01|         F|
|        009|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|         M|
|        010|   

In [22]:
# Replace in Strings
# select employee_id, name, replace(name, 'J', 'Z') as new_name, age, salary, gender, new_gender, hire_date from emp_gender_fixed
from pyspark.sql.functions import regexp_replace

emp_name_fixed = emp_gender_fixed_1.withColumn("new_name", regexp_replace(col("name"), 'J', 'Z'))

emp_name_fixed.show()

+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|     new_name|
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|     Zohn Doe|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|         F|   Zane Smith|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|         M|    Bob Brown|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-09-30|         F|    Alice Lee|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|         M|    Zack Chan|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-07-01|         F|    Zill Wong|
|        007|          101|James Johnson| 42|  Male| 70000|2012-03-15|         M|Zames Zohnson|
|        008|          102|     Kate Kim

In [28]:
# Convert Date
# select *,  to_date(hire_date, 'YYYY-MM-DD') as hire_date from emp_name_fixed

from pyspark.sql.functions import to_date
emp_hire_date_fixed = emp_name_fixed.withColumn("hire_date", to_date(col("hire_date"), 'yyyy-mm-dd'))

emp_hire_date_fixed.show()
emp_hire_date_fixed.printSchema()

+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|     new_name|
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|     Zohn Doe|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-01-15|         F|   Zane Smith|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-01-01|         M|    Bob Brown|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-01-30|         F|    Alice Lee|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-01-01|         M|    Zack Chan|
|        006|          103|    Jill Wong| 32|Female| 52000|2018-01-01|         F|    Zill Wong|
|        007|          101|James Johnson| 42|  Male| 70000|2012-01-15|         M|Zames Zohnson|
|        008|          102|     Kate Kim

In [31]:
# Add Date Columns
# Add current_date, current_timestamp, extract year from hire_date
from pyspark.sql.functions import current_date, current_timestamp
emp_dated = emp_hire_date_fixed.withColumn("date_now", current_date()).withColumn("timestamp_now", current_timestamp())
emp_dated.show(truncate= False)

+-----------+-------------+-------------+---+------+------+----------+----------+-------------+----------+--------------------------+
|employee_id|department_id|name         |age|gender|salary|hire_date |new_gender|new_name     |date_now  |timestamp_now             |
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+----------+--------------------------+
|001        |101          |John Doe     |30 |Male  |50000 |2015-01-01|M         |Zohn Doe     |2025-01-06|2025-01-06 18:34:44.417812|
|002        |101          |Jane Smith   |25 |Female|45000 |2016-01-15|F         |Zane Smith   |2025-01-06|2025-01-06 18:34:44.417812|
|003        |102          |Bob Brown    |35 |Male  |55000 |2014-01-01|M         |Bob Brown    |2025-01-06|2025-01-06 18:34:44.417812|
|004        |102          |Alice Lee    |28 |Female|48000 |2017-01-30|F         |Alice Lee    |2025-01-06|2025-01-06 18:34:44.417812|
|005        |103          |Jack Chan    |40 |Male  |60000 |201

In [35]:
# Drop Null gender records

emp_dated_1 = emp_dated.na.drop()
emp_dated_1.show()

+-----------+-------------+-------------+---+------+------+----------+----------+-------------+----------+--------------------+
|employee_id|department_id|         name|age|gender|salary| hire_date|new_gender|     new_name|  date_now|       timestamp_now|
+-----------+-------------+-------------+---+------+------+----------+----------+-------------+----------+--------------------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|         M|     Zohn Doe|2025-01-06|2025-01-06 18:37:...|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-01-15|         F|   Zane Smith|2025-01-06|2025-01-06 18:37:...|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-01-01|         M|    Bob Brown|2025-01-06|2025-01-06 18:37:...|
|        004|          102|    Alice Lee| 28|Female| 48000|2017-01-30|         F|    Alice Lee|2025-01-06|2025-01-06 18:37:...|
|        005|          103|    Jack Chan| 40|  Male| 60000|2013-01-01|         M|    Zack Chan|2025-01-0

In [45]:
# Fix Null values
# select *, nvl('new_gender', 'O') as new_gender from emp_dated
from pyspark.sql.functions import coalesce, lit
emp_null_gender_df = emp.withColumn("new_gender", coalesce(col("gender"), lit("O")))
emp_null_gender_df.show(truncate=False)

+-----------+-------------+-------------+---+------+------+----------+----------+
|employee_id|department_id|name         |age|gender|salary|hire_date |new_gender|
+-----------+-------------+-------------+---+------+------+----------+----------+
|001        |101          |John Doe     |30 |Male  |50000 |2015-01-01|Male      |
|002        |101          |Jane Smith   |25 |Female|45000 |2016-02-15|Female    |
|003        |102          |Bob Brown    |35 |Male  |55000 |2014-05-01|Male      |
|004        |102          |Alice Lee    |28 |Female|48000 |2017-09-30|Female    |
|005        |103          |Jack Chan    |40 |Male  |60000 |2013-04-01|Male      |
|006        |103          |Jill Wong    |32 |Female|52000 |2018-07-01|Female    |
|007        |101          |James Johnson|42 |Male  |70000 |2012-03-15|Male      |
|008        |102          |Kate Kim     |29 |Female|51000 |2019-10-01|Female    |
|009        |103          |Tom Tan      |33 |Male  |58000 |2016-06-01|Male      |
|010        |104

In [48]:
# Drop old columns and Fix new column names

emp_drop_gender = emp_null_gender_df.drop("gender")
emp_drop_gender_1 = emp_drop_gender.withColumnRenamed("new_gender", "gender")
emp_drop_gender_1.show()

+-----------+-------------+-------------+---+------+----------+------+
|employee_id|department_id|         name|age|salary| hire_date|gender|
+-----------+-------------+-------------+---+------+----------+------+
|        001|          101|     John Doe| 30| 50000|2015-01-01|  Male|
|        002|          101|   Jane Smith| 25| 45000|2016-02-15|Female|
|        003|          102|    Bob Brown| 35| 55000|2014-05-01|  Male|
|        004|          102|    Alice Lee| 28| 48000|2017-09-30|Female|
|        005|          103|    Jack Chan| 40| 60000|2013-04-01|  Male|
|        006|          103|    Jill Wong| 32| 52000|2018-07-01|Female|
|        007|          101|James Johnson| 42| 70000|2012-03-15|  Male|
|        008|          102|     Kate Kim| 29| 51000|2019-10-01|Female|
|        009|          103|      Tom Tan| 33| 58000|2016-06-01|  Male|
|        010|          104|     Lisa Lee| 27| 47000|2018-08-01|Female|
|        011|          104|   David Park| 38| 65000|2015-11-01|  Male|
|     

In [49]:
# Write data as CSV
emp_drop_gender_1.write.mode("overwrite").format("csv").save("data/output/4/emp.csv")

In [51]:
# Bonus TIP
# Convert date into String and extract date information

from pyspark.sql.functions import date_format
emp_drop_gender_2 = emp_drop_gender_1.withColumn("date_year", date_format(col("hire_date"), "Z"))
emp_drop_gender_2.show()

+-----------+-------------+-------------+---+------+----------+------+---------+
|employee_id|department_id|         name|age|salary| hire_date|gender|date_year|
+-----------+-------------+-------------+---+------+----------+------+---------+
|        001|          101|     John Doe| 30| 50000|2015-01-01|  Male|    +0530|
|        002|          101|   Jane Smith| 25| 45000|2016-02-15|Female|    +0530|
|        003|          102|    Bob Brown| 35| 55000|2014-05-01|  Male|    +0530|
|        004|          102|    Alice Lee| 28| 48000|2017-09-30|Female|    +0530|
|        005|          103|    Jack Chan| 40| 60000|2013-04-01|  Male|    +0530|
|        006|          103|    Jill Wong| 32| 52000|2018-07-01|Female|    +0530|
|        007|          101|James Johnson| 42| 70000|2012-03-15|  Male|    +0530|
|        008|          102|     Kate Kim| 29| 51000|2019-10-01|Female|    +0530|
|        009|          103|      Tom Tan| 33| 58000|2016-06-01|  Male|    +0530|
|        010|          104| 