### Create a sample Dataframe

In [0]:
employee_data = [
    ('10', 'John Smith', '2022-01-15', 'D01', 75000),
    ('11', 'Maria Garcia', '2021-07-10', 'D02', 82000),
    ('12', 'James Miller', '2020-03-22', 'D03', 91000),
    ('13', 'Linda Johnson', '2019-11-05', 'D01', 67000),
    ('14', 'Robert Brown', '2023-05-30', 'D02', 88000)
]
employee_schema = ['employee_id', 'name', 'doj', 'employee_dept_id', 'salary']

empDF= spark.createDataFrame(employee_data, employee_schema)
display(empDF)

employee_id,name,doj,employee_dept_id,salary
10,John Smith,2022-01-15,D01,75000
11,Maria Garcia,2021-07-10,D02,82000
12,James Miller,2020-03-22,D03,91000
13,Linda Johnson,2019-11-05,D01,67000
14,Robert Brown,2023-05-30,D02,88000


### First Method Of Split

In [0]:
from pyspark.sql.functions import split

# df1 = empDF.withColumn('First_Name', split(empDF['name'], ' ').getItem(0)) \
#     .withColumn('Last_Name', split(empDF['name'], ' ').getItem(1)).drop("name")

df1 = empDF.withColumn('First_Name', split("name", ' ').getItem(0)) \
    .withColumn('Last_Name', split("name", ' ').getItem(1))

display(df1)

employee_id,name,doj,employee_dept_id,salary,First_Name,Last_Name
10,John Smith,2022-01-15,D01,75000,John,Smith
11,Maria Garcia,2021-07-10,D02,82000,Maria,Garcia
12,James Miller,2020-03-22,D03,91000,James,Miller
13,Linda Johnson,2019-11-05,D01,67000,Linda,Johnson
14,Robert Brown,2023-05-30,D02,88000,Robert,Brown


In [0]:
display(empDF.withColumn('First_Name', split('name', ' ').getItem(0)) \
    .withColumn('Last_Name', split("name", ' ').getItem(1)).drop('name'))

employee_id,doj,employee_dept_id,salary,First_Name,Last_Name
10,2022-01-15,D01,75000,John,Smith
11,2021-07-10,D02,82000,Maria,Garcia
12,2020-03-22,D03,91000,James,Miller
13,2019-11-05,D01,67000,Linda,Johnson
14,2023-05-30,D02,88000,Robert,Brown


### Second Method of Split

In [0]:
import pyspark
split_col = pyspark.sql.functions.split(empDF['name'], ' ')

df2 = empDF.withColumn('First_Name', split_col.getItem(0)) \
    .withColumn('Last_Name', split_col.getItem(1))

display(df2)

employee_id,name,doj,employee_dept_id,salary,First_Name,Last_Name
10,John Smith,2022-01-15,D01,75000,John,Smith
11,Maria Garcia,2021-07-10,D02,82000,Maria,Garcia
12,James Miller,2020-03-22,D03,91000,James,Miller
13,Linda Johnson,2019-11-05,D01,67000,Linda,Johnson
14,Robert Brown,2023-05-30,D02,88000,Robert,Brown


In [0]:
display(empDF.withColumn('First_Name', split_col.getItem(0)) \
    .withColumn('Last_Name', split_col.getItem(1)).drop('name'))

employee_id,doj,employee_dept_id,salary,First_Name,Last_Name
10,2022-01-15,D01,75000,John,Smith
11,2021-07-10,D02,82000,Maria,Garcia
12,2020-03-22,D03,91000,James,Miller
13,2019-11-05,D01,67000,Linda,Johnson
14,2023-05-30,D02,88000,Robert,Brown


### Third Method of Split

In [0]:
split_col = pyspark.sql.functions.split(empDF['doj'], '-')

df3 = empDF.select("employee_id", "name", "employee_dept_id", "salary",
                   split_col.getItem(0).alias("Year_of_Joining"), split_col.getItem(1).alias("Month_of_Joining"), split_col.getItem(2).alias("Day_of_Joining"))
display(df3)

employee_id,name,employee_dept_id,salary,Year_of_Joining,Month_of_Joining,Day_of_Joining
10,John Smith,D01,75000,2022,1,15
11,Maria Garcia,D02,82000,2021,7,10
12,James Miller,D03,91000,2020,3,22
13,Linda Johnson,D01,67000,2019,11,5
14,Robert Brown,D02,88000,2023,5,30


### Combine Multiple Splits

In [0]:
df4 = empDF.withColumn('First_Name', split('name', ' ').getItem(0))\
    .withColumn('Last_Name', split('name', ' ').getItem(1))\
        .withColumn('Year_of_Joining', split('doj', '-').getItem(0))\
            .withColumn('Year_of_Month', split('doj', '-').getItem(1))\
                .withColumn('Year_of_Day', split('doj', '-').getItem(2))\
                    .drop('name', 'doj') # dropping splitted columns here

display(df4)

employee_id,employee_dept_id,salary,First_Name,Last_Name,Year_of_Joining,Year_of_Month,Year_of_Day
10,D01,75000,John,Smith,2022,1,15
11,D02,82000,Maria,Garcia,2021,7,10
12,D03,91000,James,Miller,2020,3,22
13,D01,67000,Linda,Johnson,2019,11,5
14,D02,88000,Robert,Brown,2023,5,30
