### Create Sample Dataframe

In [0]:
# Column names for the sample scores DataFrame.
array_schema = ["Name", "Score_1", "Score_2"]

# Fifteen (Name, Score_1, Score_2) records: the same five names repeated
# three times, each time with a different pair of scores.
array_data = [
    # first record per name
    ("Alice", 85, 90), ("Bob", 78, 88), ("Charlie", 92, 95), ("Diana", 80, 84), ("Ethan", 88, 91),
    # second record per name
    ("Alice", 87, 93), ("Bob", 79, 85), ("Charlie", 90, 94), ("Diana", 82, 86), ("Ethan", 89, 92),
    # third record per name
    ("Alice", 88, 91), ("Bob", 80, 87), ("Charlie", 91, 96), ("Diana", 83, 85), ("Ethan", 87, 90),
]

# `spark` and `display` are supplied by the notebook runtime.
arrayDF = spark.createDataFrame(data=array_data, schema=array_schema)
display(arrayDF)

Name,Score_1,Score_2
Alice,85,90
Bob,78,88
Charlie,92,95
Diana,80,84
Ethan,88,91
Alice,87,93
Bob,79,85
Charlie,90,94
Diana,82,86
Ethan,89,92


### Aggregate the Sample DataFrame into Array Columns (One Row per Name)

In [0]:
from pyspark.sql import functions as F

# Collapse the per-record rows into one row per Name, gathering each score
# column into its own array column.
masterDF = (
    arrayDF
    .groupBy("Name")
    .agg(
        F.collect_list("Score_1").alias("Array_Score_1"),
        F.collect_list("Score_2").alias("Array_Score_2"),
    )
)
display(masterDF)

# Print the schema to confirm both aggregated columns are arrays.
masterDF.printSchema()

Name,Array_Score_1,Array_Score_2
Alice,"List(85, 87, 88)","List(90, 93, 91)"
Charlie,"List(92, 90, 91)","List(95, 94, 96)"
Bob,"List(78, 79, 80)","List(88, 85, 87)"
Diana,"List(80, 82, 83)","List(84, 86, 85)"
Ethan,"List(88, 89, 87)","List(91, 92, 90)"


root
 |-- Name: string (nullable = true)
 |-- Array_Score_1: array (nullable = false)
 |    |-- element: long (containsNull = false)
 |-- Array_Score_2: array (nullable = false)
 |    |-- element: long (containsNull = false)



### Apply the `arrays_zip` Function to the Array DataFrame

In [0]:
# Pair the two score arrays position-by-position into a single
# "Zipped_Value" array column (first with first, second with second, ...).
arr_zip_df = masterDF.withColumn(
    "Zipped_Value",
    F.arrays_zip(F.col("Array_Score_1"), F.col("Array_Score_2")),
)
display(arr_zip_df)

Name,Array_Score_1,Array_Score_2,Zipped_Value
Alice,"List(85, 87, 88)","List(90, 93, 91)","List(List(85, 90), List(87, 93), List(88, 91))"
Charlie,"List(92, 90, 91)","List(95, 94, 96)","List(List(92, 95), List(90, 94), List(91, 96))"
Bob,"List(78, 79, 80)","List(88, 85, 87)","List(List(78, 88), List(79, 85), List(80, 87))"
Diana,"List(80, 82, 83)","List(84, 86, 85)","List(List(80, 84), List(82, 86), List(83, 85))"
Ethan,"List(88, 89, 87)","List(91, 92, 90)","List(List(88, 91), List(89, 92), List(87, 90))"


In [0]:
# arrays_zip with a single input array: every element of Array_Score_1
# ends up wrapped in its own one-field entry.
display(
    masterDF.withColumn(
        "Zipped_Value",
        F.arrays_zip(F.col("Array_Score_1")),
    )
)

Name,Array_Score_1,Array_Score_2,Zipped_Value
Alice,"List(85, 87, 88)","List(90, 93, 91)","List(List(85), List(87), List(88))"
Charlie,"List(92, 90, 91)","List(95, 94, 96)","List(List(92), List(90), List(91))"
Bob,"List(78, 79, 80)","List(88, 85, 87)","List(List(78), List(79), List(80))"
Diana,"List(80, 82, 83)","List(84, 86, 85)","List(List(80), List(82), List(83))"
Ethan,"List(88, 89, 87)","List(91, 92, 90)","List(List(88), List(89), List(87))"


## Practical Use Case to Flatten Data using arrays_zip and explode

### Create a Sample Dataframe

In [0]:


# Each department is a list of employee records; every record carries the
# same four keys: emp_name, salary, years_of_service, age.
sales_dept = [
    dict(emp_name="John Smith", salary=70000, years_of_service=5, age=32),
    dict(emp_name="Maria Garcia", salary=75000, years_of_service=6, age=29),
    dict(emp_name="James Miller", salary=68000, years_of_service=4, age=35),
]
hr_dept = [
    dict(emp_name="Linda Johnson", salary=65000, years_of_service=7, age=38),
    dict(emp_name="Robert Brown", salary=63000, years_of_service=3, age=30),
    dict(emp_name="Emily Clark", salary=67000, years_of_service=5, age=33),
]

# One row per department: (department label, list of employee records).
empDF_data = [("Sales_dept", sales_dept), ("Hr_dept", hr_dept)]

# The schema is only a list of column names, so the column types are
# inferred by Spark from the Python values.
empDF = spark.createDataFrame(data=empDF_data, schema=["Department", "Employee"])
display(empDF)


Department,Employee
Sales_dept,"List(Map(emp_name -> John Smith, salary -> 70000, years_of_service -> 5, age -> 32), Map(emp_name -> Maria Garcia, salary -> 75000, years_of_service -> 6, age -> 29), Map(emp_name -> James Miller, salary -> 68000, years_of_service -> 4, age -> 35))"
Hr_dept,"List(Map(emp_name -> Linda Johnson, salary -> 65000, years_of_service -> 7, age -> 38), Map(emp_name -> Robert Brown, salary -> 63000, years_of_service -> 3, age -> 30), Map(emp_name -> Emily Clark, salary -> 67000, years_of_service -> 5, age -> 33))"


### Apply arrays_zip

In [0]:
# Zip the single "Employee" array column into a new "Zip" column.
# Passing the column name as a plain string, F.arrays_zip("Employee"),
# is an equivalent way to write this.
empDF_Zip = empDF.withColumn("Zip", F.arrays_zip(empDF["Employee"]))
display(empDF_Zip)

Department,Employee,Zip
Sales_dept,"List(Map(emp_name -> John Smith, salary -> 70000, years_of_service -> 5, age -> 32), Map(emp_name -> Maria Garcia, salary -> 75000, years_of_service -> 6, age -> 29), Map(emp_name -> James Miller, salary -> 68000, years_of_service -> 4, age -> 35))","List(List(Map(emp_name -> John Smith, salary -> 70000, years_of_service -> 5, age -> 32)), List(Map(emp_name -> Maria Garcia, salary -> 75000, years_of_service -> 6, age -> 29)), List(Map(emp_name -> James Miller, salary -> 68000, years_of_service -> 4, age -> 35)))"
Hr_dept,"List(Map(emp_name -> Linda Johnson, salary -> 65000, years_of_service -> 7, age -> 38), Map(emp_name -> Robert Brown, salary -> 63000, years_of_service -> 3, age -> 30), Map(emp_name -> Emily Clark, salary -> 67000, years_of_service -> 5, age -> 33))","List(List(Map(emp_name -> Linda Johnson, salary -> 65000, years_of_service -> 7, age -> 38)), List(Map(emp_name -> Robert Brown, salary -> 63000, years_of_service -> 3, age -> 30)), List(Map(emp_name -> Emily Clark, salary -> 67000, years_of_service -> 5, age -> 33)))"


### Apply Explode

In [0]:
# Turn each element of the "Zip" array into its own row (column "Explode"),
# then discard the two array columns that are no longer needed.
empDF_exp = (
    empDF_Zip
    .withColumn("Explode", F.explode(F.col("Zip")))
    .drop("Employee", "Zip")
)
display(empDF_exp)

Department,Explode
Sales_dept,"List(Map(emp_name -> John Smith, salary -> 70000, years_of_service -> 5, age -> 32))"
Sales_dept,"List(Map(emp_name -> Maria Garcia, salary -> 75000, years_of_service -> 6, age -> 29))"
Sales_dept,"List(Map(emp_name -> James Miller, salary -> 68000, years_of_service -> 4, age -> 35))"
Hr_dept,"List(Map(emp_name -> Linda Johnson, salary -> 65000, years_of_service -> 7, age -> 38))"
Hr_dept,"List(Map(emp_name -> Robert Brown, salary -> 63000, years_of_service -> 3, age -> 30))"
Hr_dept,"List(Map(emp_name -> Emily Clark, salary -> 67000, years_of_service -> 5, age -> 33))"


### Flatten Fields from Exploded List

In [0]:
# Promote each employee attribute to a top-level column by following the
# nested path Explode -> Employee -> <key>, then drop the nested column.
empDF_output = (
    empDF_exp
    .withColumn("Employee_Name", F.col("Explode.Employee.emp_name"))
    .withColumn("Employee_Salary", F.col("Explode.Employee.salary"))
    .withColumn("Employee_Year_of_Service", F.col("Explode.Employee.years_of_service"))
    .withColumn("Employee_Age", F.col("Explode.Employee.age"))
    # Same drop list as before; names not present in the frame are ignored.
    .drop("Employee", "Zip", "Explode")
)

display(empDF_output)

Department,Employee_Name,Employee_Salary,Employee_Year_of_Service,Employee_Age
Sales_dept,John Smith,70000,5,32
Sales_dept,Maria Garcia,75000,6,29
Sales_dept,James Miller,68000,4,35
Hr_dept,Linda Johnson,65000,7,38
Hr_dept,Robert Brown,63000,3,30
Hr_dept,Emily Clark,67000,5,33
