In [1]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

In [2]:
# Path to the JSON events file, relative to the notebook's working directory.
filename = './test_data/events.json'

# The "multiline" option is enabled so the reader accepts JSON records that
# span several lines instead of requiring one record per line.
df = (
    spark.read
    .option("multiline","true")
    .json(filename)
)

# Inspect the inferred schema, then preview the first rows.
df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- term: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- type: string (nullable = true)

+------+--------------------+-----------+--------------------+--------------+
|    id|             results|       term|      transaction_id|          type|
+------+--------------------+-----------+--------------------+--------------+
|  null|                null|      omnis|9e4bdbef-86fe-4ac...|        SEARCH|
|  null|[X00034, X00084, ...|       null|9e4bdbef-86fe-4ac...|SEARCH_RESULTS|
|X00022|                null|       null|9e4bdbef-86fe-4ac...|   REPORT_VIEW|
|X00084|                null|       null|9e4bdbef-86fe-4ac...|   REPORT_VIEW|
|  null|                null|      quasi|067bf59a-2f53-459...|        SEARCH|
|  null|[X00042, X00031, ...|       null|067bf59a-2f53-459...|SEARCH_RESULTS|
|  null|                null|   sapiente|e0bf0ee2-3b0a

In [3]:
# Explicit schema for a single event row: four nullable string columns plus
# "results", a nullable array of nullable strings.
event_schema = (
    StructType()
    .add("id", StringType(), nullable=True)
    .add("results", ArrayType(StringType(), containsNull=True), nullable=True)
    .add("term", StringType(), nullable=True)
    .add("transaction_id", StringType(), nullable=True)
    .add("type", StringType(), nullable=True)
)

# Schema for one group: a nullable string "t_id" plus "events", a nullable
# array whose elements are event_schema structs.
# NOTE(review): not referenced by any cell visible here - confirm it is used.
group_schema = (
    StructType()
    .add("t_id", StringType(), nullable=True)
    .add("events", ArrayType(event_schema, containsNull=True), nullable=True)
)

In [27]:
# Key every row by its transaction id, then turn each group's iterable of
# rows into a plain Python list so it can be inspected after collect().
rows_by_transaction = df.rdd.groupBy(lambda event: event['transaction_id'])
grouped = rows_by_transaction.mapValues(list)

def mapper(group):
    """Return a two-item ``group`` unchanged, as a ``(key, values)`` tuple.

    Args:
        group: An iterable of exactly two items.

    Returns:
        A tuple holding the same two objects, in the same order.

    Raises:
        ValueError: If ``group`` does not unpack into exactly two items.
    """
    key, members = group
    return key, members

# Pass every (transaction_id, rows) pair through `mapper`.
mapped = grouped.map(mapper)

# Collect to the driver once; the same result is reused by the loop below.
res = mapped.collect()

# Sanity check: each group's value should be a plain Python list (produced by
# mapValues(list)). Guarded so an empty input does not raise IndexError.
if res:
    print(type(res[0][1]))

print(res)

# Show each transaction's events as its own DataFrame. Iterating `res` instead
# of calling grouped.collect() again avoids re-running the groupBy shuffle as a
# second Spark job; `mapper` returns each pair unchanged, so the pairs are the
# same ones grouped.collect() would yield.
for t_id, group in res:
    print(t_id)
    spark.createDataFrame(data=group, schema=event_schema).show()

<class 'list'>
[('9e4bdbef-86fe-4ace-aa61-d29a64bbb5f4', [Row(id=None, results=None, term='omnis', transaction_id='9e4bdbef-86fe-4ace-aa61-d29a64bbb5f4', type='SEARCH'), Row(id=None, results=['X00034', 'X00084', 'X00009', 'X00071'], term=None, transaction_id='9e4bdbef-86fe-4ace-aa61-d29a64bbb5f4', type='SEARCH_RESULTS'), Row(id='X00022', results=None, term=None, transaction_id='9e4bdbef-86fe-4ace-aa61-d29a64bbb5f4', type='REPORT_VIEW'), Row(id='X00084', results=None, term=None, transaction_id='9e4bdbef-86fe-4ace-aa61-d29a64bbb5f4', type='REPORT_VIEW')]), ('067bf59a-2f53-459b-8ad0-2cb4a9639519', [Row(id=None, results=None, term='quasi', transaction_id='067bf59a-2f53-459b-8ad0-2cb4a9639519', type='SEARCH'), Row(id=None, results=['X00042', 'X00031', 'X00095'], term=None, transaction_id='067bf59a-2f53-459b-8ad0-2cb4a9639519', type='SEARCH_RESULTS')]), ('e0bf0ee2-3b0a-4095-9483-28aecd5e1437', [Row(id=None, results=None, term='sapiente', transaction_id='e0bf0ee2-3b0a-4095-9483-28aecd5e1437',