In [56]:
from uuid import uuid4
import random
import json
import os
from datetime import datetime, timedelta
import itertools
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructField,
    StructType,
    MapType,
    ArrayType,
    BooleanType,
    StringType,
    IntegerType,
    TimestampType,
)

In [57]:
# --- Size of the synthetic data set ---
NUM_TRANSACTIONS = 10  # number of distinct transaction ids to generate
NUM_EVENTS = 3  # number of events generated per transaction

# Time of the first generated event: ten hours before the reference epoch second.
START = datetime.fromtimestamp(1678772000.0) - timedelta(hours=10)

# Window length; the reports_viewed_within_limit UDF treats it as seconds.
LIMIT = 70

# --- Output files, one per pipeline stage ---
simple_events = "./test_data/simple_events.json"
simple_events_grouped = "./test_data/simple_events_grouped.json"
simple_events_grouped_final = "./test_data/simple_events_grouped_final.json"

# Accumulator for generated event dicts; serialized by write_file().
events = list()

# Running sequence number; generate_event() uses it as a minute offset from START.
counter = itertools.count(0)

def write_file(path, data=None):
    """Serialize records to ``path`` as JSON indented by four spaces.

    Args:
        path: Destination file. Missing parent directories are created. A bare
            filename (no directory component) is written relative to the
            current working directory.
        data: JSON-serializable object to write. When omitted, the
            module-level ``events`` list is written.
    """
    payload = events if data is None else data

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "w") as f:
        f.write(json.dumps(payload, indent=4))
    print(f"Done: {path}")

def generate_event(id):
    """Build one synthetic event record for the given transaction.

    Each call takes the next value from the module-level ``counter`` and
    places the event that many minutes after ``START``.

    Args:
        id: Transaction identifier stored under ``"transaction_id"``.

    Returns:
        dict with ``"transaction_id"``, ``"timestamp"`` (POSIX seconds as a
        float) and ``"id"`` (first six characters of a fresh UUID4 string).
    """
    minutes_after_start = next(counter)
    occurred_at = START + timedelta(minutes=minutes_after_start)

    event = {"transaction_id": id}
    event["timestamp"] = occurred_at.timestamp()
    event["id"] = str(uuid4())[:6]
    return event


# Generate NUM_EVENTS consecutive events for each of NUM_TRANSACTIONS
# transactions, then dump the whole list to disk.
for _ in range(NUM_TRANSACTIONS):
    transaction_id = str(uuid4())[:6]
    events.extend(generate_event(transaction_id) for _ in range(NUM_EVENTS))

write_file(simple_events)

Done: ./test_data/simple_events.json


In [58]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# The generated file is one pretty-printed JSON array, so it must be read
# in multiline mode rather than as one JSON object per line.
events_df = (
    spark.read
    .option("multiline", True)
    .json(simple_events)
)

print(events_df.count())
events_df.show(n=5)

30
+------+------------+--------------+
|    id|   timestamp|transaction_id|
+------+------------+--------------+
|0a13b9|  1.678736E9|        f4ff85|
|58ca97|1.67873606E9|        f4ff85|
|77c899|1.67873612E9|        f4ff85|
|d668f0|1.67873618E9|        45efca|
|7e8be0|1.67873624E9|        45efca|
+------+------------+--------------+
only showing top 5 rows



In [59]:
# Collapse events to one row per transaction: first and last event time, the
# constant LIMIT, and the list of (timestamp, id) structs for that transaction.
event_struct = F.struct(F.col("timestamp"), F.col("id"))
transaction_aggregates = [
    F.min("timestamp").alias("transaction_start"),
    F.max("timestamp").alias("transaction_end"),
    F.lit(LIMIT).alias("limit"),
    F.collect_list(event_struct).alias("events"),
]
transactions_df = events_df.groupBy("transaction_id").agg(*transaction_aggregates)

print(transactions_df.count())
transactions_df.show(n=5)

10
+--------------+-----------------+---------------+-----+--------------------+
|transaction_id|transaction_start|transaction_end|limit|              events|
+--------------+-----------------+---------------+-----+--------------------+
|        e9d0bc|     1.67873744E9|   1.67873756E9|   70|[{1.67873744E9, 2...|
|        740d82|     1.67873762E9|   1.67873774E9|   70|[{1.67873762E9, 2...|
|        80bb8a|     1.67873726E9|   1.67873738E9|   70|[{1.67873726E9, 1...|
|        45efca|     1.67873618E9|    1.6787363E9|   70|[{1.67873618E9, d...|
|        272929|      1.6787369E9|   1.67873702E9|   70|[{1.6787369E9, d8...|
+--------------+-----------------+---------------+-----+--------------------+
only showing top 5 rows



In [60]:
# Snapshot the grouped transactions (still carrying the "events" column)
# as a JSON array of records.
grouped_records = transactions_df.toPandas()
grouped_records.to_json(
    simple_events_grouped,
    orient="records",
    force_ascii=False,
    indent=4,
)

In [61]:
@F.udf(returnType=ArrayType(StringType()))
def reports_viewed_within_limit(events, start, limit):
    """Return the ids of events that occurred no later than ``start + limit``.

    Args:
        events: Sequence of records exposing ``"timestamp"`` (POSIX seconds)
            and ``"id"`` by key.
        start: Window start, in POSIX seconds.
        limit: Window length in seconds. The upper bound is inclusive and no
            lower bound is applied.

    Returns:
        List of ``id`` values, in input order, for events whose timestamp is
        ``<= start + limit``. Events with a null timestamp are skipped.
        Returns ``None`` when any of the three arguments is null.
    """
    if events is None or start is None or limit is None:
        return None

    # Compare raw epoch seconds instead of datetime.fromtimestamp() values.
    # Those are naive local datetimes, which are ambiguous around a DST
    # fall-back: two instants an hour apart can map to the same wall-clock
    # time, so an event outside the window could compare as inside it.
    cutoff = start + limit
    return [
        event["id"]
        for event in events
        if event["timestamp"] is not None and event["timestamp"] <= cutoff
    ]

# Replace the raw "events" list with the ids that fall inside the window.
# Guard on the derived column so this cell can be re-run: after the first run
# "events" has been dropped from transactions_df, and applying the UDF again
# would fail on the missing input column.
if "reports_viewed_within_limit" not in transactions_df.columns:
    transactions_df = transactions_df.withColumn(
        "reports_viewed_within_limit",
        reports_viewed_within_limit("events", "transaction_start", "limit"),
    ).drop("events")
transactions_df.show(n=4, truncate=False)

+--------------+-----------------+---------------+-----+---------------------------+
|transaction_id|transaction_start|transaction_end|limit|reports_viewed_within_limit|
+--------------+-----------------+---------------+-----+---------------------------+
|e9d0bc        |1.67873744E9     |1.67873756E9   |70   |[21513b, 0bfb96]           |
|740d82        |1.67873762E9     |1.67873774E9   |70   |[2d695e, 92099a]           |
|80bb8a        |1.67873726E9     |1.67873738E9   |70   |[111b3a, 799a5d]           |
|45efca        |1.67873618E9     |1.6787363E9    |70   |[d668f0, 7e8be0]           |
+--------------+-----------------+---------------+-----+---------------------------+
only showing top 4 rows



In [62]:
# Write the current transactions_df to the final output file as a JSON array
# of records, then preview the first rows.
final_records = transactions_df.toPandas()
final_records.to_json(
    simple_events_grouped_final,
    orient="records",
    force_ascii=False,
    indent=4,
)
transactions_df.show(n=4, truncate=False)

+--------------+-----------------+---------------+-----+---------------------------+
|transaction_id|transaction_start|transaction_end|limit|reports_viewed_within_limit|
+--------------+-----------------+---------------+-----+---------------------------+
|e9d0bc        |1.67873744E9     |1.67873756E9   |70   |[21513b, 0bfb96]           |
|740d82        |1.67873762E9     |1.67873774E9   |70   |[2d695e, 92099a]           |
|80bb8a        |1.67873726E9     |1.67873738E9   |70   |[111b3a, 799a5d]           |
|45efca        |1.67873618E9     |1.6787363E9    |70   |[d668f0, 7e8be0]           |
+--------------+-----------------+---------------+-----+---------------------------+
only showing top 4 rows

