In [73]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType



In [82]:
# Reuse the active SparkSession, or start one, so this cell also works on a
# fresh kernel where `spark` has not been defined yet.
spark = SparkSession.builder.getOrCreate()

# The input is a JSON Lines file (one document per line), which is what
# spark.read.json expects by default. The "multiLine" option is only needed
# when a single JSON document spans several lines.
df = spark.read.json("file:///data/data.jsonl")

                                                                                

In [52]:
# Print the schema Spark inferred from the JSON file to see the nested layout.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- order_id: string (nullable = true)
 |    |    |-- price: double (nullable = true)
 |    |    |-- product: string (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- user_info: struct (nullable = true)
 |    |-- age: long (nullable = true)
 |    |-- contact: struct (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- phone: string (nullable = true)
 |    |-- name: string (nullable = true)



In [83]:
# Pull the top-level id plus the scalar user fields out of the nested
# `user_info` struct; the `orders` array is carried along unchanged.
user_name_col = col("user_info.name").alias("user_name")
user_age_col = col("user_info.age").alias("user_age")
user_information = df.select(col("id"), user_name_col, user_age_col, col("orders"))

In [68]:
# Preview the selected user columns (show() prints at most the first 20 rows).
user_information.show()

+---+-------------+--------+--------------------+
| id|    user_name|user_age|              orders|
+---+-------------+--------+--------------------+
|  1|User_PpdCAH8v|      48|[{ZER798, 1372.96...|
|  2|User_bdppMhcQ|      50|[{MTV175, 912.13,...|
|  3|User_Lgbgkvbe|      68|[{AC6933, 786.6, ...|
|  4|User_c6gnD7DK|      68|[{AZX833, 1293.44...|
|  5|User_KINdU4te|      31|[{FG2733, 1148.28...|
|  6|User_Bhwx9vKA|      56|[{EYO396, 709.79,...|
|  7|User_Dv4FxvtH|      18|[{BFV441, 818.65,...|
|  8|User_1zjpKmpx|      42|[{QFV408, 936.84,...|
|  9|User_ijguD6Aa|      56|[{RBK381, 494.84,...|
| 10|User_liRp8H21|      39|[{R1P967, 729.14,...|
| 11|User_xv366K0w|      57|[{DXZ600, 409.84,...|
| 12|User_7CuDM5DP|      33|[{RKZ848, 54.45, ...|
| 13|User_CqaLE0sc|      37|[{F7V397, 1199.31...|
| 14|User_lzNPGciA|      27|[{PYZ993, 1189.99...|
| 15|User_hhJZOCjC|      29|[{XYO994, 82.48, ...|
| 16|User_c3GcIhMD|      30|[{FN3260, 716.45,...|
| 17|User_Wm8ZH1Xk|      32|[{N4S181, 1063.28...|


In [84]:
# Flatten the doubly nested `user_info.contact` struct into two plain columns.
email_col = col("user_info.contact.email").alias("email")
phone_col = col("user_info.contact.phone").alias("phone")
contact_information = df.select(email_col, phone_col)


In [56]:
# Preview the flattened contact columns (show() prints at most the first 20 rows).
contact_information.show()

+---------------+------------+
|          email|       phone|
+---------------+------------+
|5trxz@oigfx.com|461-609-3880|
|hohkb@jnziw.com|407-509-7895|
|0gpmp@utdop.com|169-487-6063|
|qs1cm@jpfat.com|785-833-3742|
|zinat@pgulr.com|824-116-3284|
|jsoaq@sb6zm.com|832-808-9372|
|blm3x@ppluj.com|538-596-8053|
|ka5eu@fzybi.com|257-979-5267|
|oi8l8@6wpnm.com|671-367-4046|
|toouj@jmmbx.com|284-879-9564|
|vtenn@boddm.com|104-606-4677|
|1f6tu@fcgza.com|249-630-5977|
|uspjd@qsnej.com|511-859-9058|
|diad8@7ptgx.com|998-639-1253|
|5ffk1@mzlpd.com|165-358-9967|
|ppwzf@e3cie.com|970-650-2768|
|d2lj0@tbdci.com|709-239-2831|
|wmadc@mfvbv.com|461-723-3372|
|kecqf@mylon.com|292-745-3025|
|uxkj6@hucy3.com|138-737-3321|
+---------------+------------+
only showing top 20 rows



In [85]:
# Emit one row per element of the `orders` array: every existing column is
# kept and the exploded element is appended as a new `order` struct column.
df_orders = user_information.select("*", explode("orders").alias("order"))

In [58]:
# Inspect the schema after explode: the original `orders` array is still
# present and a new `order` struct column holds one order per row.
df_orders.printSchema()

root
 |-- id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- user_age: long (nullable = true)
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- order_id: string (nullable = true)
 |    |    |-- price: double (nullable = true)
 |    |    |-- product: string (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- order: struct (nullable = true)
 |    |-- order_id: string (nullable = true)
 |    |-- price: double (nullable = true)
 |    |-- product: string (nullable = true)
 |    |-- quantity: long (nullable = true)



In [86]:
# Build the flat orders table straight from `df` rather than re-binding
# `df_orders` to a projection of itself. The self-referential form could not
# be re-run: on a second execution the `order` struct column no longer
# existed and the select failed. The resulting rows and columns are the same.
# NOTE(review): `price` and the user `id` are not carried over - confirm that
# orders never need to be priced or joined back to their user downstream.
df_orders = (
    df.select(explode(col("orders")).alias("order"))
    .select(
        col("order.order_id").alias("order_id"),
        col("order.product").alias("product"),
        col("order.quantity").alias("quantity"),
    )
)

In [60]:
# Preview the flattened orders table (show() prints at most the first 20 rows).
df_orders.show()

+--------+----------+--------+
|order_id|   product|quantity|
+--------+----------+--------+
|  ZER798|   Monitor|       1|
|  MTV175|   Monitor|       1|
|  KAU818|Microphone|       5|
|  XXP895|Headphones|       1|
|  AC6933|   Speaker|       5|
|  HYF246|  Keyboard|       3|
|  AZX833|    Laptop|       3|
|  DHA249|Microphone|       2|
|  TFI728|    Camera|       1|
|  FG2733|Microphone|       1|
|  XAV777|Microphone|       3|
|  BDF204|     Mouse|       5|
|  EYO396|    Webcam|       5|
|  BFV441|    Camera|       1|
|  QFV408|     Mouse|       1|
|  0C7696|Headphones|       2|
|  D16143|   Printer|       5|
|  RBK381|Microphone|       2|
|  3IN825|Headphones|       4|
|  R1P967|   Speaker|       1|
+--------+----------+--------+
only showing top 20 rows



In [87]:
# The orders are handled in `df_orders`, so remove the array column from the
# user table before it is written out.
columns_to_remove = ["orders"]
user_information = user_information.drop(*columns_to_remove)

In [78]:
# Confirm that only the scalar user columns remain after dropping `orders`.
user_information.show()

+---+-------------+--------+
| id|    user_name|user_age|
+---+-------------+--------+
|  1|User_PpdCAH8v|      48|
|  2|User_bdppMhcQ|      50|
|  3|User_Lgbgkvbe|      68|
|  4|User_c6gnD7DK|      68|
|  5|User_KINdU4te|      31|
|  6|User_Bhwx9vKA|      56|
|  7|User_Dv4FxvtH|      18|
|  8|User_1zjpKmpx|      42|
|  9|User_ijguD6Aa|      56|
| 10|User_liRp8H21|      39|
| 11|User_xv366K0w|      57|
| 12|User_7CuDM5DP|      33|
| 13|User_CqaLE0sc|      37|
| 14|User_lzNPGciA|      27|
| 15|User_hhJZOCjC|      29|
| 16|User_c3GcIhMD|      30|
| 17|User_Wm8ZH1Xk|      32|
| 18|User_pxTMgNcI|      54|
| 19|User_RPMmLdQ8|      61|
| 20|User_rzxtXsEN|      52|
+---+-------------+--------+
only showing top 20 rows



In [88]:
# Persist each extracted table as Parquet, replacing output from earlier runs.
# The save mode is set on the DataFrameWriter returned by `.write`; a
# DataFrame has no `.mode()` method, so `.write` must come first.
user_information.write.mode("overwrite").parquet("/data/user_information")
contact_information.write.mode("overwrite").parquet("/data/contact_information")
df_orders.write.mode("overwrite").parquet("/data/df_orders")




25/07/07 23:29:13 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/07/07 23:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/07/07 23:29:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [2]:
import json
import random
import string
import os
import time

def generate_random_string(length=10):
    """Return `length` characters drawn at random from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)

def generate_random_email():
    """Return a random lowercase address of the form xxxxx@yyyyy.com."""
    # Draw the part before the "@" first, then the domain label.
    local_part = generate_random_string(5).lower()
    domain_label = generate_random_string(5).lower()
    return local_part + "@" + domain_label + ".com"

def generate_random_phone():
    """Return a random phone number formatted as NNN-NNN-NNNN."""
    # The three groups are drawn left to right.
    first_group = random.randint(100, 999)
    second_group = random.randint(100, 999)
    last_group = random.randint(1000, 9999)
    return "{}-{}-{}".format(first_group, second_group, last_group)

def generate_random_order():
    """Build one order dict with a random id, product, quantity and price."""
    catalog = [
        "Laptop", "Mouse", "Keyboard", "Monitor", "Webcam",
        "Headphones", "Microphone", "Printer", "Speaker", "Camera",
    ]
    # Order id: three uppercased random characters followed by a 3-digit number.
    id_prefix = generate_random_string(3).upper()
    id_number = random.randint(100, 999)
    chosen_product = random.choice(catalog)
    units = random.randint(1, 5)
    # Price is a uniform draw between 10.0 and 1500.0, rounded to 2 decimals.
    unit_price = round(random.uniform(10.0, 1500.0), 2)
    return {
        "order_id": id_prefix + str(id_number),
        "product": chosen_product,
        "quantity": units,
        "price": unit_price,
    }

def generate_json_document(doc_id):
    """Build one random user document (id, user_info, orders, registration_date).

    `doc_id` is stored as a string under the "id" key.
    """
    # Every document gets between 1 and 4 orders.
    order_count = random.randint(1, 4)
    orders_list = []
    for _ in range(order_count):
        orders_list.append(generate_random_order())

    user_name = "User_" + generate_random_string(8)
    user_age = random.randint(18, 70)
    contact = {
        "email": generate_random_email(),
        "phone": generate_random_phone(),
    }

    # Date string: year 2020-2024, month 01-12, day 01-28.
    year_digit = random.randint(0, 4)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    registration_date = "202{}-{:02d}-{:02d}".format(year_digit, month, day)

    return {
        "id": str(doc_id),
        "user_info": {
            "name": user_name,
            "age": user_age,
            "contact": contact,
        },
        "orders": orders_list,
        "registration_date": registration_date,
    }

# --- Main script to generate the file ---

file_name = "/data/data.jsonl" # Using .jsonl for JSON Lines format
num_documents = 1_000_000 # One million documents

print(f"Starting to generate {num_documents} JSON documents into '{file_name}'...")
# perf_counter is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during the run.
start_time = time.perf_counter()

# Pin the encoding so the file does not depend on the platform's locale default.
with open(file_name, 'w', encoding='utf-8') as f:
    for i in range(1, num_documents + 1):
        doc = generate_json_document(i)
        f.write(json.dumps(doc) + '\n') # Write each JSON object on a new line

        if i % 100000 == 0: # Print progress every 100,000 documents
            print(f"  Generated {i:,} documents...")

end_time = time.perf_counter()
duration = end_time - start_time
print(f"Finished generating {num_documents} documents in {duration:.2f} seconds.")
# `file_name` is an absolute path, so report it as-is instead of claiming the
# file lives in the current working directory.
print(f"The file '{file_name}' is ready.")
print("\nRemember, this file will be quite large (potentially hundreds of MBs to a few GBs).")

Starting to generate 1000000 JSON documents into '/data/data.jsonl'...
  Generated 100,000 documents...
  Generated 200,000 documents...
  Generated 300,000 documents...
  Generated 400,000 documents...
  Generated 500,000 documents...
  Generated 600,000 documents...
  Generated 700,000 documents...
  Generated 800,000 documents...
  Generated 900,000 documents...
  Generated 1,000,000 documents...
Finished generating 1000000 documents in 23.89 seconds.
The file '/data/data.jsonl' is ready in your current directory.

Remember, this file will be quite large (potentially hundreds of MBs to a few GBs).


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Create (or reuse) the SparkSession used by this example
spark = (
    SparkSession.builder
    .appName("NestedJsonExtraction")
    .getOrCreate()
)

# 1. Read the JSON input
# The multiLine option lets Spark parse JSON objects that span several lines
json_reader = spark.read.option("multiLine", "true")
df = json_reader.json("data.json")

# Show the schema Spark inferred, to see how the document is nested
df.printSchema()
# Illustrative output:
# root
#  |-- id: string (nullable = true)
#  |-- orders: array (nullable = true)
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- order_id: string (nullable = true)
#  |    |    |-- product: string (nullable = true)
#  |    |    |-- quantity: long (nullable = true)
#  |-- user_info: struct (nullable = true)
#  |    |-- contact: struct (nullable = true)
#  |    |    |-- email: string (nullable = true)
#  |    |    |-- phone: string (nullable = true)
#  |    |-- name: string (nullable = true)
#  |    |-- age: long (nullable = true)

# 2. Flatten the 'user_info' struct
# Nested struct fields are addressed with dot notation
flattened_user_columns = [
    col("id"),
    col("user_info.name").alias("user_name"),
    col("user_info.age").alias("user_age"),
    col("user_info.contact.email").alias("user_email"),
    col("user_info.contact.phone").alias("user_phone"),
    col("orders"),  # the orders array is flattened in the next step
]
df_flattened_user = df.select(*flattened_user_columns)

df_flattened_user.printSchema()
# root
#  |-- id: string (nullable = true)
#  |-- user_name: string (nullable = true)
#  |-- user_age: long (nullable = true)
#  |-- user_email: string (nullable = true)
#  |-- user_phone: string (nullable = true)
#  |-- orders: array (nullable = true)
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- order_id: string (nullable = true)
#  |    |    |-- product: string (nullable = true)
#  |    |    |-- quantity: long (nullable = true)

# 3. Flatten the 'orders' array with explode
# explode produces one output row per element of the 'orders' array
exploded_order = explode("orders")
df_exploded_orders = df_flattened_user.withColumn("order", exploded_order)

df_exploded_orders.printSchema()
# root
#  |-- id: string (nullable = true)
#  |-- user_name: string (nullable = true)
#  |-- user_age: long (nullable = true)
#  |-- user_email: string (nullable = true)
#  |-- user_phone: string (nullable = true)
#  |-- orders: array (nullable = true) <-- no longer needed after the explode
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- order_id: string (nullable = true)
#  |    |    |-- product: string (nullable = true)
#  |    |    |-- quantity: long (nullable = true)
#  |-- order: struct (nullable = true) <-- new column holding a single order

# 4. Pull the individual fields out of the 'order' struct
df_final = df_exploded_orders.select(
    "id",
    "user_name",
    "user_age",
    "user_email",
    "user_phone",
    col("order.order_id").alias("order_id"),
    col("order.product").alias("product_name"),
    col("order.quantity").alias("product_quantity"),
)

df_final.show()
# Illustrative output:
# +---+---------+--------+-----------------+------------+--------+------------+----------------+
# | id|user_name|user_age|       user_email|  user_phone|order_id|product_name|product_quantity|
# +---+---------+--------+-----------------+------------+--------+------------+----------------+
# |  1|    Alice|      30|alice@example.com|123-456-7890|      A1|      Laptop|               1|
# |  1|    Alice|      30|alice@example.com|123-456-7890|      A2|       Mouse|               2|
# |  2|      Bob|      25|  bob@example.com|987-654-3210|      B1|    Keyboard|               1|
# +---+---------+--------+-----------------+------------+--------+------------+----------------+

# Shut the SparkSession down
spark.stop()