# Multi-table operations made simple with DiscoverX

In [0]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Reuse the already-active Spark session when there is one; otherwise build it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# 1️⃣ Products Table - 5 rows
# Each row is (product_id, name, category, price).
products_data = [
    (101, "Laptop", "Electronics", 850.50),
    (102, "Book", "Education", 15.99),
    (103, "T-Shirt", "Clothing", 25.49),
    (104, "Phone", "Electronics", 699.00),
    (105, "Shoes", "Footwear", 49.99),
]
# Built field by field; StructType.add leaves every column nullable by default.
products_schema = (
    StructType()
    .add("product_id", IntegerType())
    .add("name", StringType())
    .add("category", StringType())
    .add("price", DoubleType())
)
# "overwrite" replaces any existing delta_products table when the cell is re-run.
(
    spark.createDataFrame(products_data, products_schema)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("delta_products")
)


# 2️⃣ Employees Table - 3 rows
# Each row is (emp_id, name, department, salary).
employees_data = [
    (1, "Alice", "Engineering", 75000),
    (2, "Bob", "Sales", 55000),
    (3, "Charlie", "HR", 48000),
]
# Built field by field; StructType.add leaves every column nullable by default.
employees_schema = (
    StructType()
    .add("emp_id", IntegerType())
    .add("name", StringType())
    .add("department", StringType())
    .add("salary", IntegerType())
)
# "overwrite" replaces any existing delta_employees table when the cell is re-run.
(
    spark.createDataFrame(employees_data, employees_schema)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("delta_employees")
)


# 3️⃣ Sales Table - 10 rows
# Each row is (sale_id, product_id, quantity, sale_date); dates are kept as
# ISO-formatted strings, exactly as the schema below declares them.
sales_data = [
    (1, 101, 2, "2024-01-01"),
    (2, 103, 5, "2024-02-10"),
    (3, 102, 1, "2024-03-05"),
    (4, 104, 3, "2024-03-15"),
    (5, 105, 2, "2024-03-20"),
    (6, 101, 1, "2024-03-22"),
    (7, 102, 4, "2024-03-25"),
    (8, 103, 2, "2024-04-01"),
    (9, 104, 1, "2024-04-02"),
    (10, 105, 6, "2024-04-04"),
]
# Built field by field; StructType.add leaves every column nullable by default.
sales_schema = (
    StructType()
    .add("sale_id", IntegerType())
    .add("product_id", IntegerType())
    .add("quantity", IntegerType())
    .add("sale_date", StringType())
)
# "overwrite" replaces any existing delta_sales table when the cell is re-run.
(
    spark.createDataFrame(sales_data, sales_schema)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("delta_sales")
)


# 4️⃣ Customers Table - 2 rows
# Each row is (customer_id, name, email, country).
customers_data = [
    (1001, "Daniel", "daniel@example.com", "India"),
    (1002, "Emma", "emma@example.com", "USA"),
]
# Built field by field; StructType.add leaves every column nullable by default.
customers_schema = (
    StructType()
    .add("customer_id", IntegerType())
    .add("name", StringType())
    .add("email", StringType())
    .add("country", StringType())
)
# "overwrite" replaces any existing delta_customers table when the cell is re-run.
(
    spark.createDataFrame(customers_data, customers_schema)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("delta_customers")
)


# 5️⃣ Transactions Table - 7 rows
# Each row is (txn_id, customer_id, product_id, txn_date, status); dates are
# kept as ISO-formatted strings, exactly as the schema below declares them.
transactions_data = [
    ("TX100", 1001, 101, "2024-04-01", "Completed"),
    ("TX101", 1002, 103, "2024-04-03", "Pending"),
    ("TX102", 1001, 102, "2024-04-04", "Failed"),
    ("TX103", 1002, 104, "2024-04-05", "Completed"),
    ("TX104", 1001, 105, "2024-04-06", "Completed"),
    ("TX105", 1002, 101, "2024-04-07", "Pending"),
    ("TX106", 1001, 103, "2024-04-08", "Failed"),
]
# Built field by field; StructType.add leaves every column nullable by default.
transactions_schema = (
    StructType()
    .add("txn_id", StringType())
    .add("customer_id", IntegerType())
    .add("product_id", IntegerType())
    .add("txn_date", StringType())
    .add("status", StringType())
)
# "overwrite" replaces any existing delta_transactions table when the cell is re-run.
(
    spark.createDataFrame(transactions_data, transactions_schema)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("delta_transactions")
)


In [0]:
%pip install dbl-discoverx


Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
# Restart the Python process so the freshly pip-installed package can be
# imported. Python variables defined in earlier cells do not survive the
# restart; the Delta tables written above are persisted and remain available.
dbutils.library.restartPython()


In [0]:
# Imported here rather than at the top of the notebook because the package is
# only installed (and the interpreter restarted) in the preceding cells.
from discoverx import DX
# Single DiscoverX entry point reused by every multi-table operation below.
dx = DX()

In [0]:
# Table selector in "catalog.schema.table" form; "*" is a wildcard.
# Deliberately limited to the demo tables created above (all named
# "delta_..."): this same selector is reused by the DROP TABLE cells further
# down, so a bare "workspace.default.*" would also drop every unrelated table
# that happens to live in the schema.
from_tables = "workspace.default.delta_*"

# explain() only previews the SQL generated for each matching table; nothing
# is executed against the tables at this point.
dx.from_tables(from_tables).with_sql("""SELECT COUNT(*) FROM {full_table_name}""").explain()


DiscoverX will apply the following SQL template

SELECT COUNT(*) FROM {full_table_name}

to the tables in the following catalog, schema, table combinations:
workspace.default.*

The SQL to be executed is (just a moment, generating it...):


In [0]:
# Run the row-count template once per matching table; apply() returns a single
# DataFrame holding each result alongside the table's catalog/schema/name.
table_counts = (
    dx.from_tables(from_tables)
    .with_sql("""SELECT COUNT(*) FROM {full_table_name}""")
    .apply()
)
table_counts.display()


count(1),table_catalog,table_schema,table_name
7,workspace,default,delta_transactions
10,workspace,default,delta_sales
3,workspace,default,delta_employees
2,workspace,default,delta_customers
5,workspace,default,delta_products


In [0]:
# Dry run: preview the DROP statements for every table matched by
# `from_tables` before actually executing them in the next cell.
(
    dx.from_tables(from_tables)
    .with_sql("""Drop table {full_table_name}""")
    .explain()
)

DiscoverX will apply the following SQL template

Drop table {full_table_name}

to the tables in the following catalog, schema, table combinations:
workspace.default.*

The SQL to be executed is (just a moment, generating it...):


In [0]:
# DESTRUCTIVE: drops every table matched by `from_tables`. Check the explain()
# output of the previous cell before running this one.
(
    dx.from_tables(from_tables)
    .with_sql("""Drop table {full_table_name}""")
    .apply()
)

DataFrame[table_catalog: string, table_schema: string, table_name: string]