In [4]:
# List the tables registered in the current database
table_listing = spark.sql("SHOW TABLES")
table_listing.show()

+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|  default|teststocksymbols|      false|
+---------+----------------+-----------+



In [5]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session configured with the Delta Lake SQL
# extension and the Delta catalog
spark = (
    SparkSession.builder
    .appName("TestStockSymbols")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Column names and types for the table, expressed as a DDL string
schema = "symbol STRING, name STRING, price DOUBLE, volume INT"

# Sample rows in schema order: (symbol, name, price, volume)
data = [
    ("AAPL", "Apple Inc.", 175.64, 3000000),
    ("MSFT", "Microsoft Corp.", 341.07, 2500000),
    ("GOOGL", "Alphabet Inc.", 2724.34, 1800000),
]

# Turn the sample rows into a DataFrame with the schema above
df = spark.createDataFrame(data, schema=schema)

# Write the rows in Delta format to HDFS, replacing whatever is already there
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.save("hdfs://namenode:8020/testlakehouse/TestStockSymbols")

# Remove any existing catalog entry with this name before registering
spark.sql("DROP TABLE IF EXISTS TestStockSymbols")

# Register the Delta files at the HDFS location as a table in the Spark SQL
# catalog. Kept as the final expression so the cell still echoes its result.
spark.sql("""
    CREATE TABLE TestStockSymbols
    USING DELTA
    LOCATION 'hdfs://namenode:8020/testlakehouse/TestStockSymbols'
""")


                                                                                

24/08/06 23:22:17 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `default`.`teststocksymbols` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


DataFrame[]

In [6]:
# List the tables registered in the current database
table_listing = spark.sql("SHOW TABLES")
table_listing.show()

+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|  default|teststocksymbols|      false|
+---------+----------------+-----------+



In [7]:
# Fetch every row of the registered teststocksymbols table
select_all_sql = "SELECT * FROM teststocksymbols"
result = spark.sql(select_all_sql)

# Print the rows
result.show()

+------+---------------+-------+-------+
|symbol|           name|  price| volume|
+------+---------------+-------+-------+
|  MSFT|Microsoft Corp.| 341.07|2500000|
| GOOGL|  Alphabet Inc.|2724.34|1800000|
|  AAPL|     Apple Inc.| 175.64|3000000|
+------+---------------+-------+-------+



In [8]:
# Extra sample rows in schema order: (symbol, name, price, volume)
additional_data = [
    ("TSLA", "Tesla Inc.", 890.10, 2000000),
    ("AMZN", "Amazon.com Inc.", 139.68, 2200000),
    ("NVDA", "NVIDIA Corporation", 585.54, 1500000),
]

# Build a DataFrame from the extra rows, reusing the `schema` DDL string
# defined when the table was first created
additional_df = spark.createDataFrame(additional_data, schema=schema)

# Add the rows to the existing Delta table at its HDFS path (append mode
# keeps the rows already stored there)
append_writer = additional_df.write.format("delta").mode("append")
append_writer.save("hdfs://namenode:8020/testlakehouse/TestStockSymbols")


                                                                                

In [9]:
# Fetch every row of the registered teststocksymbols table
select_all_sql = "SELECT * FROM teststocksymbols"
result = spark.sql(select_all_sql)

# Print the rows
result.show()

+------+------------------+-------+-------+
|symbol|              name|  price| volume|
+------+------------------+-------+-------+
|  NVDA|NVIDIA Corporation| 585.54|1500000|
|  MSFT|   Microsoft Corp.| 341.07|2500000|
|  AMZN|   Amazon.com Inc.| 139.68|2200000|
| GOOGL|     Alphabet Inc.|2724.34|1800000|
|  AAPL|        Apple Inc.| 175.64|3000000|
|  TSLA|        Tesla Inc.|  890.1|2000000|
+------+------------------+-------+-------+



In [10]:
# Show the column names and types of teststocksymbols via Spark SQL
table_description = spark.sql("DESCRIBE teststocksymbols")
table_description.show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|         symbol|   string|       |
|           name|   string|       |
|          price|   double|       |
|         volume|      int|       |
|               |         |       |
| # Partitioning|         |       |
|Not partitioned|         |       |
+---------------+---------+-------+



In [11]:
import pyspark.sql.functions as F

# Load the current contents of the Delta table directly from its HDFS path
df = spark.read.format("delta").load("hdfs://namenode:8020/testlakehouse/TestStockSymbols")

# Replacement column expressions: 'price' is a random value scaled by 1000 and
# rounded to 2 decimals; 'volume' is a random value scaled by 5,000,000,
# rounded and cast to int. No seed is passed to rand(), so each run differs.
random_price = F.round(F.rand() * 1000, 2)
random_volume = F.round(F.rand() * 5000000).cast("int")
df_with_random_values = (
    df
    .withColumn("price", random_price)
    .withColumn("volume", random_volume)
)

# Replace the Delta table contents with the randomised rows
overwrite_writer = df_with_random_values.write.format("delta").mode("overwrite")
overwrite_writer.save("hdfs://namenode:8020/testlakehouse/TestStockSymbols")

# Drop and re-create the catalog entry so it is registered again against the
# same HDFS location
spark.sql("DROP TABLE IF EXISTS TestStockSymbols")
spark.sql("""
    CREATE TABLE TestStockSymbols
    USING DELTA
    LOCATION 'hdfs://namenode:8020/testlakehouse/TestStockSymbols'
""")

# Display all rows, sorted alphabetically by symbol
ordered_sql = "SELECT * FROM TestStockSymbols ORDER BY symbol ASC"
result = spark.sql(ordered_sql)
result.show()

                                                                                

24/08/06 23:23:25 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `default`.`teststocksymbols` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
+------+------------------+------+-------+
|symbol|              name| price| volume|
+------+------------------+------+-------+
|  AAPL|        Apple Inc.|745.72| 767925|
|  AMZN|   Amazon.com Inc.|860.83| 310173|
| GOOGL|     Alphabet Inc.|511.09|1973201|
|  MSFT|   Microsoft Corp.| 420.8|2969903|
|  NVDA|NVIDIA Corporation| 51.59|3710661|
|  TSLA|        Tesla Inc.|463.57|3072530|
+------+------------------+------+-------+



In [12]:
# Collect the name of every table in the current database. The names are read
# straight from the collected rows, so no DataFrame -> RDD conversion is
# needed (the RDD API is also unavailable on Spark Connect sessions).
tables = [row["tableName"] for row in spark.sql("SHOW TABLES").collect()]

# Describe each table
for table in tables:
    print(f"Describing table: {table}")
    # Backtick-quote the identifier (doubling any embedded backtick) so that
    # names containing special characters or reserved words still parse.
    quoted_table = "`" + table.replace("`", "``") + "`"
    description = spark.sql(f"DESCRIBE {quoted_table}")
    description.show(truncate=False)

Describing table: teststocksymbols
+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|symbol         |string   |       |
|name           |string   |       |
|price          |double   |       |
|volume         |int      |       |
|               |         |       |
|# Partitioning |         |       |
|Not partitioned|         |       |
+---------------+---------+-------+



In [13]:
# Ask Spark SQL which database is currently active; the value is taken from
# the first column of the first collected row
current_db_rows = spark.sql("SELECT current_database()").collect()
current_database = current_db_rows[0][0]
print(f"Current database: {current_database}")


Current database: default


In [14]:
# Display all rows of TestStockSymbols, sorted alphabetically by symbol
ordered_sql = "SELECT * FROM TestStockSymbols ORDER BY symbol ASC"
result = spark.sql(ordered_sql)
result.show()

+------+------------------+------+-------+
|symbol|              name| price| volume|
+------+------------------+------+-------+
|  AAPL|        Apple Inc.|745.72| 767925|
|  AMZN|   Amazon.com Inc.|860.83| 310173|
| GOOGL|     Alphabet Inc.|511.09|1973201|
|  MSFT|   Microsoft Corp.| 420.8|2969903|
|  NVDA|NVIDIA Corporation| 51.59|3710661|
|  TSLA|        Tesla Inc.|463.57|3072530|
+------+------------------+------+-------+

