In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session configured with the Delta Lake SQL extension
# and catalog.
# NOTE(review): getOrCreate() returns an already-running session when one
# exists; Spark then warns that only runtime SQL configurations take effect, so
# the Delta settings below may be ignored unless that session was started with
# them — confirm against the kernel's Spark configuration.
spark = (
    SparkSession.builder
    .appName("TestStockSymbols")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Single source of truth for the table's storage location, used by both the
# write and the catalog registration below.
TABLE_PATH = "hdfs://namenode:8020/testlakehouse/TestStockSymbols"

# DDL-formatted schema string shared by every DataFrame written to the table
schema = "symbol STRING, name STRING, price DOUBLE, volume INT"

# Sample rows, ordered as (symbol, name, price, volume) to match `schema`
data = [
    ("AAPL", "Apple Inc.", 175.64, 3000000),
    ("MSFT", "Microsoft Corp.", 341.07, 2500000),
    ("GOOGL", "Alphabet Inc.", 2724.34, 1800000)
]

# Create DataFrame
df = spark.createDataFrame(data, schema=schema)

# Write the DataFrame as a Delta table in HDFS, replacing any existing contents
df.write.format("delta").mode("overwrite").save(TABLE_PATH)

# Register the Delta table in the Spark SQL catalog.
# IF NOT EXISTS keeps this cell re-runnable: the catalog entry survives the
# overwrite above, so a plain CREATE TABLE fails on a second run with a
# "table already exists" error.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS TestStockSymbols
    USING DELTA
    LOCATION '{TABLE_PATH}'
""")


24/07/19 00:39:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

24/07/19 00:40:08 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

24/07/19 00:40:11 WARN ShellBasedUnixGroupsMapping: unable to return groups for user hdfs
PartialGroupNameException The user name 'hdfs' is not found. id: ‘hdfs’: no such user
id: ‘hdfs’: no such user

	at org.apache.hadoop.security.ShellBasedUnixGroupsMapping.resolvePartialGroupNames(ShellBasedUnixGroupsMapping.java:294)
	at org.apache.hadoop.security.ShellBasedUnixGroupsMapping.getUnixGroups(ShellBasedUnixGroupsMapping.java:207)
	at org.apache.hadoop.security.ShellBasedUnixGroupsMapping.getGroups(ShellBasedUnixGroupsMapping.java:97)
	at org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback.getGroups(JniBasedUnixGroupsMappingWithFallback.java:51)
	at org.apache.hadoop.security.Groups$GroupCacheLoader.fetchGroupList(Groups.java:387)
	at org.apache.hadoop.security.Groups$GroupCacheLoader.load(Groups.java:321)
	at org.apache.hadoop.security.Groups$GroupCacheLoader.load(Groups.java:270)
	at org.apache.hadoop.thirdparty.com.google.common.cache.LocalCache$LoadingValueReference.lo

DataFrame[]

In [2]:
# List every table registered in the current database
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|  default|teststocksymbols|      false|
|  default|      test_table|      false|
+---------+----------------+-----------+



In [3]:
# Read every row of the teststocksymbols table through the SQL catalog
select_all_query = "SELECT * FROM teststocksymbols"
result = spark.sql(select_all_query)

# Print the rows
result.show()

+------+---------------+-------+-------+
|symbol|           name|  price| volume|
+------+---------------+-------+-------+
|  MSFT|Microsoft Corp.| 341.07|2500000|
| GOOGL|  Alphabet Inc.|2724.34|1800000|
|  AAPL|     Apple Inc.| 175.64|3000000|
+------+---------------+-------+-------+



In [4]:
# Extra sample rows, ordered as (symbol, name, price, volume) to match `schema`
additional_data = [
    ("TSLA", "Tesla Inc.", 890.10, 2000000),
    ("AMZN", "Amazon.com Inc.", 139.68, 2200000),
    ("NVDA", "NVIDIA Corporation", 585.54, 1500000)
]

# Build a DataFrame that reuses the schema string defined in the first cell
additional_df = spark.createDataFrame(additional_data, schema=schema)

# Append the new rows to the existing Delta table in HDFS.
# NOTE: append mode adds these rows again each time the cell is executed.
(
    additional_df.write
    .format("delta")
    .mode("append")
    .save("hdfs://namenode:8020/testlakehouse/TestStockSymbols")
)


                                                                                

In [5]:
# Re-read every row of the teststocksymbols table to confirm the append landed
select_all_query = "SELECT * FROM teststocksymbols"
result = spark.sql(select_all_query)

# Print the rows
result.show()

+------+------------------+-------+-------+
|symbol|              name|  price| volume|
+------+------------------+-------+-------+
|  NVDA|NVIDIA Corporation| 585.54|1500000|
|  AMZN|   Amazon.com Inc.| 139.68|2200000|
|  MSFT|   Microsoft Corp.| 341.07|2500000|
| GOOGL|     Alphabet Inc.|2724.34|1800000|
|  AAPL|        Apple Inc.| 175.64|3000000|
|  TSLA|        Tesla Inc.|  890.1|2000000|
+------+------------------+-------+-------+



In [7]:
# Inspect the column names, data types and partitioning of teststocksymbols
table_description = spark.sql("DESCRIBE teststocksymbols")
table_description.show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|         symbol|   string|       |
|           name|   string|       |
|          price|   double|       |
|         volume|      int|       |
|               |         |       |
| # Partitioning|         |       |
|Not partitioned|         |       |
+---------------+---------+-------+

