<img src = "https://github.com/singlestore-labs/spaces-notebooks/blob/e551e274bb67bb1e5081131ee1150cdba713fc43/common/images/singlestore-jupyter.png?raw=true">

<div id="singlestore-header" style="display: flex; background-color: rgba(235, 249, 245, 0.25); padding: 5px;">
    <div id="icon-image" style="width: 90px; height: 90px;">
        <img width="100%" height="100%" src="https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/header-icons/browser.png" />
    </div>
    <div id="text" style="padding: 5px; margin-left: 10px;">
        <div id="badge" style="display: inline-block; background-color: rgba(0, 0, 0, 0.15); border-radius: 4px; padding: 4px 8px; align-items: center; margin-top: 6px; margin-bottom: -2px; font-size: 80%">SingleStore Notebooks</div>
        <h1 style="font-weight: 500; margin: 8px 0 0 4px;">Using SingleStore for Iceberg Catalog Storage</h1>
    </div>
</div>

In [5]:
# Clear pip's local cache before the package installs that follow.
!pip cache purge --quiet

[0m

In [6]:
# Install OpenJDK from the conda-forge channel without prompting (-y).
# Presumably this provides the Java runtime for the Spark session created later — confirm for your image.
!conda install -y --quiet -c conda-forge openjdk

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [7]:
!pip install pyspark --quiet

In [8]:
import os

# Directory (relative to the notebook's working directory) used as the Iceberg warehouse.
# exist_ok keeps the cell safe to re-run.
warehouse_dir = "warehouse"
os.makedirs(warehouse_dir, exist_ok=True)

In [9]:
from pyspark.sql import SparkSession

# Maven coordinates handed to Spark via spark.jars.packages:
# the SingleStore JDBC client and the Iceberg runtime for Spark 3.5 / Scala 2.12.
maven_packages = [
    "com.singlestore:singlestore-jdbc-client:1.2.3",
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2",
]

# Build the session (or reuse an existing one); the packages are passed
# as a single comma-separated string.
spark = (
    SparkSession.builder
    .appName("Spark Iceberg Catalog Test")
    .config("spark.jars.packages", ",".join(maven_packages))
    .getOrCreate()
)

# Restrict Spark logging to errors to keep the notebook output readable.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
com.singlestore#singlestore-jdbc-client added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-473f66c3-3f39-4c47-810c-88883c041389;1.0
	confs: [default]
	found com.singlestore#singlestore-jdbc-client;1.2.3 in central
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 in central
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.5.2/iceberg-spark-runtime-3.5_2.12-1.5.2.jar ...
	[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2!iceberg-spark-runtime-3.5_2.12.jar (589ms)
:: resolution report :: resolve 704ms :: artifacts dl 593ms
	:: modules in use:
	com.singlestore#singlestore-jdbc-client;1.2.3 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.2 from central in [defaul

In [12]:
import pandas as pd

# Iris CSV, addressed by a fixed gist revision so the data does not change underneath us.
# Named iris_csv_url rather than `url` because a later cell assigns `url` to the
# SQLAlchemy connection URL object; reusing one name for both is confusing on re-runs.
iris_csv_url = "https://gist.githubusercontent.com/VeryFatBoy/9af771d443f5ec4dd6eec8d69a062638/raw/c03ef25a97f23a48ee408ac02114195b663a2364/iris.csv"

# Load the CSV into a pandas DataFrame.
pandas_df = pd.read_csv(iris_csv_url)

In [13]:
# Preview the first rows. A bare last expression is rendered with the DataFrame's
# rich display instead of the plain-text dump produced by print().
pandas_df.head()

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [14]:
# Convert the pandas DataFrame into a Spark DataFrame using the active session.
iris_df = spark.createDataFrame(pandas_df)

In [15]:
# Print the first 5 rows of the Spark DataFrame as a text table.
iris_df.show(5)

                                                                                

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [16]:
%%sql
-- Start from a clean slate: remove any existing iceberg database, then recreate it.
DROP DATABASE IF EXISTS iceberg;
CREATE DATABASE IF NOT EXISTS iceberg;

<div class="alert alert-block alert-warning">
    <b class="fa fa-solid fa-exclamation-circle"></b>
    <div>
        <p><b>Action Required</b></p>
        <p>Select the <b>iceberg</b> database from the drop-down menu at the top of this notebook. Doing so updates the <b>connection_url</b> variable, which SQLAlchemy uses to connect to the selected database.</p>
    </div>
</div>

In [18]:
# Import only what is used; a star import would flood the notebook namespace
# with SQLAlchemy names and hide where they come from.
from sqlalchemy import create_engine

# connection_url is not defined in this notebook's code; it reflects the database
# selected from the drop-down menu at the top of the notebook.
db_connection = create_engine(connection_url)

# Parsed connection URL; its host, port and database are read later
# when the JDBC URI for the Iceberg catalog is assembled.
url = db_connection.url

In [19]:
from singlestoredb.management import get_secret

# Fetch the secret named "password" instead of hard-coding it; it is passed
# to the Iceberg catalog as the JDBC password in a later cell.
password = get_secret("password")

In [20]:
# Settings for the Iceberg catalog "s2_catalog": a SparkCatalog of type jdbc
# that uses the local "warehouse" directory as its warehouse location.
s2_catalog_conf = {
    "spark.sql.catalog.s2_catalog": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.s2_catalog.type": "jdbc",
    "spark.sql.catalog.s2_catalog.warehouse": "warehouse",
    # SSL/TLS configuration
    "spark.sql.catalog.s2_catalog.jdbc.useSSL": "true",
    "spark.sql.catalog.s2_catalog.jdbc.trustServerCertificate": "true",
    # JDBC connection URL, built from the SQLAlchemy URL's host, port and database
    "spark.sql.catalog.s2_catalog.uri": f"jdbc:singlestore://{url.host}:{url.port}/{url.database}",
    # JDBC credentials
    "spark.sql.catalog.s2_catalog.jdbc.user": "admin",
    "spark.sql.catalog.s2_catalog.jdbc.password": password,
}

# Apply every setting to the running session, in the order listed above.
for conf_key, conf_value in s2_catalog_conf.items():
    spark.conf.set(conf_key, conf_value)

In [21]:
# Remove any earlier copy of the table so the write that follows starts fresh.
drop_iris_stmt = """
    DROP TABLE IF EXISTS s2_catalog.db.iris
"""
spark.sql(drop_iris_stmt)

DataFrame[]

In [22]:
%%sql
-- List the tables present in the currently selected database.
SHOW TABLES;

Tables_in_iceberg
iceberg_namespace_properties
iceberg_tables


In [23]:
# Write the Iris rows as an Iceberg table registered in s2_catalog,
# partitioned by the species column.
iris_writer = iris_df.write.format("iceberg").partitionBy("species")
iris_writer.save("s2_catalog.db.iris")

                                                                                

In [24]:
# List the data files behind the table, with each file's format,
# partition value and record count.
files_query = """
    SELECT file_path, file_format, partition, record_count
    FROM s2_catalog.db.iris.files
"""
spark.sql(files_query).show()

+--------------------+-----------+-----------------+------------+
|           file_path|file_format|        partition|record_count|
+--------------------+-----------+-----------------+------------+
|warehouse/db/iris...|    PARQUET| {Iris-virginica}|          50|
|warehouse/db/iris...|    PARQUET|    {Iris-setosa}|          50|
|warehouse/db/iris...|    PARQUET|{Iris-versicolor}|          50|
+--------------------+-----------+-----------------+------------+



In [25]:
# Read a handful of rows back through the catalog to confirm the table is queryable.
sample_rows_query = """
    SELECT * FROM s2_catalog.db.iris LIMIT 5
"""
spark.sql(sample_rows_query).show()

+------------+-----------+------------+-----------+--------------+
|sepal_length|sepal_width|petal_length|petal_width|       species|
+------------+-----------+------------+-----------+--------------+
|         6.3|        3.3|         6.0|        2.5|Iris-virginica|
|         5.8|        2.7|         5.1|        1.9|Iris-virginica|
|         7.1|        3.0|         5.9|        2.1|Iris-virginica|
|         6.3|        2.9|         5.6|        1.8|Iris-virginica|
|         6.5|        3.0|         5.8|        2.2|Iris-virginica|
+------------+-----------+------------+-----------+--------------+



                                                                                

In [26]:
# Delete every row of one species; species is the partition column of this table.
delete_virginica_stmt = """
    DELETE FROM s2_catalog.db.iris
    WHERE species = 'Iris-virginica'
"""
spark.sql(delete_virginica_stmt)

DataFrame[]

In [27]:
# List the table's data files again to see which remain after the DELETE.
files_after_delete_query = """
    SELECT file_path, file_format, partition, record_count
    FROM s2_catalog.db.iris.files
"""
spark.sql(files_after_delete_query).show()

+--------------------+-----------+-----------------+------------+
|           file_path|file_format|        partition|record_count|
+--------------------+-----------+-----------------+------------+
|warehouse/db/iris...|    PARQUET|    {Iris-setosa}|          50|
|warehouse/db/iris...|    PARQUET|{Iris-versicolor}|          50|
+--------------------+-----------+-----------------+------------+



In [28]:
# Stop the Spark session; the remaining cells query SingleStore directly via %%sql.
spark.stop()

In [29]:
%%sql
-- Inspect the namespace-properties table that appeared in the iceberg database.
SELECT * FROM iceberg_namespace_properties;

catalog_name,namespace,property_key,property_value


In [30]:
%%sql
-- Inspect the table registry: one row per table with its current and previous metadata file locations.
SELECT * FROM iceberg_tables;

catalog_name,table_namespace,table_name,metadata_location,previous_metadata_location
s2_catalog,db,iris,warehouse/db/iris/metadata/00001-6ea55045-6162-4462-9f8c-597ddbc5b846.metadata.json,warehouse/db/iris/metadata/00000-39743969-9e4b-4875-81ad-d8310656d28f.metadata.json
