<img src = "https://github.com/VeryFatBoy/notebooks/blob/main/common/images/img_github_singlestore-jupyter_featured_2.png?raw=true">

<div id="singlestore-header" style="display: flex; background-color: rgba(235, 249, 245, 0.25); padding: 5px;">
    <div id="icon-image" style="width: 90px; height: 90px;">
        <img width="100%" height="100%" src="https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/header-icons/browser.png" />
    </div>
    <div id="text" style="padding: 5px; margin-left: 10px;">
        <div id="badge" style="display: inline-block; background-color: rgba(0, 0, 0, 0.15); border-radius: 4px; padding: 4px 8px; align-items: center; margin-top: 6px; margin-bottom: -2px; font-size: 80%">SingleStore Notebooks</div>
        <h1 style="font-weight: 500; margin: 8px 0 0 4px;">Using SingleStoreDB with Delta Lake</h1>
    </div>
</div>

In [4]:
# Empty pip's local cache before the package installs below.
!pip cache purge --quiet

In [5]:
# Install OpenJDK from the conda-forge channel without prompting (-y).
# NOTE(review): presumably this provides the Java runtime needed by PySpark — confirm.
!conda install -y --quiet -c conda-forge openjdk

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [6]:
!pip install delta-spark --quiet
!pip install pyspark --quiet

In [7]:
import os
import pandas as pd
import requests
import shutil

from delta import *
from pyspark.sql import SparkSession
from singlestoredb.management import get_secret

In [8]:
# Create the local working directories; existing ones are left untouched.
for directory in ("jars", "warehouse"):
    os.makedirs(directory, exist_ok=True)

In [9]:
def download_jar(url, destination, timeout=60):
    """Download the file at ``url`` and write it to ``destination``.

    Args:
        url: HTTP(S) address of the file to fetch.
        destination: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as a ".jar".
    response.raise_for_status()
    with open(destination, "wb") as f:
        f.write(response.content)

# (download URL, local path) pairs for every JAR handed to Spark via "spark.jars".
jar_urls = [
    (
        "https://repo1.maven.org/maven2/com/singlestore/singlestore-jdbc-client/1.2.4/singlestore-jdbc-client-1.2.4.jar",
        "jars/singlestore-jdbc-client-1.2.4.jar",
    ),
    (
        "https://repo1.maven.org/maven2/com/singlestore/singlestore-spark-connector_2.12/4.1.8-spark-3.5.0/singlestore-spark-connector_2.12-4.1.8-spark-3.5.0.jar",
        "jars/singlestore-spark-connector_2.12-4.1.8-spark-3.5.0.jar",
    ),
    (
        "https://repo1.maven.org/maven2/org/apache/commons/commons-dbcp2/2.12.0/commons-dbcp2-2.12.0.jar",
        "jars/commons-dbcp2-2.12.0.jar",
    ),
    (
        "https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.12.0/commons-pool2-2.12.0.jar",
        "jars/commons-pool2-2.12.0.jar",
    ),
    (
        "https://repo1.maven.org/maven2/io/spray/spray-json_3/1.3.6/spray-json_3-1.3.6.jar",
        "jars/spray-json_3-1.3.6.jar",
    ),
]

# Fetch each JAR to its local path, one after another.
for jar_url, jar_path in jar_urls:
    download_jar(jar_url, jar_path)

print("JAR files downloaded successfully")

JAR files downloaded successfully


In [10]:
# Comma-separated list of the local JAR paths recorded in jar_urls.
jar_paths = ",".join(path for _, path in jar_urls)

# Session builder with the Delta Lake SQL extension and catalog enabled.
builder = (
    SparkSession.builder
    .appName("Spark Delta Lake Test")
    .config("spark.jars", jar_paths)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep Spark's log output down to errors only.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5c69f557-300e-49c7-9770-e0502167c50a;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.1 in central
	found io.delta#delta-storage;3.2.1 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 199ms :: artifacts dl 11ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.1 from central in [default]
	io.delta#delta-storage;3.2.1 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0

In [11]:
# Location of the Iris CSV. It gets its own name because `url` is bound to
# the SQLAlchemy connection URL later in the notebook.
iris_csv_url = "https://gist.githubusercontent.com/VeryFatBoy/9af771d443f5ec4dd6eec8d69a062638/raw/c03ef25a97f23a48ee408ac02114195b663a2364/iris.csv"

# Read the CSV straight from the web into a pandas DataFrame.
iris_df = pd.read_csv(iris_csv_url)

In [12]:
# Preview the first five rows (head() defaults to n=5).
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [14]:
%%sql
-- Start from a clean slate: drop any existing iris_db, then create it again.
DROP DATABASE IF EXISTS iris_db;
CREATE DATABASE IF NOT EXISTS iris_db;

<div class="alert alert-block alert-warning">
    <b class="fa fa-solid fa-exclamation-circle"></b>
    <div>
        <p><b>Action Required</b></p>
        <p>Select the database from the drop-down menu at the top of this notebook. It updates the <b>connection_url</b> which is used by SQLAlchemy to make connections to the selected database.</p>
    </div>
</div>

In [16]:
# Import only the name that is used; a wildcard import would flood the
# notebook namespace and could shadow existing names.
from sqlalchemy import create_engine

# connection_url is supplied by the notebook environment and reflects the
# database selected in the drop-down menu.
db_connection = create_engine(connection_url)

# URL object of the engine; its host and port are read when configuring Spark.
url = db_connection.url

In [17]:
# Store the pandas DataFrame as table "iris", replacing the table if it
# already exists; the index is not written and rows go in chunks of 1000.
iris_df.to_sql(name="iris", con=db_connection, if_exists="replace", index=False, chunksize=1000)

150

In [18]:
# Password comes from the secret named "password".
password = get_secret("password")

# Endpoint in "host:port" form, taken from the SQLAlchemy URL.
host, port = url.host, url.port
cluster = ":".join((host, str(port)))

In [19]:
# Runtime settings read by the SingleStore Spark connector.
connector_settings = {
    "spark.datasource.singlestore.ddlEndpoint": cluster,
    "spark.datasource.singlestore.user": "admin",
    "spark.datasource.singlestore.password": password,
    "spark.datasource.singlestore.disablePushdown": "false",
}

for setting, value in connector_settings.items():
    spark.conf.set(setting, value)

In [20]:
# Load table iris_db.iris through the "singlestore" data source.
# From here on iris_df is a Spark DataFrame, replacing the pandas one.
singlestore_reader = spark.read.format("singlestore")
iris_df = singlestore_reader.load("iris_db.iris")

In [21]:
# Print the first five rows of the Spark DataFrame.
iris_df.show(n=5)

[Stage 0:>                                                          (0 + 8) / 8]

+------------+-----------+------------+-----------+---------------+
|sepal_length|sepal_width|petal_length|petal_width|        species|
+------------+-----------+------------+-----------+---------------+
|         6.4|        3.1|         5.5|        1.8| Iris-virginica|
|         4.9|        2.4|         3.3|        1.0|Iris-versicolor|
|         4.8|        3.4|         1.9|        0.2|    Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|    Iris-setosa|
|         5.0|        3.3|         1.4|        0.2|    Iris-setosa|
+------------+-----------+------------+-----------+---------------+
only showing top 5 rows



                                                                                

In [22]:
# Persist the DataFrame as a Delta table under the local warehouse directory.
# "overwrite" keeps this cell re-runnable: without a save mode the write
# fails as soon as warehouse/delta-table already exists from an earlier run.
(iris_df.write
        .format("delta")
        .mode("overwrite")
        .save("warehouse/delta-table")
)

                                                                                

In [23]:
# Read the Delta table back from the local warehouse directory.
delta_reader = spark.read.format("delta")
new_iris_df = delta_reader.load("warehouse/delta-table")

In [24]:
# Print the first five rows read back from the Delta table.
new_iris_df.show(n=5)

                                                                                

+------------+-----------+------------+-----------+---------------+
|sepal_length|sepal_width|petal_length|petal_width|        species|
+------------+-----------+------------+-----------+---------------+
|         5.1|        3.5|         1.4|        0.2|    Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|    Iris-setosa|
|         5.0|        3.5|         1.3|        0.3|    Iris-setosa|
|         5.7|        2.8|         4.1|        1.3|Iris-versicolor|
|         6.5|        3.0|         5.5|        1.8| Iris-virginica|
+------------+-----------+------------+-----------+---------------+
only showing top 5 rows



In [25]:
# Writer for the "singlestore" data source: overwrite mode, with the
# loadDataCompression option set to LZ4.
singlestore_writer = (
    new_iris_df.write
    .format("singlestore")
    .mode("overwrite")
    .option("loadDataCompression", "LZ4")
)

# Save the Delta-sourced rows as table iris_db.new_iris.
singlestore_writer.save("iris_db.new_iris")

In [26]:
%%sql
-- Show five rows of the table written back from Spark.
-- NOTE(review): the table name is unqualified, so this presumably relies on iris_db being the selected database — confirm.
SELECT * FROM new_iris LIMIT 5;

sepal_length,sepal_width,petal_length,petal_width,species
4.8,3.1,1.6,0.2,Iris-setosa
6.1,2.8,4.0,1.3,Iris-versicolor
4.8,3.0,1.4,0.3,Iris-setosa
5.5,2.3,4.0,1.3,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor


In [27]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()

## Cleanup

In [28]:
# Delete the local working directories. The existence check makes cleanup
# safe to re-run: shutil.rmtree raises if the directory is already gone.
for directory in ("jars", "warehouse"):
    if os.path.isdir(directory):
        shutil.rmtree(directory)

In [29]:
%%sql
-- Remove the table that was loaded from the pandas DataFrame.
DROP TABLE IF EXISTS iris;

In [30]:
%%sql
-- Remove the demo database created for this notebook.
DROP DATABASE IF EXISTS iris_db;