<img src = "https://github.com/singlestore-labs/spaces-notebooks/blob/e551e274bb67bb1e5081131ee1150cdba713fc43/common/images/singlestore-jupyter.png?raw=true">

<div id="singlestore-header" style="display: flex; background-color: rgba(235, 249, 245, 0.25); padding: 5px;">
    <div id="icon-image" style="width: 90px; height: 90px;">
        <img width="100%" height="100%" src="https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/header-icons/browser.png" />
    </div>
    <div id="text" style="padding: 5px; margin-left: 10px;">
        <div id="badge" style="display: inline-block; background-color: rgba(0, 0, 0, 0.15); border-radius: 4px; padding: 4px 8px; align-items: center; margin-top: 6px; margin-bottom: -2px; font-size: 80%">SingleStore Notebooks</div>
        <h1 style="font-weight: 500; margin: 8px 0 0 4px;">Using Apache Spark GraphFrames with SingleStore Notebooks</h1>
    </div>
</div>

In [4]:
!pip cache purge --quiet
!conda install -y --quiet -c conda-forge openjdk pyspark

[0mCollecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [5]:
!pip install folium --quiet
!pip install graphframes --quiet

In [6]:
from pyspark.sql import SparkSession

# Maven coordinates handed to Spark through spark.jars.packages: GraphFrames,
# the SingleStore JDBC client and Spark connector, plus supporting libraries.
# NOTE(review): "org.scala-lang:scala-library:2.12" gives a binary series rather
# than a full x.y.z release, and "spray-json_3" looks like a Scala 3 build next
# to the _2.12 connector artifact -- confirm both coordinates are intended.
maven_packages = [
    "graphframes:graphframes:0.8.3-spark3.5-s_2.12",
    "org.scala-lang:scala-library:2.12",
    "com.singlestore:singlestore-jdbc-client:1.2.1",
    "com.singlestore:singlestore-spark-connector_2.12:4.1.5-spark-3.5.0",
    "org.apache.commons:commons-dbcp2:2.12.0",
    "org.apache.commons:commons-pool2:2.12.0",
    "io.spray:spray-json_3:1.3.6"
]

# Build (or reuse) the session; the package list is passed as one
# comma-separated string.
spark = (
    SparkSession.builder
    .appName("Spark GraphFrames Test")
    .config("spark.jars.packages", ",".join(maven_packages))
    .getOrCreate()
)

# Only log messages at ERROR level from here on.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
graphframes#graphframes added as a dependency
org.scala-lang#scala-library added as a dependency
com.singlestore#singlestore-jdbc-client added as a dependency
com.singlestore#singlestore-spark-connector_2.12 added as a dependency
org.apache.commons#commons-dbcp2 added as a dependency
org.apache.commons#commons-pool2 added as a dependency
io.spray#spray-json_3 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c53033ea-4fdf-4ada-9393-875019e4d2c5;1.0
	confs: [default]
	found graphframes#graphframes;0.8.3-spark3.5-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
	found com.singlestore#singlestore-jdbc-client;1.2.1 in central
	found com.singlestore#singlestore-spark-connector_2.12;4.1.5-spark-3.5.0 in central
	found org.apache.avro#avro;1.11.3 in central
	found com.fasterxml.jackson.core#jackson-core;2.14.2 in central
	fo

In [7]:
# Connection details: replace the <HOST> and <PASSWORD> placeholders before
# running the cells that follow.
host, port = "<HOST>", "3306"
password = "<PASSWORD>"

# Endpoint in host:port form.
cluster = f"{host}:{port}"

In [8]:
# SingleStore connector settings, applied to the running session one by one.
singlestore_conf = {
    "spark.datasource.singlestore.ddlEndpoint": cluster,
    "spark.datasource.singlestore.user": "admin",
    "spark.datasource.singlestore.password": password,
    "spark.datasource.singlestore.disablePushdown": "false",
}

for conf_key, conf_value in singlestore_conf.items():
    spark.conf.set(conf_key, conf_value)

In [9]:
%%sql
-- Start from a clean slate: remove any existing spark_demo database, then
-- create it again so the notebook can be re-run from the top.
DROP DATABASE IF EXISTS spark_demo;
CREATE DATABASE IF NOT EXISTS spark_demo;

In [10]:
%%sql
-- Work inside the spark_demo database for the statements below.
USE spark_demo;

-- connections: dropped and re-created on every run.
-- A row is identified by the (src, dst, line) combination.
DROP TABLE IF EXISTS connections;
CREATE ROWSTORE TABLE IF NOT EXISTS connections (
     src      INT,
     dst      INT,
     line     VARCHAR(32),
     colour   VARCHAR(8),
     time     INT,
     PRIMARY KEY(src, dst, line)
);

-- stations: dropped and re-created on every run; one row per station id.
DROP TABLE IF EXISTS stations;
CREATE ROWSTORE TABLE IF NOT EXISTS stations (
     id          INT PRIMARY KEY,
     latitude    DOUBLE,
     longitude   DOUBLE,
     name        VARCHAR(32),
     zone        FLOAT,
     total_lines INT,
     rail        INT
);

In [11]:
import pandas as pd

# Source CSV files (connections, stations and line metadata).
connections_url = "https://raw.githubusercontent.com/VeryFatBoy/singlestore-geospatial-example/main/csv/london_connections.csv"
stations_url = "https://raw.githubusercontent.com/VeryFatBoy/singlestore-geospatial-example/main/csv/london_stations.csv"
lines_url = "https://raw.githubusercontent.com/VeryFatBoy/singlestore-geospatial-example/main/csv/london_lines.csv"

# Stations: everything except the display_name column.
stations_df = pd.read_csv(stations_url).drop(columns="display_name")

# Lines: everything except the stripe column.
lines_df = pd.read_csv(lines_url).drop(columns="stripe")

# Connections: rename the endpoints to src/dst, then swap the "line" key for
# the columns joined in from lines_df, with its "name" column becoming the
# new "line" column.
connections_df = (
    pd.read_csv(connections_url)
    .rename(columns={"station1": "src", "station2": "dst"})
    .merge(lines_df, on="line", how="left")
    .drop(columns="line")
    .rename(columns={"name": "line"})
)

In [12]:
import folium

# Map centre as [latitude, longitude].
London = [51.509865, -0.118092]
mymap = folium.Map(location=London, zoom_start=12)

# One pass over the stations: drop a marker for each and remember its
# coordinates by id, so that drawing a connection is a dictionary lookup
# instead of two boolean-mask scans of stations_df per edge.
station_coords = {}
for station in stations_df.itertuples(index=False):
    position = (float(station.latitude), float(station.longitude))
    # setdefault keeps the first row seen for an id, matching .iloc[0].
    station_coords.setdefault(station.id, position)
    folium.Marker(position, popup=station.name).add_to(mymap)

# Draw every connection in its line colour. An endpoint id that is absent
# from stations_df raises a KeyError naming that id.
for connection in connections_df.itertuples(index=False):
    folium.PolyLine(
        locations=[
            station_coords[connection.src],
            station_coords[connection.dst],
        ],
        color=connection.colour,
    ).add_to(mymap)

mymap

<div class="alert alert-block alert-warning">
    <b class="fa fa-solid fa-exclamation-circle"></b>
    <div>
        <p><b>Action Required</b></p>
        <p>Select the database from the drop-down menu at the top of this notebook. Doing so updates the <b>connection_url</b> variable, which SQLAlchemy uses to connect to the selected database.</p>
    </div>
</div>

In [13]:
# Import only what is used; a wildcard import would pull every public
# SQLAlchemy name into the notebook namespace.
from sqlalchemy import create_engine

# connection_url is not defined in this notebook's code; it is expected to be
# set once a database has been selected from the notebook's drop-down menu.
db_connection = create_engine(connection_url)

In [14]:
# Append the connections rows to the "connections" table in batches of 1000,
# leaving out the DataFrame index. The cell displays to_sql's return value.
connections_df.to_sql(name="connections", con=db_connection, if_exists="append", index=False, chunksize=1000)

406

In [15]:
# Append the stations rows to the "stations" table in batches of 1000,
# leaving out the DataFrame index. The cell displays to_sql's return value.
stations_df.to_sql(name="stations", con=db_connection, if_exists="append", index=False, chunksize=1000)

302

In [16]:
%%sql
-- Preview five rows of the connections table.
SELECT *
FROM connections
LIMIT 5;

src,dst,line,colour,time
1,265,Piccadilly Line,#003688,3
2,156,Metropolitan Line,#9B0056,2
11,104,Metropolitan Line,#9B0056,3
11,212,Bakerloo Line,#B36305,2
13,156,Central Line,#E32017,2


In [17]:
%%sql
-- Preview five rows of the stations table.
SELECT *
FROM stations
LIMIT 5;

id,latitude,longitude,name,zone,total_lines,rail
1,51.5028,-0.2801,Acton Town,3.0,2,0
3,51.5154,-0.0726,Aldgate East,1.0,2,0
6,51.6736,-0.607,Amersham,10.0,1,1
16,51.5856,0.0887,Barkingside,5.0,1,0
18,51.5121,-0.1879,Bayswater,1.0,2,0


In [18]:
# Read spark_demo.connections into a Spark DataFrame using the "singlestore"
# data source format.
connections = spark.read.load("spark_demo.connections", format="singlestore")

In [19]:
# Read spark_demo.stations into a Spark DataFrame using the "singlestore"
# data source format.
stations = spark.read.load("spark_demo.stations", format="singlestore")

In [20]:
from graphframes import GraphFrame

# Build the graph: stations are the vertices, connections are the edges.
underground = GraphFrame(v=stations, e=connections)

In [21]:
# Print the first five vertex rows.
underground.vertices.show(n=5)



+---+--------+---------+---------------+----+-----------+----+
| id|latitude|longitude|           name|zone|total_lines|rail|
+---+--------+---------+---------------+----+-----------+----+
| 25|  51.512|  -0.1031|    Blackfriars| 1.0|          2|   0|
| 39| 51.5481|  -0.1188|Caledonian Road| 2.0|          1|   0|
| 43| 51.5147|   0.0082|   Canning Town| 3.0|          2|   0|
| 50| 51.7052|   -0.611|        Chesham|10.0|          1|   0|
| 60| 51.5129|  -0.1243|  Covent Garden| 1.0|          1|   0|
+---+--------+---------+---------------+----+-----------+----+
only showing top 5 rows



                                                                                

In [22]:
# Print the first five edge rows.
underground.edges.show(n=5)

[Stage 2:>                                                          (0 + 2) / 2]

+---+---+--------------------+-------+----+
|src|dst|                line| colour|time|
+---+---+--------------------+-------+----+
|  7|145|       Northern Line|#000000|   2|
| 11|163|       Bakerloo Line|#B36305|   1|
| 19| 97|Docklands Light R...|#00A4A7|   2|
| 28|192|        Central Line|#E32017|   1|
| 49|151|       Northern Line|#000000|   2|
+---+---+--------------------+-------+----+
only showing top 5 rows



                                                                                

In [23]:
# Count the stations in each zone and list the largest groups first.
zone_counts = underground.vertices.groupBy("zone").count()
zone_counts.sort("count", ascending=False).show()

+----+-----+
|zone|count|
+----+-----+
| 2.0|   75|
| 1.0|   60|
| 3.0|   47|
| 4.0|   38|
| 5.0|   28|
| 6.0|   18|
| 2.5|   17|
| 3.5|    6|
| 1.5|    4|
| 8.0|    2|
|10.0|    2|
| 7.0|    2|
| 9.0|    1|
| 6.5|    1|
| 5.5|    1|
+----+-----+



In [24]:
# Number of edges whose line column is 'District Line'; the count is the
# cell's displayed value.
district_edges = underground.edges.where("line = 'District Line'")
district_edges.count()

                                                                                

59

In [25]:
# Largest total_lines value over all stations (aggregate with no grouping keys).
all_stations = underground.vertices.groupBy()
all_stations.max("total_lines").show()

[Stage 7:>                                                          (0 + 2) / 2]

+----------------+
|max(total_lines)|
+----------------+
|               6|
+----------------+



                                                                                

In [26]:
# Show the stations whose total_lines value is 6.
six_line_stations = underground.vertices.where("total_lines == 6")
six_line_stations.show()

+---+--------+---------+--------------------+----+-----------+----+
| id|latitude|longitude|                name|zone|total_lines|rail|
+---+--------+---------+--------------------+----+-----------+----+
|145| 51.5308|  -0.1238|King's Cross St. ...| 1.0|          6|   1|
+---+--------+---------+--------------------+----+-----------+----+



                                                                                

In [27]:
# Shut down the Spark session that was created earlier in this notebook.
spark.stop()