In [2]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Get username with any domain information removed.
    """

    return re.sub('@.*', '', getpass.getuser())


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    user = username()
    
    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", f"{worker_memory}g")
        .config("spark.driver.memory", f"{master_memory}g")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

In [3]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.ui.port,4052
spark.app.startTime,1713315741056
spark.executor.cores,2
spark.sql.shuffle.partitions,32


<h2>Q2 <h2>
<h2>You can create user defined functions in Spark by taking native Python functions and wrapping
them with pyspark.sql.functions.udf which allows you to apply a function to each row using
columns as inputs. You may find this functionality useful.<h2>

In [5]:
# Write your imports and code here or insert cells below

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,col
from pyspark.sql.types import DoubleType
import math

In [7]:
station_data = spark.read.parquet("/user/uwi14/outputs/ghcnd/Final_stations_df.parquet")
station_data.printSchema()
show_as_html(station_data,10)

root
 |-- ID: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- CODE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- GSN_FLAG: string (nullable = true)
 |-- HCN_CRN_FLAG: string (nullable = true)
 |-- WMO_ID: string (nullable = true)
 |-- Country_Name: string (nullable = true)
 |-- State_Name: string (nullable = true)
 |-- FIRST_ACTIVE_YEAR: string (nullable = true)
 |-- LAST_ACTIVE_YEAR: string (nullable = true)
 |-- DIFFERENT_ELEMENTS_COUNT: long (nullable = true)
 |-- CORE_ELEMENTS_COUNT: long (nullable = true)
 |-- OTHER_ELEMENTS_COUNT: long (nullable = true)
 |-- COLLECTED_ELEMENTS: array (nullable = true)
 |    |-- element: string (containsNull = true)



Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,AE000041196,,AE,25.333,55.517,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,,1944,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
1,AEM00041218,,AE,24.262,55.609,264.9,AL AIN INTL,,,41218.0,United Arab Emirates,,1994,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
2,AGE00147715,,AG,35.42,8.1197,863.0,TEBESSA,,,,Algeria,,1879,1938,3,3,,"[TMAX, TMIN, PRCP]"
3,AGE00147794,,AG,36.78,5.1,225.0,BEJAIA-CAP CARBON,,,,Algeria,,1926,1938,2,2,,"[TMAX, TMIN]"
4,AGM00060402,,AG,36.712,5.07,6.1,SOUMMAM,,,60402.0,Algeria,,����,2024,5,4,1.0,"[TMAX, TMIN, ����, PRCP, SNWD]"
5,AGM00060430,,AG,36.3,2.233,721.0,MILIANA,,,60430.0,Algeria,,1957,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"
6,AGM00060461,,AG,35.7,-0.65,22.0,ORAN-PORT,,,60461.0,Algeria,,1995,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
7,AGM00060514,,AG,35.167,2.317,801.0,KSAR CHELLALA,,,60514.0,Algeria,,1995,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"
8,AGM00060515,,AG,35.333,4.206,459.0,BOU SAADA,,,60515.0,Algeria,,1984,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
9,AGM00060550,,AG,33.667,1.0,1347.0,EL-BAYADH,,,60550.0,Algeria,,1973,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"


<h3>(a) Write a Spark function that computes the geographical distance between two stations using
their latitude and longitude as arguments. You can test this function by using CROSS JOIN
on a small subset of stations to generate a table with two stations in each row.
Note that there is more than one way to compute geographical distance, choose a method
that at least takes into account that the earth is spherica<h3>

In [8]:
station_data = station_data.withColumn("LATITUDE", col("LATITUDE").cast(DoubleType())) \
                           .withColumn("LONGITUDE", col("LONGITUDE").cast(DoubleType()))


station_data.printSchema()
show_as_html(station_data,10)

root
 |-- ID: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- CODE: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- GSN_FLAG: string (nullable = true)
 |-- HCN_CRN_FLAG: string (nullable = true)
 |-- WMO_ID: string (nullable = true)
 |-- Country_Name: string (nullable = true)
 |-- State_Name: string (nullable = true)
 |-- FIRST_ACTIVE_YEAR: string (nullable = true)
 |-- LAST_ACTIVE_YEAR: string (nullable = true)
 |-- DIFFERENT_ELEMENTS_COUNT: long (nullable = true)
 |-- CORE_ELEMENTS_COUNT: long (nullable = true)
 |-- OTHER_ELEMENTS_COUNT: long (nullable = true)
 |-- COLLECTED_ELEMENTS: array (nullable = true)
 |    |-- element: string (containsNull = true)



Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,AE000041196,,AE,25.333,55.517,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,,1944,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
1,AEM00041218,,AE,24.262,55.609,264.9,AL AIN INTL,,,41218.0,United Arab Emirates,,1994,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
2,AGE00147715,,AG,35.42,8.1197,863.0,TEBESSA,,,,Algeria,,1879,1938,3,3,,"[TMAX, TMIN, PRCP]"
3,AGE00147794,,AG,36.78,5.1,225.0,BEJAIA-CAP CARBON,,,,Algeria,,1926,1938,2,2,,"[TMAX, TMIN]"
4,AGM00060402,,AG,36.712,5.07,6.1,SOUMMAM,,,60402.0,Algeria,,����,2024,5,4,1.0,"[TMAX, TMIN, ����, PRCP, SNWD]"
5,AGM00060430,,AG,36.3,2.233,721.0,MILIANA,,,60430.0,Algeria,,1957,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"
6,AGM00060461,,AG,35.7,-0.65,22.0,ORAN-PORT,,,60461.0,Algeria,,1995,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
7,AGM00060514,,AG,35.167,2.317,801.0,KSAR CHELLALA,,,60514.0,Algeria,,1995,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"
8,AGM00060515,,AG,35.333,4.206,459.0,BOU SAADA,,,60515.0,Algeria,,1984,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
9,AGM00060550,,AG,33.667,1.0,1347.0,EL-BAYADH,,,60550.0,Algeria,,1973,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"


In [9]:
def haversine_distance(lat1, lon1, lat2, lon2):
    # Check for invalid input and return None if found
    if any(v is None for v in [lat1, lon1, lat2, lon2]):
        return None

    # Radius of the Earth in kilometers
    R = 6371.0
    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Compute differences in coordinates
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Apply Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

# Register the Python function as a UDF
haversine_udf = udf(haversine_distance, DoubleType())


# Alias the original DataFrame to avoid ambiguity
station_data_alias = station_data.alias("a")

# Apply CROSS JOIN to generate all pairs of stations (use with caution as this can explode quickly)
cross_joined = station_data_alias.crossJoin(station_data.alias("b"))

# Calculate the distance using the UDF, specifying the DataFrame alias to avoid ambiguity
result = cross_joined.withColumn(
    "Distance_km",
    haversine_udf(
        col("a.LATITUDE"),
        col("a.LONGITUDE"),
        col("b.LATITUDE"),
        col("b.LONGITUDE")
    )
)

# Show the results
distance_df = result.select(
    col("a.NAME").alias("Station1"),
    col("b.NAME").alias("Station2"),
    "Distance_km"
)
show_as_html(distance_df,10)


Unnamed: 0,Station1,Station2,Distance_km
0,SHARJAH INTER. AIRP,SHARJAH INTER. AIRP,0.0
1,SHARJAH INTER. AIRP,AL AIN INTL,119.451291
2,SHARJAH INTER. AIRP,TEBESSA,4637.495763
3,SHARJAH INTER. AIRP,BEJAIA-CAP CARBON,4909.724307
4,SHARJAH INTER. AIRP,SOUMMAM,4912.380058
5,SHARJAH INTER. AIRP,MILIANA,5166.657741
6,SHARJAH INTER. AIRP,ORAN-PORT,5429.171901
7,SHARJAH INTER. AIRP,KSAR CHELLALA,5164.547991
8,SHARJAH INTER. AIRP,BOU SAADA,4992.393402
9,SHARJAH INTER. AIRP,EL-BAYADH,5296.809414


<h3>(b) Apply this function to compute the pairwise distances between all stations in New Zealand,
and save the result to your output directory.
What two stations are geographically the closest together in New Zealand?<h3>

In [12]:
nz_stations = station_data.filter(station_data["Country_Name"].contains("New Zealand"))
show_as_html(nz_stations ,10)

Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,NZ000093417,,NZ,-40.9,174.983,7.0,PARAPARAUMU AWS,GSN,,93420,New Zealand,,1972,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
1,NZM00093781,,NZ,-43.489,172.532,37.5,CHRISTCHURCH INTL,,,93781,New Zealand,,1954,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"
2,TL000091724,,TL,-9.2,-171.917,3.0,NUKUNONO,,,91724,Tokelau [New Zealand],,1973,1995,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
3,NZ000939450,,NZ,-52.55,169.167,19.0,CAMPBELL ISLAND AWS,GSN,,93947,New Zealand,,1941,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
4,NZM00093929,,NZ,-50.483,166.3,40.0,ENDERBY ISLAND AWS,,,93929,New Zealand,,1992,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
5,CWM00091843,,CW,-21.203,-159.806,5.8,RAROTONGA INTL,GSN,,91843,Cook Islands [New Zealand],,1973,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
6,NZ000933090,,NZ,-39.017,174.183,32.0,NEW PLYMOUTH AWS,GSN,,93309,New Zealand,,1944,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
7,NZ000093844,,NZ,-46.417,168.333,2.0,INVERCARGILL AIRPOR,GSN,,93845,New Zealand,,1948,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
8,NZ000093994,,NZ,-29.25,-177.917,49.0,RAOUL ISL/KERMADEC,,,93997,New Zealand,,1940,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
9,NZ000937470,,NZ,-44.517,169.9,488.0,TARA HILLS,GSN,,93747,New Zealand,,1949,2005,2,2,,"[TMAX, TMIN]"


In [14]:
#Number of unique stations belongs to New Zealand
nz_station_count = nz_stations.select('ID').count()
print(f"Number of stations belongs to New Zealand: {nz_station_count}")

Number of stations belongs to New Zealand: 18


In [15]:
#Number of unique stations in New Zealand

nz_only_station_count = nz_stations.filter(nz_stations['CODE']=="NZ").count()

print(f"Number of stations in New Zealand: {nz_only_station_count}")

Number of stations in New Zealand: 15


In [23]:
#Number of unique stations belongs to New Zealand situated out of the country
nz_station_out = nz_stations.filter(nz_stations['CODE']!="NZ")
nz_station_out_count = nz_station_out.count()
show_as_html(nz_station_out,5)
print(f"Number of stations out of New Zealand: {nz_station_out_count}")

Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,TL000091724,,TL,-9.2,-171.917,3.0,NUKUNONO,,,91724,Tokelau [New Zealand],,1973,1995,4,3,1,"[TMAX, TMIN, PRCP, TAVG]"
1,CWM00091843,,CW,-21.203,-159.806,5.8,RAROTONGA INTL,GSN,,91843,Cook Islands [New Zealand],,1973,2024,4,3,1,"[TMAX, TMIN, PRCP, TAVG]"
2,NEM00091824,,NE,-19.083,-169.933,59.0,HANAN AIRPORT,GSN,,91824,Niue [New Zealand],,2001,2016,4,3,1,"[TMAX, TMIN, PRCP, TAVG]"


Number of stations out of New Zealand: 3


In [25]:
# Alias for self-join
nz_stations_a = nz_stations.alias("a")
nz_stations_b = nz_stations.alias("b")

# Cross join to get all pairs (excluding pairs of the same station)
station_pairs = nz_stations_a.crossJoin(nz_stations_b) \
                            .filter(col("a.ID") != col("b.ID"))

# Calculate distances
station_distances_nz = station_pairs.withColumn(
    "Distance_km",
    haversine_udf(
        col("a.LATITUDE"),
        col("a.LONGITUDE"),
        col("b.LATITUDE"),
        col("b.LONGITUDE")
    )
)
station_distances_nz = station_distances_nz.select(
    col("a.NAME").alias("Station1"),
    col("b.NAME").alias("Station2"),
    "Distance_km"
)


show_as_html(station_distances_nz,5)

Unnamed: 0,Station1,Station2,Distance_km
0,PARAPARAUMU AWS,CHRISTCHURCH INTL,351.596419
1,PARAPARAUMU AWS,NUKUNONO,3753.698198
2,PARAPARAUMU AWS,CAMPBELL ISLAND AWS,1368.057682
3,PARAPARAUMU AWS,ENDERBY ISLAND AWS,1259.151158
4,PARAPARAUMU AWS,RAROTONGA INTL,3228.017528


In [28]:
# Find the pair with the minimum distance
sorted_pair_nz = station_distances_nz.orderBy("Distance_km")
closest_pair_nz = sorted_pair_nz.first()
show_as_html(sorted_pair_nz,5)

# Show the result
print(f"The closest stations are {closest_pair_nz['Station1']} and {closest_pair_nz['Station2']} with a distance of {closest_pair_nz['Distance_km']} km.")

Unnamed: 0,Station1,Station2,Distance_km
0,WELLINGTON AERO AWS,PARAPARAUMU AWS,50.529026
1,PARAPARAUMU AWS,WELLINGTON AERO AWS,50.529026
2,WELLINGTON AERO AWS,KAIKOURA,151.071435
3,KAIKOURA,WELLINGTON AERO AWS,151.071435
4,HOKITIKA AERODROME,CHRISTCHURCH INTL,152.258357


The closest stations are WELLINGTON AERO AWS           and PARAPARAUMU AWS               with a distance of 50.52902648213863 km.


In [29]:
output_dir = "hdfs:///user/uwi14/outputs/ghcnd/sorted_stations_nz_df.csv"
sorted_pair_nz.write.mode("overwrite").option("header", "true").option("compression", "gzip").csv(output_dir)

In [30]:
!hdfs dfs -du -h /user/uwi14/outputs/ghcnd/

5.0 M    20.1 M  /user/uwi14/outputs/ghcnd/Final_stations_df.parquet
1.5 M    6.0 M   /user/uwi14/outputs/ghcnd/Modified_Inventory_df.parquet
256.0 K  1.0 M   /user/uwi14/outputs/ghcnd/avg_rainfall.csv
1.9 K    7.5 K   /user/uwi14/outputs/ghcnd/countries_df.csv
2.3 M    9.2 M   /user/uwi14/outputs/ghcnd/joined_countries_df.csv
2.3 M    9.2 M   /user/uwi14/outputs/ghcnd/joined_countries_state_df.csv
2.3 K    9.0 K   /user/uwi14/outputs/ghcnd/modified_countries_df.csv
865      3.4 K   /user/uwi14/outputs/ghcnd/modified_states_df.csv
2.1 M    8.3 M   /user/uwi14/outputs/ghcnd/nz_tmin_tmax.parquet
2.8 M    11.3 M  /user/uwi14/outputs/ghcnd/nz_tmin_tmax2.csv
7.5 K    29.8 K  /user/uwi14/outputs/ghcnd/sorted_stations_nz_df.csv
623      2.4 K   /user/uwi14/outputs/ghcnd/states_df.csv


In [4]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()