In [7]:
%%bash
set -euxo pipefail

# ── Configuration ───────────────────────────────────────────
SPARK_VERSION=3.5.5       # latest patch in 3.5 line (May 2025)
HADOOP_VER=3              # Spark 3.5 ships only with Hadoop 3 builds
DL_BASE=https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}

# ── 1. OS-level deps: Java 11 ───────────────────────────────
apt-get update -qq
apt-get install -y openjdk-11-jdk-headless > /dev/null

# ── 2. Download + unpack Spark ──────────────────────────────
wget -q ${DL_BASE}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VER}.tgz
tar -xf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VER}.tgz
mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VER} /opt/spark

# ── 3. Python bindings & utility helpers ────────────────────
pip install -q pyspark==${SPARK_VERSION} findspark

# ── 4. (optional) cleanup tarball to save space ─────────────
rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VER}.tgz

echo "✅ Spark ${SPARK_VERSION} installed in /opt/spark"


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.2/317.2 MB 4.5 MB/s eta 0:00:00
✅ Spark 3.5.5 installed in /opt/spark


+ SPARK_VERSION=3.5.5
+ HADOOP_VER=3
+ DL_BASE=https://dlcdn.apache.org/spark/spark-3.5.5
+ apt-get update -qq
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
+ apt-get install -y openjdk-11-jdk-headless
+ wget -q https://dlcdn.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
+ tar -xf spark-3.5.5-bin-hadoop3.tgz
+ mv spark-3.5.5-bin-hadoop3 /opt/spark
+ pip install -q pyspark==3.5.5 findspark
+ rm spark-3.5.5-bin-hadoop3.tgz
+ echo '✅ Spark 3.5.5 installed in /opt/spark'


In [8]:
import os, findspark
os.environ["JAVA_HOME"]  = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/opt/spark"
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.jars.packages",
            "graphframes:graphframes:0.8.4-spark3.5-s_2.12")
    .getOrCreate()
)

print("Spark version:", spark.version)


Spark version: 3.5.5


In [9]:
# Quick smoke test
from graphframes import GraphFrame
v = spark.createDataFrame([("a",), ("b",)], ["id"])
e = spark.createDataFrame([("a", "b")], ["src", "dst"])
GraphFrame(v, e).inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  b|       1|
+---+--------+

