# Iniciando uma sessão Spark para testes

In [8]:
import os
from pyspark.sql import SparkSession

# Scratch directories handed to Spark below (spark.local.dir and
# spark.sql.warehouse.dir); created up front, no error if they already exist.
SPARK_LOCAL_DIR = r"C:\tmp\spark"
SPARK_WAREHOUSE_DIR = r"C:\tmp\hive"
for scratch_dir in (SPARK_LOCAL_DIR, SPARK_WAREHOUSE_DIR):
    os.makedirs(scratch_dir, exist_ok=True)

# Every option is passed to the builder as a string key/value pair.
session_options = {
    # UI / host: keep the web UI enabled and pin UI and driver to loopback
    "spark.ui.enabled": "true",
    "spark.ui.host": "127.0.0.1",
    "spark.driver.host": "127.0.0.1",
    "spark.driver.bindAddress": "127.0.0.1",
    # scratch / warehouse directories
    "spark.local.dir": SPARK_LOCAL_DIR,
    "spark.sql.warehouse.dir": SPARK_WAREHOUSE_DIR,
    # performance / stability (lower memory use)
    "spark.sql.shuffle.partitions": "16",
    "spark.default.parallelism": "16",
    # memory caps (avoid running out of memory)
    # NOTE(review): with a local master the executor setting may have no
    # effect -- confirm against the Spark configuration docs.
    "spark.driver.memory": "2g",
    "spark.executor.memory": "2g",
}

# Local-mode session named "UI-Stable".
builder = SparkSession.builder.master("local[*]").appName("UI-Stable")
for option_key, option_value in session_options.items():
    builder = builder.config(option_key, option_value)

# getOrCreate() hands back the already-active session when there is one.
spark = builder.getOrCreate()

# Print the JAVA_HOME / SPARK_HOME / HADOOP_HOME environment variables
# (os.getenv yields None for any that are unset).
for env_name in ("JAVA_HOME", "SPARK_HOME", "HADOOP_HOME"):
    print(f"{env_name}:", os.getenv(env_name))

# Report what the running session says about itself: version, web UI address
# and the effective warehouse directory.
warehouse_key = "spark.sql.warehouse.dir"
print("Spark version:", spark.version)
print("Spark UI:", spark.sparkContext.uiWebUrl)
print(f"{warehouse_key}:", spark.conf.get(warehouse_key))


# Smoke test: build a 10-row DataFrame and display it.
spark.range(10).show()
# spark.stop()  # left commented out: the cells below still use `spark`

JAVA_HOME: C:\Java\jdk-25.0.2
SPARK_HOME: C:\Spark
HADOOP_HOME: C:\hadoop
Spark version: 4.1.1
Spark UI: http://127.0.0.1:4040
spark.sql.warehouse.dir: C:\tmp\hive
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



# Verificação da Spark UI

In [9]:
from urllib.parse import urlsplit

# Address the web UI is actually served on (None when no UI is running).
ui_url = spark.sparkContext.uiWebUrl
print("uiWebUrl:", ui_url)
print("spark.ui.enabled:", spark.conf.get("spark.ui.enabled", "true"))

# spark.ui.port is not set when the session is built, so a hard-coded "4040"
# fallback would be printed even if Spark had to bind a different port.
# Take the fallback from the real UI URL instead, and only fall back to
# "4040" when no URL (or no port in it) is available.
bound_port = urlsplit(ui_url).port if ui_url else None
fallback_port = str(bound_port) if bound_port is not None else "4040"
print("spark.ui.port:", spark.conf.get("spark.ui.port", fallback_port))

uiWebUrl: http://127.0.0.1:4040
spark.ui.enabled: true
spark.ui.port: 4040


# Rodar um job “demorado” para que Jobs/Stages apareçam na UI

In [10]:
# 20M ids redistributed over 16 partitions, then bucketed by id modulo 1000.
df = spark.range(20_000_000).repartition(16)
bucket_key = (df["id"] % 1000).alias("k")
# The trailing count() is the cell's displayed value: the number of rows in
# the per-bucket aggregate, i.e. the number of distinct buckets.
df.groupBy(bucket_key).count().count()

1000

In [11]:
from pyspark.sql import functions as F

# 30M ids tagged with k = id modulo 10000, hash-partitioned on k into
# 16 partitions before aggregating on the same column.
df = (
    spark.range(30_000_000)
    .withColumn("k", F.col("id") % 10000)
    .repartition(16, "k")
)
res = df.groupBy("k").count()
# Displayed value: number of rows in the aggregate (one per distinct k).
res.count()

10000

In [14]:
# Stop the SparkSession bound to `spark` now that the demo jobs have run.
spark.stop()