In [1]:
import getpass
import os
from pathlib import Path
from typing import Tuple, Union

from dotenv import load_dotenv
from pyspark.sql import SparkSession

In [2]:
def get_minio_credentials(path_to_env_file: Union[str, Path] = "../config/minio.env") -> Tuple[str, str]:
    """Return the login and password of the MinIO root user.

    The env file is loaded with ``load_dotenv`` and the credentials are then
    read from the MINIO_ROOT_USER and MINIO_ROOT_PASSWORD environment
    variables. Any value that is still missing is requested from the user
    (the password through ``getpass`` so it is not echoed).

    Args:
        path_to_env_file: either string or pathlib.Path object leading to
            the minio.env file.

    Returns:
        A ``(login, password)`` tuple of non-empty strings.

    Raises:
        ValueError: if a credential is absent from the environment and the
            user enters an empty value when prompted.
    """
    load_dotenv(path_to_env_file)
    login = os.getenv("MINIO_ROOT_USER")
    password = os.getenv("MINIO_ROOT_PASSWORD")

    if not (login and password):
        print(f"There are no MINIO_ROOT_USER and/or MINIO_ROOT_PASSWORD variables in {path_to_env_file}")
        # Ask only for the pieces that are actually missing.
        login = login or input("MINIO_ROOT_USER: ")
        password = password or getpass.getpass("MINIO_ROOT_PASSWORD: ")

    # Fail loudly here instead of returning None, which would surface later
    # as an opaque unpacking error at the call site.
    if not (login and password):
        raise ValueError("MinIO root credentials were not provided")
    return login, password

# Root credentials for MinIO, resolved through the helper defined above.
login_minio, password_minio = get_minio_credentials()

# S3A connector options, applied to the builder in this order.
s3a_options = {
    "spark.hadoop.fs.s3a.endpoint": "http://minio:9000",
    "spark.hadoop.fs.s3a.access.key": login_minio,
    "spark.hadoop.fs.s3a.secret.key": password_minio,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # "spark.driver.extraJavaOptions": "-Djava.security.manager",  # to solve JDK 23+ compatibility problem
}

session_builder = SparkSession.builder.appName("bronze_to_silver")
for option_name, option_value in s3a_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/13 15:52:15 WARN Utils: Your hostname, ubuntu-home, resolves to a loopback address: 127.0.1.1; using 192.168.0.29 instead (on interface enp3s0)
26/01/13 15:52:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/13 15:52:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
import pyspark
from pyspark.sql import SparkSession

# Obtain a session, then ask the JVM-side Hadoop VersionInfo class for its
# version string; the bare last expression is what the cell displays.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
hadoop_version_info = spark._jvm.org.apache.hadoop.util.VersionInfo
hadoop_version_info.getVersion()

spark: 3.4.1
hadoop: 3.3.4
aws-java-sdk-bundle: 1.12.262

from pyspark.sql import SparkSession
# Credentials are resolved through get_minio_credentials() rather than being
# written into the notebook, so no secret ends up in the saved file.
login_minio, password_minio = get_minio_credentials()

# Session against the standalone master, with the S3A connector pulled in
# via spark.jars.packages and pointed at the MinIO endpoint.
# NOTE(review): getOrCreate() presumably reuses an already-running session,
# in which case master / spark.jars.packages may not take effect - confirm
# by running this on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("bronze_to_silver")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", login_minio)
    .config("spark.hadoop.fs.s3a.secret.key", password_minio)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    # The endpoint above is plain http, so TLS is switched off explicitly.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)

# Object to ingest from the bronze bucket.
# NOTE(review): the key contains a space and colons - verify that the S3A
# path handling accepts it as written.
s3_file_path = "s3a://airflow.learn/2026-01-13 08:37:19.193383+00:00"

# Load the object through Spark's "binaryFile" source.
bronze_df = (
    spark.read
    .format("binaryFile")
    .load(s3_file_path)
)

In [None]:
from pyspark.sql import SparkSession
# Credentials are resolved through get_minio_credentials() rather than being
# written into the notebook, so no secret ends up in the saved file.
login_minio, password_minio = get_minio_credentials()

# Locally stored connector jars handed to Spark via spark.jars.
# NOTE(review): presumably hadoop-aws must match the Hadoop version bundled
# with the installed Spark - confirm 3.4.2 is correct for this environment.
jars = "jars/hadoop-aws-3.4.2.jar,jars/bundle-2.29.52.jar"

spark = (
    SparkSession.builder
    .appName("bronze_to_silver")
    .config("spark.jars", jars)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", login_minio)
    .config("spark.hadoop.fs.s3a.secret.key", password_minio)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    # The endpoint above is plain http, so TLS is switched off explicitly.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)



In [1]:
pip install pyspark==3.5.8

Collecting pyspark==3.5.8
  Downloading pyspark-3.5.8.tar.gz (317.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.8/317.8 MB[0m [31m4.9 MB/s[0m  [33m0:01:05[0m[0m eta [36m0:00:01[0m[36m0:00:02[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.8-py2.py3-none-any.whl size=318353002 sha256=691fbe262e75fd0f2edf8b319c39fa83a9fbd46283a6f70be2353e9e6f621b46
  Stored in directory: /home/heinz/.cache/pip/wheels/f0/f6/86/a9231691706c40d5bcc8c907f583e0ef90c075dcfa97e272d0
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.8
Note: you may need to restart the kernel to use updated packages.


In [10]:
from pyspark.sql import SparkSession

# Open a session against the Spark Connect endpoint on localhost:15002.
connect_builder = SparkSession.builder.remote("sc://localhost:15002")
spark = connect_builder.getOrCreate()

# Smoke test: build a ten-row range and print it.
smoke_df = spark.range(10)
smoke_df.show()

KeyboardInterrupt: 

In [8]:
# Read back the S3A endpoint from the session's Hadoop configuration.
# NOTE(review): `_jsc` is a private attribute; presumably it is not available
# on a Spark Connect session - confirm which session `spark` refers to here.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.get("fs.s3a.endpoint")

Collecting grpcio-status
  Downloading grpcio_status-1.76.0-py3-none-any.whl.metadata (1.1 kB)
Collecting protobuf<7.0.0,>=6.31.1 (from grpcio-status)
  Downloading protobuf-6.33.4-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting grpcio>=1.76.0 (from grpcio-status)
  Downloading grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.7 kB)
Downloading grpcio_status-1.76.0-py3-none-any.whl (14 kB)
Downloading protobuf-6.33.4-cp39-abi3-manylinux2014_x86_64.whl (323 kB)
Downloading grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (6.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m8.9 MB/s[0m  [33m0:00:00[0mm [31m10.1 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: protobuf, grpcio, grpcio-status
[2K  Attempting uninstall: protobuf
[2K    Found existing installation: protobuf 4.25.8
[2K    Uninstalling protobuf-4.25.8:
[2K      Successfully 

In [None]:
# Preview the first five objects of the bucket as binaryFile rows, printed
# without column truncation.
(
    spark.read
    .format("binaryFile")
    .load("s3a://airflow.learning/")
    .limit(5)
    .show(truncate=False)
)