In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path
from pyspark.sql import SparkSession
from typing import Union

In [2]:
def get_minio_credentials(path_to_env_file: Union[str, Path] = "../config/minio.env") -> "tuple[str, str]":
    """Return the login and password of the MinIO root user.

    The env file is loaded with `load_dotenv`, then MINIO_ROOT_USER and
    MINIO_ROOT_PASSWORD are read from the process environment. If either
    variable is missing or empty, the user is asked to type the missing
    value(s) in (the password is requested through `getpass`).

    Args:
        path_to_env_file: either string or pathlib.Path object leading to minio.env file.

    Returns:
        A `(login, password)` tuple of two non-empty strings.

    Raises:
        ValueError: if a value is missing from the environment and the user
            enters an empty string for it.
    """
    # Local import so this cell does not depend on a change to the import cell.
    from getpass import getpass

    load_dotenv(path_to_env_file)
    login = os.getenv("MINIO_ROOT_USER")
    password = os.getenv("MINIO_ROOT_PASSWORD")

    if not (login and password):
        print(f"There are no MINIO_ROOT_USER and/or MINIO_ROOT_PASSWORD variables in {path_to_env_file}")
        # Only ask for what is actually missing.
        if not login:
            login = input("MINIO_ROOT_USER: ").strip()
        if not password:
            password = getpass("MINIO_ROOT_PASSWORD: ")
        # Fail here with a clear message instead of returning None and
        # letting the caller's tuple-unpacking fail with an unrelated error.
        if not (login and password):
            raise ValueError("MINIO_ROOT_USER and MINIO_ROOT_PASSWORD must both be non-empty")

    return login, password

# Fetch the MinIO root credentials used as the S3A access/secret key pair.
login_minio, password_minio = get_minio_credentials()

# S3A connector options pointing Spark at the MinIO endpoint.
# If JDK 23+ compatibility problems appear, the workaround previously tried
# here was to also set "spark.driver.extraJavaOptions" to "-Djava.security.manager".
s3a_settings = {
    "spark.hadoop.fs.s3a.endpoint": "http://minio:9000",
    "spark.hadoop.fs.s3a.access.key": login_minio,
    "spark.hadoop.fs.s3a.secret.key": password_minio,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}

session_builder = SparkSession.builder.appName("bronze_to_silver")
for option_name, option_value in s3a_settings.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/13 15:52:15 WARN Utils: Your hostname, ubuntu-home, resolves to a loopback address: 127.0.1.1; using 192.168.0.29 instead (on interface enp3s0)
26/01/13 15:52:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/13 15:52:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
import pyspark
from pyspark.sql import SparkSession

# Get the current session (or start one) and display the Hadoop version
# reported by the JVM's org.apache.hadoop.util.VersionInfo class.
spark = SparkSession.builder.getOrCreate()
hadoop_version_info = spark._jvm.org.apache.hadoop.util.VersionInfo
hadoop_version_info.getVersion()

spark: 3.4.1
hadoop: 3.3.4
aws-java-sdk-bundle: 1.12.262

from pyspark.sql import SparkSession
# Credentials are obtained through the helper defined earlier in this notebook
# (env file / environment variables) instead of being hardcoded here, so no
# secret is stored in the notebook source.
login_minio, password_minio = get_minio_credentials()

# NOTE(review): getOrCreate() returns an already-running session when one
# exists in this kernel. If an earlier cell created a session, options such as
# the master URL and "spark.jars.packages" below may not take effect until the
# kernel is restarted or the old session is stopped - confirm before relying on them.
spark = (
    SparkSession.builder
    .appName("bronze_to_silver")
    .master("spark://spark-master:7077")
    # hadoop-aws provides the S3A filesystem implementation configured below.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", login_minio)
    .config("spark.hadoop.fs.s3a.secret.key", password_minio)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    # The endpoint above is plain http, so SSL is switched off for the connector.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .getOrCreate()
)

# Object to load from the bronze bucket, addressed through the S3A connector.
s3_file_path = "s3a://airflow.learn/2026-01-13 08:37:19.193383+00:00"

# Read the object with Spark's "binaryFile" data source.
bronze_df = (
    spark.read
    .format("binaryFile")
    .load(s3_file_path)
)