In [1]:
import os
import urllib.request
import subprocess
from pyspark.sql import SparkSession

# Pinned MongoDB Spark connector and Java driver artifacts; the notebook author
# selected these versions for use with Spark 3.5.0.
# Each row is (artifact name, pinned version, Maven Central download URL).
_pinned_jars = (
    (
        "mongo-spark-connector_2.12",
        "3.0.2",
        "https://repo1.maven.org/maven2/org/mongodb/spark/mongo-spark-connector_2.12/3.0.2/mongo-spark-connector_2.12-3.0.2.jar",
    ),
    (
        "bson",
        "4.6.1",
        "https://repo1.maven.org/maven2/org/mongodb/bson/4.6.1/bson-4.6.1.jar",
    ),
    (
        "mongodb-driver-core",
        "4.6.1",
        "https://repo1.maven.org/maven2/org/mongodb/mongodb-driver-core/4.6.1/mongodb-driver-core-4.6.1.jar",
    ),
    (
        "mongodb-driver-sync",
        "4.6.1",
        "https://repo1.maven.org/maven2/org/mongodb/mongodb-driver-sync/4.6.1/mongodb-driver-sync-4.6.1.jar",
    ),
)

# Mapping of artifact name -> {"version", "url"}, in the same order as the rows above.
compatible_jars = {
    artifact: {"version": pinned_version, "url": download_url}
    for artifact, pinned_version, download_url in _pinned_jars
}

# Download compatible versions
#
# Each JAR listed in `compatible_jars` is fetched into `jar_dir` unless a
# non-empty copy is already present. `jar_paths` collects the local paths of
# the JARs that are actually usable, in the iteration order of `compatible_jars`.
jar_dir = "/tmp/spark_jars_compatible"
os.makedirs(jar_dir, exist_ok=True)
jar_paths = []

print("Downloading MongoDB connector compatible with Spark 3.5.0...")
for name, info in compatible_jars.items():
    jar_path = os.path.join(jar_dir, os.path.basename(info["url"]))

    # An empty file is treated as missing so it gets downloaded again.
    if os.path.exists(jar_path) and os.path.getsize(jar_path) > 0:
        print(f"✅ {name} already exists")
        jar_paths.append(jar_path)
        continue

    print(f"Downloading {name} {info['version']}...")
    # Download to a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file at `jar_path`
    # that a later run would report as "already exists".
    partial_path = jar_path + ".part"
    try:
        urllib.request.urlretrieve(info["url"], partial_path)
        os.replace(partial_path, jar_path)
        file_size = os.path.getsize(jar_path)
    except Exception as e:
        print(f"❌ {name} download failed: {e}")
        # Best-effort cleanup of the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        # Do not hand a missing JAR to Spark; keep going with the others.
        continue

    print(f"✅ {name} downloaded ({file_size:,} bytes)")
    jar_paths.append(jar_path)

# Stop current Spark session
#
# Best-effort: a fresh kernel has no `spark` variable yet, and a failure to
# stop an old session should not abort the cell.
try:
    spark.stop()
    print("Stopped existing Spark session")
except NameError:
    # No session has been created in this kernel; nothing to stop.
    pass
except Exception as e:
    # Report the problem instead of hiding it, but keep going so that a new
    # session can still be created below.
    print(f"⚠️ Could not stop existing Spark session: {e}")

# Build the Spark session, passing the local JAR paths collected in
# `jar_paths` to Spark as a comma-separated `spark.jars` value.
compatible_jars_list = ",".join(jar_paths)
print(f"\nUsing compatible JARs: {compatible_jars_list}")

# Configure the builder step by step: application name, cluster master URL,
# and the JARs to load.
session_builder = SparkSession.builder.appName("MongoDBCompatible")
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.jars", compatible_jars_list)
spark = session_builder.getOrCreate()

print("✅ Spark session created with compatible JARs!")

=== Fixing Version Compatibility ===
Trying with older stable connector version...
Downloading MongoDB connector compatible with Spark 3.5.0...
✅ mongo-spark-connector_2.12 already exists
✅ bson already exists
✅ mongodb-driver-core already exists
✅ mongodb-driver-sync already exists

Using compatible JARs: /tmp/spark_jars_compatible/mongo-spark-connector_2.12-3.0.2.jar,/tmp/spark_jars_compatible/bson-4.6.1.jar,/tmp/spark_jars_compatible/mongodb-driver-core-4.6.1.jar,/tmp/spark_jars_compatible/mongodb-driver-sync-4.6.1.jar


26/01/07 23:05:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


✅ Spark session created with compatible JARs!


In [2]:
print("\n=== Testing with Compatible MongoDB Version ===")

# Smoke test: load the `patients` collection through the MongoDB connector,
# then print its schema and a few rows. Failures are reported, not raised.
try:
    # Use the older but stable connector format
    #
    # The connection URI embeds credentials, so it can be supplied through the
    # MONGODB_URI environment variable instead of being edited in the notebook.
    # NOTE(review): the fallback below is the original hard-coded URI, kept so
    # the notebook still runs unchanged — confirm it is a throwaway development
    # credential, or remove the default before sharing this notebook.
    connection_uri = os.environ.get(
        "MONGODB_URI",
        "mongodb://admin:password123@mongodb:27017/stroke_prediction_test_connection.patients?authSource=admin",
    )

    df = (spark.read
          .format("com.mongodb.spark.sql.DefaultSource")
          .option("uri", connection_uri)
          .load())

    print("✅ Successfully connected to MongoDB!")
    print("Collection schema:")
    df.printSchema()

    # Try to show data without count first
    print("Sample data (showing first 5 rows):")
    df.show(5, truncate=False)

except Exception as e:
    print(f"❌ Compatible version failed: {e}")
    print("Trying alternative approach...")


=== Testing with Compatible MongoDB Version ===


                                                                                

✅ Successfully connected to MongoDB!
Collection schema:
root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- patient_id: integer (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)

Sample data (showing first 5 rows):


                                                                                

+--------------------------+---+-----------------+----+------+-------------+------------+----------+---------------+------+
|_id                       |age|avg_glucose_level|bmi |gender|heart_disease|hypertension|patient_id|smoking_status |stroke|
+--------------------------+---+-----------------+----+------+-------------+------------+----------+---------------+------+
|{695bc8030a46bfc3948de666}|67 |228.69           |36.6|Male  |1            |0           |1         |formerly smoked|1     |
|{695bc8030a46bfc3948de667}|61 |202.21           |28.1|Female|0            |0           |2         |never smoked   |1     |
|{695bc8030a46bfc3948de668}|80 |105.92           |32.5|Female|1            |0           |3         |never smoked   |1     |
|{695bc8030a46bfc3948de669}|49 |171.23           |34.4|Female|0            |0           |4         |smokes         |1     |
|{695bc8030a46bfc3948de66a}|79 |174.12           |24.0|Female|0            |1           |5         |never smoked   |1     |
+-------