# Version 1

In [1]:
import logging
import pandas as pd

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

from delta import *

In [2]:
SPARK_CONFIGS = {
    "spark.sql.sources.partitionOverwriteMode": "dynamic",
}

In [3]:
# Global variables or constants
INPUT_PATH = "/data/workspace_files/input/"
OUTPUT_PATH = "/data/workspace_files/output/"

In [4]:
# configuring the logger to print logs
log_format1 = '%(asctime)s [%(levelname)-8s] <PID %(process)d:%(processName)s> %(name)s.%(funcName)s: %(message)s'
log_format2 = '%(asctime)s [%(levelname)-8s] %(name)s.%(funcName)s: %(message)s'
log_format3 = '%(asctime)s [%(levelname)-8s] [%(processName)s] %(name)s: %(message)s'

formatter = logging.Formatter(log_format3, datefmt='%d-%b-%Y %H:%M:%S')

console = logging.StreamHandler()
console.setFormatter(formatter)

file_handler = logging.FileHandler(OUTPUT_PATH + '/logs/student_etl_pyspark_stdout.log', "a")
file_handler.setFormatter(formatter)

logger = logging.getLogger("Students Data ETL")

logger.handlers.clear()

logger.addHandler(console)
logger.addHandler(file_handler)

logger.setLevel(logging.INFO)

In [5]:
spark = SparkSession.builder \
                    .appName("PySpark Excercises") \
                    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1,io.delta:delta-spark_2.12:3.1.0") \
                    .config("spark.sql.warehouse.dir", "/data/workspace_files/warehouse") \
                    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                    .enableHiveSupport() \
                    .getOrCreate()

In [6]:
for key in SPARK_CONFIGS:
    spark.conf.set(key, SPARK_CONFIGS[key])

## 01. Read Data from Files

In [7]:
classes_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/classes.json")
students_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/students_data.json")
address_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/student_address*.json")
phone_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/student_phone.json")
marks_df = spark.read.csv(INPUT_PATH + "students/student_marks.csv", header=True, schema="student_id long, class_id long, subject string, marks_obtained integer")

## 02. Transform Data

In [8]:
address_df = address_df.groupBy(address_df.student_id) \
                        .agg(collect_set(struct( \
                                                address_df.type, \
                                                address_df.street_address.alias("address_line1"), \
                                                address_df.address_line_2, \
                                                address_df.city, \
                                                address_df.state, \
                                                address_df.country, \
                                                address_df.postal_code, \
                                                struct(address_df.latitude, address_df.longitude).alias("location") \
                                               )).alias("address"))

In [8]:
phone_df = phone_df.groupBy(phone_df.student_id) \
                    .agg(collect_set(struct( \
                                            phone_df.type, \
                                            phone_df.phone_number, \
                                            phone_df.email \
                                           )).alias("contact_detail"))

In [9]:
marks_df = marks_df.withColumn("total_marks_obtained", sum(marks_df.marks_obtained).over(Window.partitionBy("student_id", "class_id"))) \
                    .withColumn("total_marks", sum(lit(100)).over(Window.partitionBy("student_id", "class_id")))
marks_df = marks_df.groupBy(marks_df.student_id, marks_df.class_id) \
                    .agg( \
                         first(marks_df.total_marks_obtained, ignorenulls=True).alias("total_marks_obtained"), \
                         first(marks_df.total_marks, ignorenulls=True).alias("total_marks"), \
                         collect_set(struct( \
                                            marks_df.subject, \
                                            marks_df.marks_obtained \
                                           )).alias("progress_report") \
                        )

In [10]:
class_marks_df = marks_df.join(classes_df, [marks_df.class_id == classes_df.class_id], "left") \
                         .drop(classes_df.class_id)

In [11]:
class_marks_df = class_marks_df.groupBy(class_marks_df.student_id) \
                               .agg( \
                                    collect_set(struct( \
                                                       class_marks_df.standard, \
                                                       class_marks_df.section, \
                                                       class_marks_df.total_marks_obtained, \
                                                       ((class_marks_df.total_marks_obtained / class_marks_df.total_marks) * 100).alias("percent"), \
                                                       class_marks_df.progress_report \
                                    )).alias("classes") \
                                   )

In [12]:
final_df = students_df.join(phone_df, [students_df.student_id == phone_df.student_id], "left") \
                      .join(address_df, [students_df.student_id == address_df.student_id], "left") \
                      .join(class_marks_df, [students_df.student_id == class_marks_df.student_id], "left") \
                      .drop(phone_df.student_id, address_df.student_id, class_marks_df.student_id)

In [13]:
final_df = final_df.select( \
                           final_df.student_id.alias("_id"), \
                           struct( \
                                  struct(final_df.first_name, final_df.last_name).alias("name"), \
                                  final_df.date_of_birth, \
                                  final_df.gender, \
                                  final_df.hair_color, \
                                  final_df.eye_color, \
                                  final_df.height, \
                                  final_df.weight, \
                                  final_df.hobbies, \
                                  final_df.library_card_number, \
                                  final_df.meal_plan, \
                                  final_df.sports_team, \
                                  final_df.club_membership, \
                                  final_df.tuition_paid, \
                                  final_df.financial_aid, \
                                  final_df.housing_status, \
                                  final_df.contact_detail, \
                                  final_df.address, \
                                  final_df.classes \
                                 ).alias("student") \
                          )

## 03. Load data

In [15]:
final_df.show(5, truncate=False)

+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
url = "mongodb+srv://admin:Admin1234567890@datacluster.v8sn2hp.mongodb.net/?retryWrites=true&w=majority&appName=DataCluster"
database = "dummy"
collection = "students"

In [15]:
final_df.write.format("mongodb") \
              .mode("append") \
              .option("mode", "dropMalformed") \
              .option("connection.uri", url) \
              .option("database", database) \
              .option("collection", collection) \
              .option("maxBatchSize", 10000) \
              .save()

Py4JJavaError: Py4JJavaError: An error occurred while calling o350.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: mongodb. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:724)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:863)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:257)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:248)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: mongodb.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 16 more


# Version 2

In [1]:
import logging
import pandas as pd

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

from delta import *

In [2]:
SPARK_CONFIGS = {
    "spark.sql.sources.partitionOverwriteMode": "dynamic",
}

In [3]:
# Global variables or constants
INPUT_PATH = "/data/workspace_files/input/"
OUTPUT_PATH = "/data/workspace_files/output/"

In [4]:
# configuring the logger to print logs
log_format1 = '%(asctime)s [%(levelname)-8s] <PID %(process)d:%(processName)s> %(name)s.%(funcName)s: %(message)s'
log_format2 = '%(asctime)s [%(levelname)-8s] %(name)s.%(funcName)s: %(message)s'
log_format3 = '%(asctime)s [%(levelname)-8s] [%(processName)s] %(name)s: %(message)s'

formatter = logging.Formatter(log_format3, datefmt='%d-%b-%Y %H:%M:%S')

console = logging.StreamHandler()
console.setFormatter(formatter)

file_handler = logging.FileHandler(OUTPUT_PATH + '/logs/student_etl_pyspark_stdout.log', "a")
file_handler.setFormatter(formatter)

logger = logging.getLogger("Students Data ETL")

logger.handlers.clear()

logger.addHandler(console)
logger.addHandler(file_handler)

logger.setLevel(logging.INFO)

In [5]:
spark = SparkSession.builder \
                    .appName("PySpark Excercises") \
                    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0") \
                    .config("spark.sql.warehouse.dir", "/data/workspace_files/warehouse") \
                    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                    .enableHiveSupport() \
                    .getOrCreate()

In [6]:
for key in SPARK_CONFIGS:
    spark.conf.set(key, SPARK_CONFIGS[key])

## 01. Read Data from Files

In [7]:
classes_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/classes.json")
students_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/students_data.json")
address_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/student_address*.json")
phone_df = spark.read.format("json").option("multiline", "true").load(INPUT_PATH + "students/student_phone.json")
marks_df = spark.read.csv(INPUT_PATH + "students/student_marks.csv", header=True, schema="student_id long, class_id long, subject string, marks_obtained integer")

## 02. Transform Data

In [8]:
marks_df = marks_df.groupBy(marks_df.student_id, marks_df.class_id) \
                    .agg( \
                         sum(marks_df.marks_obtained).alias("total_marks_obtained"), \
                         sum(lit(100)).alias("total_marks"), \
                         collect_set(struct( \
                                            marks_df.subject, \
                                            marks_df.marks_obtained \
                                           )).alias("progress_report") \
                        )
phone_df = phone_df.withColumnRenamed("type", "phone_type")
address_df = address_df.withColumnRenamed("type", "address_type")

In [9]:
final_df = students_df.join(phone_df, [students_df.student_id == phone_df.student_id], "left") \
                      .join(address_df, [students_df.student_id == address_df.student_id], "left") \
                      .join(marks_df, [students_df.student_id == marks_df.student_id], "left") \
                      .join(classes_df, [marks_df.class_id == classes_df.class_id], "left") \
                      .drop(phone_df.student_id, address_df.student_id, marks_df.student_id, classes_df.class_id)

In [10]:
final_df = final_df.groupBy(final_df.student_id) \
                    .agg( \
                             struct( \
                                    first(final_df.first_name, ignorenulls=True).alias("firstName"), \
                                    first(final_df.last_name, ignorenulls=True).alias("lastName") \
                                   ).alias("name"), \
                             first(final_df.date_of_birth, ignorenulls=True).alias("dateOfBirth"), \
                             first(final_df.gender, ignorenulls=True).alias("gender"), \
                             first(final_df.hair_color, ignorenulls=True).alias("hairColor"), \
                             first(final_df.eye_color, ignorenulls=True).alias("eyeColor"), \
                             first(final_df.height, ignorenulls=True).alias("height"), \
                             first(final_df.weight, ignorenulls=True).alias("weight"), \
                             collect_set(final_df.hobbies).alias("hobbies"), \
                             first(final_df.library_card_number, ignorenulls=True).alias("libraryCardNumber"), \
                             first(final_df.meal_plan, ignorenulls=True).alias("mealPlan"), \
                             first(final_df.sports_team, ignorenulls=True).alias("spartTeam"), \
                             first(final_df.club_membership, ignorenulls=True).alias("clubMembership"), \
                             first(final_df.tuition_paid, ignorenulls=True).alias("isTuitionPaid"), \
                             first(final_df.financial_aid, ignorenulls=True).alias("isFinancialAided"), \
                             first(final_df.housing_status, ignorenulls=True).alias("housingStatus"), \
                             collect_set(struct( \
                                                 final_df.phone_type.alias("type"), \
                                                 final_df.phone_number, \
                                                 final_df.email \
                                                )).alias("contact_detail"), \
                             collect_set(struct( \
                                                final_df.address_type.alias("type"), \
                                                final_df.street_address.alias("address_line1"), \
                                                final_df.address_line_2, \
                                                final_df.city, \
                                                final_df.state, \
                                                final_df.country, \
                                                final_df.postal_code, \
                                                struct(final_df.latitude, final_df.longitude).alias("location") \
                                               )).alias("address"), \
                             collect_set(struct( \
                                                final_df.standard, \
                                                final_df.section, \
                                                marks_df.total_marks_obtained, \
                                                ((marks_df.total_marks_obtained / marks_df.total_marks) * 100).cast("decimal(3,2)").alias("percent"), \
                                                marks_df.progress_report \
                            )).alias("classes")
                           )

In [11]:
final_df = final_df.withColumnRenamed("student_id", "_id")

## 03. Load data

In [12]:
final_df.show(5, truncate=False)

Py4JJavaError: Py4JJavaError: An error occurred while calling o267.showString.
: org.apache.spark.SparkException: Cannot find catalog plugin class for catalog 'spark_catalog': org.apache.spark.sql.delta.catalog.DeltaCatalog.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.catalogPluginClassNotFoundForCatalogError(QueryExecutionErrors.scala:1925)
	at org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:70)
	at org.apache.spark.sql.connector.catalog.CatalogManager.loadV2SessionCatalog(CatalogManager.scala:67)
	at org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$v2SessionCatalog$2(CatalogManager.scala:86)
	at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
	at org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$v2SessionCatalog$1(CatalogManager.scala:86)
	at scala.Option.map(Option.scala:230)
	at org.apache.spark.sql.connector.catalog.CatalogManager.v2SessionCatalog(CatalogManager.scala:85)
	at org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:51)
	at org.apache.spark.sql.connector.catalog.CatalogManager.currentCatalog(CatalogManager.scala:122)
	at org.apache.spark.sql.connector.catalog.CatalogManager.currentNamespace(CatalogManager.scala:93)
	at org.apache.spark.sql.catalyst.optimizer.ReplaceCurrentLike.apply(finishAnalysis.scala:143)
	at org.apache.spark.sql.catalyst.optimizer.ReplaceCurrentLike.apply(finishAnalysis.scala:140)
	at org.apache.spark.sql.catalyst.optimizer.Optimizer$FinishAnalysis$.$anonfun$apply$1(Optimizer.scala:295)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.optimizer.Optimizer$FinishAnalysis$.apply(Optimizer.scala:295)
	at org.apache.spark.sql.catalyst.optimizer.Optimizer$FinishAnalysis$.apply(Optimizer.scala:275)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:222)
	at scala.collection.IndexedSeqOptimized.foldLeft(IndexedSeqOptimized.scala:60)
	at scala.collection.IndexedSeqOptimized.foldLeft$(IndexedSeqOptimized.scala:68)
	at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:38)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:219)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:211)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:211)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:182)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$optimizedPlan$1(QueryExecution.scala:152)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:219)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:219)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:218)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:148)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:144)
	at org.apache.spark.sql.execution.QueryExecution.assertOptimized(QueryExecution.scala:162)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:182)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:179)
	at org.apache.spark.sql.execution.QueryExecution.simpleString(QueryExecution.scala:238)
	at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$explainString(QueryExecution.scala:284)
	at org.apache.spark.sql.execution.QueryExecution.explainString(QueryExecution.scala:252)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:117)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3537)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.delta.catalog.DeltaCatalog
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:60)
	... 65 more


In [43]:
url = "mongodb+srv://admin:Admin1234567890@datacluster.v8sn2hp.mongodb.net/?retryWrites=true&w=majority&appName=DataCluster"
database = "dummy"
collection = "students"

In [47]:
final_df.write.format("mongodb") \
              .mode("append") \
              .option("mode", "dropMalformed") \
              .option("connection.uri", url) \
              .option("database", database) \
              .option("collection", collection) \
              .option("maxBatchSize", 10000) \
              .save()

Py4JJavaError: Py4JJavaError: An error occurred while calling o1179.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: mongodb. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:724)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:863)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:257)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:248)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: mongodb.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 16 more
