In [1]:
from pymongo import MongoClient

from contextlib import contextmanager
import pandas as pd

@contextmanager
def connect_mongodb(username : str, password : str, host : str ,port : str):
    uri = f"mongodb://{username}:{password}@{host}:{port}"
    client  = None
    try:
        print("Connecting to MongoDB")
        client = MongoClient(uri)
        print("Connect to MongoDB")
        yield client
    except Exception as e:
        print(f"Error: {e}")
    finally:
        if client:
            client.close()
            print("Stop Connect to MongoDB")
        else:
            print("Can't generate MongoDB Client")
        
class MongoDB_Operation:
    def __init__(self, client):
        """Initialize MongoDB connection."""
        if not isinstance(client, MongoClient):
            raise TypeError("client must be an instance of MongoClient")
        self.client = client

    def check_database_exists(self, db_name):
        """Check if a database exists."""
        return db_name in self.client.list_database_names()

    def check_collection_exists(self, db_name, collection_name):
        """Check if a collection exists in a database."""
        return collection_name in self.client[db_name].list_collection_names()

    def create_database(self, db_name):
        """Create a database if it does not exist."""
        if self.check_database_exists(db_name):
            print(f"Database '{db_name}' already exists!")
            return self.client[db_name]
        else:
            self.client[db_name].command("ping")
            print(f"Database '{db_name}' has been created successfully.")
            return self.client[db_name]
            
    def create_collection(self, db_name, collection_name):
        """Create a collection if it does not exist."""

        db = self.create_database(db_name)

        if self.check_collection_exists(db_name, collection_name):
            print(f"Collection '{collection_name}' already exists!")
            return db[collection_name]
        else:
            db.create_collection(collection_name)
            print(f"Collection '{collection_name}' has been created successfully.")
            return db[collection_name]

    def find_data(self, db_name, collection_name, query=None):
        """Find data in a collection."""
        if not isinstance(query, dict):
            raise TypeError("Query must be a dictionary")
        if not self.check_database_exists(db_name):
            print(f"Database '{db_name}' does not exist!")
        if not self.check_collection_exists(db_name, collection_name):
            print(f"Collection '{collection_name}' does not exist!")

        query = query or {}
        data = self.client[db_name][collection_name].find(query)
        return pd.DataFrame(data)

    def insert_many(self, db_name, collection_name, data : pd.DataFrame):
        """Insert multiple records into a collection."""
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Data must be a Pandas DataFrame")
        
        coll = self.create_collection(db_name=db_name, collection_name=collection_name)

        data_dict = data.to_dict(orient='records')

        if data_dict:
            coll.insert_many(data_dict)
            print("Data has been added successfully.")
        else:
            print("Empty data, nothing to insert.")

In [2]:
import requests
API_KEY = "2d6e1b290dabf74f65b84431677db2b8"
GET_URL = "https://api.themoviedb.org/3/movie"


def get_movie_data(API_KEY, GET_URL, list_ids):
    list_movies = []
    for movie_id in list_ids:
        url = f'{GET_URL}/{movie_id}?api_key={API_KEY}'
        response = requests.get(url)
        data = response.json()
        list_movies.append(data)
    return list_movies

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Bronze Process") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.1") \
    .config("spark.mongodb.input.uri", "mongodb://ndtien_mongo:ndtien_mongo@mongodb:27017/") \
    .config("spark.mongodb.output.uri", "mongodb://ndtien_mongo:ndtien_mongo@mongodb:27017/") \
    .getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f8ab49ce-52f9-4559-bd2c-55ee95ed8f30;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;10.4.1 in central
	found org.mongodb#mongodb-driver-sync;5.1.4 in central
	[5.1.4] org.mongodb#mongodb-driver-sync;[5.1.1,5.1.99)
	found org.mongodb#bson;5.1.4 in central
	found org.mongodb#mongodb-driver-core;5.1.4 in central
	found org.mongodb#bson-record-codec;5.1.4 in central
:: resolution report :: resolve 2001ms :: artifacts dl 7ms
	:: modules in use:
	org.mongodb#bson;5.1.4 from central in [default]
	org.mongodb#bson-record-codec;5.1.4 from central in [default]
	org.mongodb#mongodb-driver-core;5.1.4 from central in [default]
	org.mongodb#mongodb-driver-sync;5.1.4 from central in [default]
	org.mongodb.spark#mongo-spark-connector_

In [4]:
df = spark.read\
            .format("mongodb")\
            .option("database", "movies_db")\
            .option("collection", "movies_name")\
            .load();

Py4JJavaError: An error occurred while calling o38.load.
: com.mongodb.MongoTimeoutException: Timed out while waiting for a server that matches ReadPreferenceServerSelector{readPreference=primary}. Client view of cluster state is {type=UNKNOWN, servers=[{address=localhost:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused}}]
	at com.mongodb.internal.connection.BaseCluster.createAndLogTimeoutException(BaseCluster.java:392)
	at com.mongodb.internal.connection.BaseCluster.selectServer(BaseCluster.java:148)
	at com.mongodb.internal.connection.SingleServerCluster.selectServer(SingleServerCluster.java:46)
	at com.mongodb.internal.binding.ClusterBinding.getReadConnectionSource(ClusterBinding.java:108)
	at com.mongodb.client.internal.ClientSessionBinding.getConnectionSource(ClientSessionBinding.java:128)
	at com.mongodb.client.internal.ClientSessionBinding.getReadConnectionSource(ClientSessionBinding.java:92)
	at com.mongodb.internal.operation.SyncOperationHelper.withSuppliedResource(SyncOperationHelper.java:141)
	at com.mongodb.internal.operation.SyncOperationHelper.withSourceAndConnection(SyncOperationHelper.java:122)
	at com.mongodb.internal.operation.SyncOperationHelper.lambda$executeRetryableRead$4(SyncOperationHelper.java:186)
	at com.mongodb.internal.operation.SyncOperationHelper.lambda$decorateReadWithRetries$12(SyncOperationHelper.java:289)
	at com.mongodb.internal.async.function.RetryingSyncSupplier.get(RetryingSyncSupplier.java:67)
	at com.mongodb.internal.operation.SyncOperationHelper.executeRetryableRead(SyncOperationHelper.java:191)
	at com.mongodb.internal.operation.SyncOperationHelper.executeRetryableRead(SyncOperationHelper.java:173)
	at com.mongodb.internal.operation.AggregateOperationImpl.execute(AggregateOperationImpl.java:189)
	at com.mongodb.internal.operation.AggregateOperation.execute(AggregateOperation.java:153)
	at com.mongodb.internal.operation.AggregateOperation.execute(AggregateOperation.java:44)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.execute(MongoClientDelegate.java:153)
	at com.mongodb.client.internal.MongoIterableImpl.execute(MongoIterableImpl.java:130)
	at com.mongodb.client.internal.MongoIterableImpl.iterator(MongoIterableImpl.java:90)
	at com.mongodb.client.internal.MongoIterableImpl.forEach(MongoIterableImpl.java:116)
	at com.mongodb.client.internal.MongoIterableImpl.into(MongoIterableImpl.java:125)
	at com.mongodb.spark.sql.connector.schema.InferSchema.lambda$inferSchema$0(InferSchema.java:106)
	at com.mongodb.spark.sql.connector.config.AbstractMongoConfig.withClient(AbstractMongoConfig.java:182)
	at com.mongodb.spark.sql.connector.config.ReadConfig.withClient(ReadConfig.java:47)
	at com.mongodb.spark.sql.connector.schema.InferSchema.inferSchema(InferSchema.java:83)
	at com.mongodb.spark.sql.connector.MongoTableProvider.inferSchema(MongoTableProvider.java:60)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:90)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.loadV2Source(DataSourceV2Utils.scala:140)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:210)
	at scala.Option.flatMap(Option.scala:271)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:208)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


In [None]:
df = spark.read\
            .format("mongodb")\
            .option("uri","mongodb://ndtien_mongo:ndtien_mongo@mongodb:27017/movies_db.movies_name")\
            .option("database","movies_db")\
            .option("collection","movies_name")\
            .load();