---

#### Importing necessary libraries and packages


In [20]:
import os
import pandas as pd
import json
from pymongo import MongoClient
from dotenv import load_dotenv
import psycopg2
from psycopg2 import sql
import datetime

In [21]:
class MongoToPostgresELT:

    # --------------------------------------------------------
    # Initialize Environment Variables and Database Connection
    def __init__(self):
        """
        Initializes the MongoToPostgresETL class.
        Loads environment variables and sets up MongoDB and PostgreSQL connection parameters.
        """

        # Load environment variables from .env file
        load_dotenv()

        self.mongo_url = os.getenv("MONGO_DB_URL")
        self.mongo_db_database_name = os.getenv("MONGODB_DB_NAME")
        self.mongo_client = MongoClient(self.mongo_url)
        self.db = self.mongo_client[
            self.mongo_db_database_name
        ]  # Initialize the database here

        # PostgreSQL connection parameters from .env
        self.pg_host = os.getenv("PG_HOST")
        self.pg_database = os.getenv("PG_DATABASE")
        self.pg_user = os.getenv("PG_USER")
        self.pg_password = os.getenv("PG_PASSWORD")
        self.pg_port = os.getenv("PG_PORT")
        self.pg_connection = None

    def get_mongo_client(self) -> MongoClient:
        """
        Establishes and returns a MongoDB client connection.

        Returns:
        MongoClient: MongoDB client instance."""

        if self.mongo_client is None:
            self.mongo_client = MongoClient(self.mongo_url)
        return self.mongo_client

    def connect_to_postgres(self):
        """
        Establish a connection to the PostgreSQL database.

        Returns:
        connection: A connection object to the PostgreSQL database."""

        try:
            connection = psycopg2.connect(
                dbname=os.getenv("PG_DATABASE"),
                user=os.getenv("PG_USER"),
                password=os.getenv("PG_PASSWORD"),
                host=os.getenv("PG_HOST"),
                port=os.getenv("PG_PORT"),
            )
            print("Connection to PostgreSQL established successfully.")
            return connection
        except Exception as e:
            print(f"Failed to connect to PostgreSQL: {e}")
            return None

    # ---------------------------------------------------------
    # load callection as dataframe to work on them as required.

    def load_collection_as_dataframe(self, collection_name: str) -> pd.DataFrame:
        """
        Converts a MongoDB collection into a pandas DataFrame.

        Parameters:
        collection_name (str): The name of the MongoDB collection.
        db_name (str): The MongoDB database name.

        Returns:
        pd.DataFrame: A DataFrame containing the MongoDB collection data."""

        client = self.get_mongo_client()
        db = client[self.mongo_db_database_name]
        collection = db[collection_name]
        data = list(collection.find())

        # Convert the data to a DataFrame
        df = pd.DataFrame(data)

        # Check if '_id' column exists and drop it if it does
        if "_id" in df.columns:
            df = df.drop(columns=["_id"])

        return df

    def load_dataframe_to_postgres(
        self, df: pd.DataFrame, table_name: str, json_columns: list = []
    ):
        """
        Writes a pandas DataFrame to a PostgreSQL table, with support for JSON fields.

        Parameters:
        df (pd.DataFrame): The DataFrame to be written to PostgreSQL.
        table_name (str): The name of the PostgreSQL table where data will be inserted.
        json_columns (list): List of column names in the DataFrame that contain JSON data.
                            These columns will be converted to JSON strings and cast to 'jsonb' in PostgreSQL.
        """
        # Establish connection for this operation
        connection = self.connect_to_postgres()  # Always establish a new connection
        cursor = connection.cursor()

        # Handle conversion of specified columns to JSON strings
        for column in json_columns:
            df[column] = df[column].apply(json.dumps)

        # Prepare the columns and placeholders for SQL query
        columns = df.columns.tolist()

        # Create SQL query for insertion, casting JSON columns to jsonb
        insert_query = sql.SQL(
            """
            INSERT INTO {} ({}) 
            VALUES ({})"""
        ).format(
            sql.Identifier(table_name),
            sql.SQL(", ").join(map(sql.Identifier, columns)),
            sql.SQL(", ").join(
                sql.SQL("%s::jsonb") if col in json_columns else sql.Placeholder()
                for col in columns
            ),
        )

        # Convert DataFrame rows to tuples
        data_tuples = [tuple(row) for row in df.itertuples(index=False)]

        try:
            # Execute the insertion query
            cursor.executemany(insert_query, data_tuples)
            connection.commit()
            print(f"Data successfully loaded into {table_name} table.")
        except Exception as error:
            print(f"Error inserting data into PostgreSQL: {error}")
            connection.rollback()
        finally:
            cursor.close()
            connection.close()

    # --------------------------------------------------
    # function to perfrom full load and incremental load

    def perform_full_load(self, dataframes: dict):
        """
        Loads all DataFrames from the provided dictionary into corresponding PostgreSQL tables.

        Parameters:
        dataframes (dict): A dictionary where keys are DataFrame names (e.g., 'customers__df')
                           and values are the corresponding pandas DataFrames."""

        for df_name, df in dataframes.items():
            table_name = df_name.replace(
                "__df", ""
            )  # Remove '__df' to get the table name

            # Determine if any JSON columns need special handling
            json_columns = (
                []
            )  # Define any specific columns that are JSON formatted if needed

            # Check for JSON columns in the DataFrame
            if "new_loan_terms" in df.columns or "restructure_terms" in df.columns:
                json_columns = [
                    "new_loan_terms",
                    "restructure_terms",
                ]  # Adjust based on your DataFrame structure

            try:
                self.load_dataframe_to_postgres(
                    df, f"tbl_{table_name}", json_columns=json_columns
                )  # Call the loading function
                print(f"Successfully loaded data into table: tbl_{table_name}")

            except Exception as e:
                print(f"Error loading data into table {table_name}: {e}")

    def incremental_load(
        self,
        old_dataframe: pd.DataFrame,
        collection_name: str,
        table_name: str,
        load_type: str,
        added_at_col: str = "added_at",
        modified_at_col: str = "modified_at",
    ):
        """
        Performs an incremental load of data from a MongoDB collection to a PostgreSQL table.

        Parameters:
        collection_name (str): The name of the MongoDB collection to load data from.
        table_name (str): The name of the PostgreSQL table to load data into.
        load_type (str): The type of load operation ('insertion' or 'updation').
        added_at_col (str): The name of the column used for new records.
        modified_at_col (str): The name of the column used for updated records."""

        # Mapping of table names to their unique identifier columns
        unique_id_mapping = {
            "tbl_customers": "customer_id",
            "tbl_loan_types": "loan_type_id",
            "tbl_loan_applications": "loan_id",
            "tbl_loan_repayments": "repayment_id",
            "tbl_loan_history": "history_id",
            "tbl_loan_collateral": "collateral_id",
            "tbl_loan_restructuring": "restructuring_id",
            "tbl_loan_disbursements": "disbursement_id",
        }

        # Determine last loaded state
        old_load_last_state = old_dataframe[added_at_col].max()
        old_update_last_state = old_dataframe[modified_at_col].max()

        # # Fetch the latest DataFrame from the MongoDB collection
        df = self.load_collection_as_dataframe(collection_name)

        if load_type == "insertion":
            #     for new records
            new_records = df[df[added_at_col] > old_load_last_state]
            print(new_records)
            if not new_records.empty:
                self.load_dataframe_to_postgres(new_records, table_name)

        elif load_type == "updation":
            # Filter for updated records
            updated_records = df[df[modified_at_col] > old_update_last_state]
            if not updated_records.empty:
                unique_id_col = unique_id_mapping.get(table_name)
                if unique_id_col is None:
                    print(f"No unique identifier mapping found for table: {table_name}")
                    return

                for index, row in updated_records.iterrows():
                    self.update_record_in_postgres(row, table_name, unique_id_col)

    # -------------------------------------------------------------------
    # Function to Normalize tables and load into their repspective tables

    def normalize_nested_fields(
        self, df: pd.DataFrame, table_name: str, nested_fields: dict
    ):
        """
        Normalizes nested fields in a DataFrame and inserts data into PostgreSQL.

        Parameters:
        df (pd.DataFrame): The DataFrame containing data with nested fields.
        table_name (str): The name of the main PostgreSQL table.
        nested_fields (dict): A dictionary where keys are nested fields in df,
                            and values are the corresponding table names in PostgreSQL.
        """
        # Iterate over each nested field
        for nested_field, nested_table_name in nested_fields.items():
            if nested_field in df.columns:
                # Normalize the nested field into a separate DataFrame
                nested_df = pd.json_normalize(df[nested_field])
                nested_df.columns = [
                    col.replace(".", "_") for col in nested_df.columns
                ]  # Flatten column names

                # Add unique IDs for the nested records
                nested_df[f"{nested_table_name}_id"] = range(1, len(nested_df) + 1)

                # Merge the IDs back to the main DataFrame
                df = df.join(nested_df[f"{nested_table_name}_id"])

                # Insert nested data into its respective table
                self.load_dataframe_to_postgres(nested_df, nested_table_name)

                # Drop original nested column from main DataFrame
                df = df.drop(columns=[nested_field])

        # Insert the main table with foreign keys for the nested tables
        self.load_dataframe_to_postgres(df, table_name)

    # -----------------------------------------------------------
    # Helper Function to for insertion and updation into MongoDB

    def insert_document(self, collection_name: str, document: dict):
        """
        Inserts a document into the specified MongoDB collection only if
        a document with the same unique identifier does not already exist.

        Parameters:
        collection_name (str): The name of the MongoDB collection.
        document (dict): The document to insert into the collection.
        """

        unique_id_mapping = {
            "customers": "customer_id",
            "loan_types": "loan_type_id",
            "loan_applications": "loan_id",
            "loan_repayments": "repayment_id",
            "loan_history": "history_id",
            "loan_collateral": "collateral_id",
            "loan_restructuring": "restructuring_id",
            "loan_disbursements": "disbursement_id",
        }

        # Ensure collection has a mapped unique identifier
        unique_id_field = unique_id_mapping.get(collection_name)
        if unique_id_field is None:
            print(
                f"No unique identifier mapping found for collection '{collection_name}'. Document not inserted."
            )
            return

        # Connect to collection
        collection = self.db[collection_name]

        # Check for existence of the document by unique identifier
        if unique_id_field in document:
            existing_doc = collection.find_one(
                {unique_id_field: document[unique_id_field]}
            )
            if existing_doc:
                print(
                    f"Document with {unique_id_field} = {document[unique_id_field]} already exists in collection '{collection_name}'."
                )
                return

        # Insert the document if unique identifier check passes
        result = collection.insert_one(document)
        print(f"Document inserted with ID: {result.inserted_id}")

    def update_document(self, collection_name: str, query: dict, update: dict):
        """
        Updates an existing document in the specified MongoDB collection.

        Parameters:
        collection_name (str): The name of the MongoDB collection.
        query (dict): The query to identify the document to update.
        update (dict): The updates to apply to the document."""

        collection = self.db[collection_name]
        result = collection.update_one(query, {"$set": update})
        if result.matched_count > 0:
            print(f"Document updated: {result.modified_count} document(s) modified.")
        else:
            print("No document found matching the query.")

    def update_record_in_postgres(
        self, record: pd.Series, table_name: str, unique_id_col: str
    ):
        """
        Updates an existing record in the PostgreSQL table based on a unique identifier.

        Parameters:
        record (pd.Series): The record to update.
        table_name (str): The name of the PostgreSQL table to update.
        unique_id_col (str): The name of the unique identifier column used for the update.
        """

        # Establish connection for this operation
        connection = self.connect_to_postgres()
        cursor = connection.cursor()

        # Prepare the SQL UPDATE statement
        set_clause = ", ".join(
            [f"{col} = %s" for col in record.index if col != unique_id_col]
        )  # Exclude unique identifier
        update_query = f"""
            UPDATE {table_name} 
            SET {set_clause} 
            WHERE {unique_id_col} = %s
        """

        # Values to be updated
        values = tuple(record[col] for col in record.index if col != unique_id_col) + (
            record[unique_id_col],
        )

        try:
            # Execute the update query
            cursor.execute(update_query, values)
            connection.commit()
            print(
                f"Record with {unique_id_col} {record[unique_id_col]} updated successfully."
            )
        except Exception as error:
            print(f"Error updating record in PostgreSQL: {error}")
            connection.rollback()
        finally:
            cursor.close()
            connection.close()

In [22]:
elt = MongoToPostgresELT()

In [23]:
# List of MongoDB collections
collections = [
    "customers",
    "loan_types",
    "loan_applications",
    "loan_repayments",
    "loan_history",
    "loan_collateral",
    "loan_restructuring",
    "loan_disbursements",
]

# Dictionary to hold DataFrames
collections_dict = {}

# Loop through each collection name and load it into a DataFrame
for collection in collections:
    df_name = f"{collection}"  # Create a dynamic variable name
    collections_dict[df_name] = elt.load_collection_as_dataframe(
        collection
    )  # Load DataFrame

# Accessing the DataFrames
customers__df = collections_dict["customers"]
loan_types__df = collections_dict["loan_types"]
loan_applications__df = collections_dict["loan_applications"]
loan_repayments__df = collections_dict["loan_repayments"]
loan_history__df = collections_dict["loan_history"]
loan_collateral__df = collections_dict["loan_collateral"]
loan_restructuring__df = collections_dict["loan_restructuring"]
loan_disbursements__df = collections_dict["loan_disbursements"]

In [24]:
loan_restructuring__df.head()

Unnamed: 0,restructuring_id,loan_id,new_loan_terms,restructure_terms,added_at,modified_at
0,445645329959805120,663194010682136064,"{'interest_rate': 5, 'repayment_period_in_mont...","{'reason': 'Financial difficulties', 'new_sche...",2024-10-24,2024-10-24
1,618227701498760192,919140988712724352,"{'interest_rate': 3, 'repayment_period_in_mont...","{'reason': 'Unexpected expenses', 'new_schedul...",2024-10-24,2024-10-24
2,189098082925672288,535611310051475456,"{'interest_rate': 4, 'repayment_period_in_mont...","{'reason': 'Unexpected expenses', 'new_schedul...",2024-10-24,2024-10-24
3,477779993814011008,863687840789048960,"{'interest_rate': 4, 'repayment_period_in_mont...","{'reason': 'Interest rate reduction', 'new_sch...",2024-10-24,2024-10-24
4,620276799861859456,375253958161880512,"{'interest_rate': 4, 'repayment_period_in_mont...","{'reason': 'Interest rate reduction', 'new_sch...",2024-10-24,2024-10-24


In [25]:
elt.perform_full_load(collections_dict)

Connection to PostgreSQL established successfully.
Data successfully loaded into tbl_customers table.
Successfully loaded data into table: tbl_customers
Connection to PostgreSQL established successfully.
Data successfully loaded into tbl_loan_types table.
Successfully loaded data into table: tbl_loan_types
Connection to PostgreSQL established successfully.
Data successfully loaded into tbl_loan_applications table.
Successfully loaded data into table: tbl_loan_applications
Connection to PostgreSQL established successfully.
Data successfully loaded into tbl_loan_repayments table.
Successfully loaded data into table: tbl_loan_repayments
Connection to PostgreSQL established successfully.
Data successfully loaded into tbl_loan_history table.
Successfully loaded data into table: tbl_loan_history
Connection to PostgreSQL established successfully.
Data successfully loaded into tbl_loan_collateral table.
Successfully loaded data into table: tbl_loan_collateral
Connection to PostgreSQL establish

In [8]:
document = {
    "customer_id": 619387217110,
    "first_name": "Ashlan",
    "last_name": "Birdall",
    "gender": "Female",
    "age": 78,
    "employment_status": "unemployed",
    "income_level": "high",
    "location": "Indianapolis",
    "joined_date": "2018-09-11",
}

""" 
- This function will insert a document into the customers collection into the MongoDB.
- This also be added by adding can also be achieved using the MongDB compass. 
- But we decided to do this programmatically in order to avoid switching from different environment."""

elt.insert_document("customers", document)

Document inserted with ID: 671b33052389b0eb19c29cb0


In [34]:
elt.incremental_load(customers__df, "customers", "tbl_customers", "insertion")

       customer_id first_name   last_name  gender  age employment_status  \
6000  569676395005      Becka  Wennington  Female   45          employed   

     income_level       location joined_date                added_at  \
6000          low  New York City  2018-09-01 2024-10-25 00:53:40.161   

                 modified_at  
6000 2024-10-25 00:53:40.161  
Connection to PostgreSQL established successfully.
Data successfully loaded into tbl_customers table.


In [39]:
elt.update_document(
    "customers", {"customer_id": 569676395005}, {"income_level": "high"}
)

Document updated: 1 document(s) modified.


In [46]:
elt.incremental_load(customers__df, "customers", "tbl_customers", "updation")

Connection to PostgreSQL established successfully.
Record with customer_id 569676395005 updated successfully.


Unnamed: 0,restructuring_id,loan_id,new_loan_terms,restructure_terms
0,445645329959805120,663194010682136064,"{""interest_rate"": 5, ""repayment_period_in_mont...","{""reason"": ""Financial difficulties"", ""new_sche..."
1,618227701498760192,919140988712724352,"{""interest_rate"": 3, ""repayment_period_in_mont...","{""reason"": ""Unexpected expenses"", ""new_schedul..."
2,189098082925672288,535611310051475456,"{""interest_rate"": 4, ""repayment_period_in_mont...","{""reason"": ""Unexpected expenses"", ""new_schedul..."
3,477779993814011008,863687840789048960,"{""interest_rate"": 4, ""repayment_period_in_mont...","{""reason"": ""Interest rate reduction"", ""new_sch..."
4,620276799861859456,375253958161880512,"{""interest_rate"": 4, ""repayment_period_in_mont...","{""reason"": ""Interest rate reduction"", ""new_sch..."
