IMPORTS AND FUNCTIONS

In [31]:
import os
import configparser
import logging
import pandas as pd
import oracledb

def setup_logging(config):
    """Configure file logging plus an ERROR-level console handler.

    Reads ``log_file`` and ``log_level`` from the ``[logging]`` section of
    *config*, creates the log file's parent directory when it has one,
    configures the root logger through ``logging.basicConfig`` and attaches
    a console handler that only emits ERROR and above.

    Args:
        config: Parsed configuration exposing ``get(section, option)``;
            ``main`` passes a ``configparser.ConfigParser``.

    Raises:
        configparser.Error: If the ``logging`` section or one of its two
            options is missing.
        AttributeError: If ``log_level`` does not name an attribute of the
            ``logging`` module.
    """
    log_file = config.get('logging', 'log_file')
    log_level = config.get('logging', 'log_level')

    # A bare file name (e.g. "app.log") has an empty directory part, and
    # os.makedirs('') raises FileNotFoundError, so only create real paths.
    # exist_ok avoids the check-then-create race of os.path.exists().
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        filename=log_file,
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s:%(levelname)s:%(message)s'
    )

    # Mirror errors to the console in addition to the log file.
    console = logging.StreamHandler()
    console.setLevel(logging.ERROR)
    formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

def fetch_data(user, password, dsn, query):
    """Run *query* against an Oracle database and return the rows as a DataFrame.

    Args:
        user: Database user name passed to ``oracledb.connect``.
        password: Password for *user*.
        dsn: Data source name passed to ``oracledb.connect``.
        query: SQL text to execute; it must produce a result set because the
            column names are taken from ``cursor.description``.

    Returns:
        A ``pandas.DataFrame`` with one column per entry in
        ``cursor.description`` and one row per fetched record.

    Raises:
        Exception: Any error raised while connecting, executing or fetching
            is logged and re-raised unchanged.
    """
    try:
        # The context managers close the cursor and the connection even when
        # execute()/fetchall() raises, so a failed query no longer leaks them.
        with oracledb.connect(user=user, password=password, dsn=dsn) as connection:
            with connection.cursor() as cursor:
                cursor.execute(query)
                columns = [col[0] for col in cursor.description]
                data = cursor.fetchall()
        return pd.DataFrame(data, columns=columns)
    except Exception as e:
        logging.error(f"Error fetching data: {e}")
        raise

def _is_missing(value):
    """Return True when *value* is a scalar null marker (None, NaN, NaT, pd.NA)."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def _to_bind_value(value):
    """Map pandas null markers to None so the statement binds a NULL."""
    return None if _is_missing(value) else value


def _values_differ(source_value, target_value):
    """Compare two cell values, treating any two null markers as equal."""
    source_missing = _is_missing(source_value)
    target_missing = _is_missing(target_value)
    if source_missing or target_missing:
        return source_missing != target_missing
    return bool(source_value != target_value)


def upsert_to_customers(df, app_user, app_password, app_dsn, key_columns, source_suffix, target_suffix):
    """Insert or update rows of APP.CUSTOMERS from a merged source/target frame.

    For every row of *df*:

    * if any ``<key><target_suffix>_exists`` flag is falsy, the row is
      inserted using the key columns plus every ``<name><source_suffix>``
      column (stored under ``<name>``);
    * otherwise each ``<name><source_suffix>`` column is compared with its
      ``<name><target_suffix>`` counterpart and only the differing columns
      are updated, matched on the key columns.

    All statements run on one connection and are committed together at the
    end; if a statement fails the connection is closed without committing.

    Args:
        df: Merged frame holding the key columns, the suffixed source/target
            columns and one ``<key><target_suffix>_exists`` column per key.
        app_user: Database user name for the target schema.
        app_password: Password for *app_user*.
        app_dsn: Data source name of the target database.
        key_columns: List of column names identifying a row.
        source_suffix: Suffix marking columns that carry the new values.
        target_suffix: Suffix marking columns that carry the current values.

    Raises:
        Exception: Any error during the operation is logged and re-raised
            unchanged.
    """
    if df.empty:
        # Nothing to write, so do not open a database connection at all.
        return

    try:
        source_len = len(source_suffix)
        known_columns = set(df.columns)

        # Pair each source column with its un-suffixed name once, instead of
        # re-scanning df.columns for every row. Slicing by explicit length
        # keeps the name intact even for an empty suffix.
        source_columns = [
            (column, column[:len(column) - source_len])
            for column in df.columns
            if column.endswith(source_suffix)
        ]
        exists_flags = [f"{key}{target_suffix}_exists" for key in key_columns]
        key_condition = ' AND '.join(f"{key} = :{key}" for key in key_columns)

        with oracledb.connect(user=app_user, password=app_password, dsn=app_dsn) as connection:
            with connection.cursor() as cursor:
                for _, row in df.iterrows():
                    if not all(row[flag] for flag in exists_flags):
                        # Row is absent from the target: insert keys + source values.
                        insert_binds = {key: _to_bind_value(row[key]) for key in key_columns}
                        for column, base_column in source_columns:
                            if base_column not in key_columns:
                                insert_binds[base_column] = _to_bind_value(row[column])

                        insert_query = (
                            f"INSERT INTO APP.CUSTOMERS ({', '.join(insert_binds)}) "
                            f"VALUES ({', '.join(f':{name}' for name in insert_binds)})"
                        )
                        cursor.execute(insert_query, insert_binds)
                    else:
                        # Row exists: update only the columns whose value changed.
                        update_columns = []
                        bind_dict = {key: _to_bind_value(row[key]) for key in key_columns}

                        for column, base_column in source_columns:
                            target_column = base_column + target_suffix
                            if target_column not in known_columns:
                                continue
                            # A plain != reports NaN != NaN as a change, which
                            # would rewrite NULL columns on every run.
                            if _values_differ(row[column], row[target_column]):
                                update_columns.append(f"{base_column} = :{base_column}")
                                bind_dict[base_column] = _to_bind_value(row[column])

                        if update_columns:
                            update_query = (
                                f"UPDATE APP.CUSTOMERS SET {', '.join(update_columns)} "
                                f"WHERE {key_condition}"
                            )
                            cursor.execute(update_query, bind_dict)

            connection.commit()
    except Exception as e:
        logging.error(f"Error during upsert operation: {e}")
        raise

MAIN 

In [35]:
def main(env='production'):
    """Synchronize APP.CUSTOMERS with ETL.S_CUSTOMERS for one environment.

    Reads ``config/config.ini``, takes the credentials from the
    ``<env>_etl`` and ``<env>_app`` sections, loads both tables, left-merges
    the ETL rows with the APP rows on the key columns and hands the merged
    frame to ``upsert_to_customers``.

    Args:
        env: Prefix of the configuration sections to use.

    Raises:
        FileNotFoundError: If ``config/config.ini`` could not be read.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        config_path = 'config/config.ini'
        config = configparser.ConfigParser()
        # ConfigParser.read() silently skips files it cannot open and returns
        # the list of files it parsed; without this check a missing file only
        # shows up later as a KeyError on the section name.
        if not config.read(config_path):
            raise FileNotFoundError(f"Configuration file not found or unreadable: {config_path}")

        etl_section = f'{env}_etl'
        app_section = f'{env}_app'

        etl_user = config[etl_section]['username']
        etl_password = config[etl_section]['password']
        etl_dsn = config[etl_section]['dsn']
        app_user = config[app_section]['username']
        app_password = config[app_section]['password']
        app_dsn = config[app_section]['dsn']

        setup_logging(config)
        logging.info(f'Starting the database operations script in {env} environment.')

        etl_query = "SELECT * FROM ETL.S_CUSTOMERS"
        etl_data = fetch_data(etl_user, etl_password, etl_dsn, etl_query)

        app_query = "SELECT * FROM APP.CUSTOMERS"
        app_data = fetch_data(app_user, app_password, app_dsn, app_query)

        source_suffix = "_etl"  # as this is oracle DB, usually name of the schema
        target_suffix = "_app"
        # Single source of truth for the key: used for the merge, the
        # existence flags and the upsert, so the three cannot drift apart.
        key_columns = ["CUSTOMER_ID"]

        df_merged = etl_data.merge(
            app_data,
            on=key_columns,
            how="left",
            suffixes=(source_suffix, target_suffix),
            indicator=True,
        )

        # Flag rows whose key was found in both frames; upsert_to_customers
        # reads these "<key><target_suffix>_exists" columns.
        row_in_target = df_merged['_merge'] == 'both'
        for key in key_columns:
            df_merged[f"{key}{target_suffix}_exists"] = row_in_target

        print(df_merged)
        # Perform the upsert operation
        upsert_to_customers(df_merged, app_user, app_password, app_dsn, key_columns, source_suffix, target_suffix)

        logging.info('Data synchronization between ETL.S_CUSTOMERS and APP.CUSTOMERS completed.')
    except Exception as e:
        logging.error(f"Error in main function: {e}")
        raise

# Run the synchronization with main()'s default environment when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    main()

   CUSTOMER_ID EMAIL_ADDRESS_etl FULL_NAME_etl  REC_STS           REC_TMSTP  \
0          392                 1             2        1 2024-09-01 16:03:32   
1         1666                 3             4        1 2024-08-30 17:18:39   

  EMAIL_ADDRESS_app FULL_NAME_app _merge  CUSTOMER_ID_app_exists  
0      TESTfsfsdsfd        111111   both                    True  
1              ffda     222222222   both                    True  
