In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
import uuid
from datetime import datetime
from sqlalchemy import create_engine
from import_mapping import * 

# Connection settings for the database reached through `engine`.
# Every value can be supplied through an environment variable so credentials
# do not have to be stored in the notebook; the literals are fallbacks that
# keep the previous behaviour when a variable is not set.
# NOTE(review): the fallback password is still in source - move it to the
# environment and drop the literal once every runner sets IMPORT_DB_PASSWORD.
db_config = {
    'dbname': os.environ.get('IMPORT_DB_NAME', 'import_final'),
    'user': os.environ.get('IMPORT_DB_USER', 'postgres'),
    'password': os.environ.get('IMPORT_DB_PASSWORD', '123'),
    'host': os.environ.get('IMPORT_DB_HOST', 'localhost'),
    'port': os.environ.get('IMPORT_DB_PORT', '5432'),
}

# The values are interpolated verbatim into the URL, so a user name or
# password containing URL-reserved characters (e.g. "@", ":", "/") has to be
# percent-encoded before it is placed in the environment.
db_url = f"postgresql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['dbname']}"
engine = create_engine(db_url)


# Connection settings for the database reached through `engine_1`.
# Every value can be supplied through an environment variable so the remote
# host and its credentials do not have to be stored in the notebook; the
# literals are fallbacks that keep the previous behaviour when a variable is
# not set.
# NOTE(review): the fallback password is still in source - move it to the
# environment (MEB_DB_PASSWORD) and drop the literal as soon as possible.
db_config_1 = {
    'dbname': os.environ.get('MEB_DB_NAME', 'meb'),
    'user': os.environ.get('MEB_DB_USER', 'postgres'),
    # "%40" is the percent-encoded form of "@"; the value is placed in the URL
    # as-is, so an override must be percent-encoded the same way.
    'password': os.environ.get('MEB_DB_PASSWORD', 'MyTTT%401234'),
    'host': os.environ.get('MEB_DB_HOST', '203.154.82.165'),
    'port': os.environ.get('MEB_DB_PORT', '5432'),
}
db_url_1 = f"postgresql://{db_config_1['user']}:{db_config_1['password']}@{db_config_1['host']}:{db_config_1['port']}/{db_config_1['dbname']}"
engine_1 = create_engine(db_url_1)

# File-name suffix used to select input files in the folder scan.
SGN_EXTENSION = ".sgn"
# Suffix for plain XML files; not referenced in the visible part of this notebook.
XML_EXTENSION = ".xml"

def generate_unique_id():
    """Return a newly generated random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)

def process_element_xml(element, namespaces, parent_prefix=""):
    """Flatten the descendants of ``element`` into a single dict.

    Keys are the namespace-stripped tag names of each descendant joined to
    their ancestors' names with "_" (starting from ``parent_prefix`` when it
    is non-empty). Values are the stripped text of the element; elements with
    no text, or only whitespace, contribute no entry of their own. When two
    elements produce the same key, the later one wins.

    Args:
        element: Element whose children are walked recursively.
        namespaces: Passed through unchanged to the recursive calls; it is
            not otherwise used here.
        parent_prefix: Key prefix for the children of ``element``.

    Returns:
        dict mapping flattened tag paths to text.
    """
    flattened = {}
    for node in element:
        # "{uri}Local" -> "Local"; a tag without "}" is kept unchanged.
        local_name = node.tag.rsplit('}', 1)[-1]
        key = f"{parent_prefix}_{local_name}" if parent_prefix else local_name

        stripped = (node.text or "").strip()
        if stripped:
            flattened[key] = stripped

        # Descend after recording the node itself so parent keys come first.
        flattened.update(process_element_xml(node, namespaces, parent_prefix=key))
    return flattened

def get_namespaces(xml_file):
    """Collect the namespace declarations made on the document's root element.

    Parsing stops at the first element start event, so only prefixes declared
    before/at the root are returned. The default namespace, if declared, is
    stored under the empty-string prefix.

    Args:
        xml_file: Path to an XML file, or an already-open file-like object.

    Returns:
        dict mapping namespace prefix to namespace URI.
    """
    def _scan(source):
        found = {}
        for event, payload in ET.iterparse(source, events=("start", "start-ns")):
            if event == 'start':
                # First real element reached: every root-level declaration
                # has already been reported.
                break
            prefix, uri = payload
            found[prefix] = uri
        return found

    if hasattr(xml_file, 'read'):
        # Caller owns the file object; leave it open.
        return _scan(xml_file)

    # Open the file ourselves: iterparse only closes a file it opened once the
    # iteration is exhausted, and this scan deliberately stops early.
    with open(xml_file, 'rb') as handle:
        return _scan(handle)

def process_file(full_path):
    """Parse one file and return a record per ``DocumentControl`` element.

    Each record is the flattened content of the element (see
    ``process_element_xml``) plus three extra keys:

    * ``ic_filename`` - base name of ``full_path``
    * ``ic_created_date`` - datetime parsed from the fourth "_"-separated
      part of the base name (text before the first "."), format
      ``%Y%m%d%H%M%S``
    * ``ic_id`` - value from ``generate_unique_id()``

    Errors are printed rather than raised; whatever records were collected
    before the error are still returned.

    Args:
        full_path: Path of the file to read (opened as UTF-8 text).

    Returns:
        list of dicts, possibly empty.
    """
    records = []

    try:
        ns_map = get_namespaces(full_path)
        with open(full_path, 'r', encoding='utf-8') as handle:
            xml_root = ET.parse(handle).getroot()

        file_name = os.path.basename(full_path)
        for control_node in xml_root.findall('.//DocumentControl', ns_map):
            record = process_element_xml(control_node, ns_map)
            record['ic_filename'] = file_name
            # Parsed per element on purpose: a file without DocumentControl
            # elements never triggers a file-name format error.
            stamp = file_name.split("_")[3].split(".")[0]
            record['ic_created_date'] = datetime.strptime(stamp, "%Y%m%d%H%M%S")
            record['ic_id'] = generate_unique_id()
            records.append(record)
    except ET.ParseError as err:
        print(f"Error parsing XML: {err}")
    except Exception as err:
        print(f"Error processing file {full_path}: {err}")

    return records



In [None]:
def save_to_database(engine, document_data):
    """Append the given records to the ``import_control`` table.

    Nothing is written when ``document_data`` is empty or otherwise falsy.
    Column names are renamed through ``column_mapping`` before the insert.

    Args:
        engine: Database connectable handed to ``DataFrame.to_sql``.
        document_data: Records to store, in any form ``pd.DataFrame`` accepts.
    """
    if not document_data:
        return

    renamed = pd.DataFrame(document_data).rename(columns=column_mapping)
    renamed.to_sql('import_control', engine, if_exists='append', index=False)

# Root folder whose immediate sub-directories are scanned below.
# The assignment used to be commented out, so on a fresh kernel this cell
# failed with NameError unless `pathFolder` was provided from elsewhere
# (for example by the star import). Define it only when it is missing so an
# existing value is never overridden; SGN_INPUT_DIR lets each machine point
# at its own folder without editing the notebook.
try:
    pathFolder
except NameError:
    pathFolder = os.environ.get(
        "SGN_INPUT_DIR", r"C:\Users\rinlapas\Desktop\202309_September_sgn"
    )

# Walk every sub-directory in sorted order, parse each matching file and
# append its records to the database one file at a time.
for day_folder in sorted(os.listdir(pathFolder)):
    day_path = os.path.join(pathFolder, day_folder)
    if not os.path.isdir(day_path):
        continue
    print(day_path)
    for root, dirs, files in os.walk(day_path):
        for filename in files:
            if filename.startswith('ebxml_IMDECL') and filename.endswith(SGN_EXTENSION):
                full_path = os.path.join(root, filename)
                document_data = process_file(full_path)

                save_to_database(engine, document_data)

In [None]:
# File name -> message id pairs, read through `engine_1`.
# The column names are double-quoted in the SQL because they are mixed-case.
med_query = "SELECT \"fileName\", \"messageId\" FROM meb_sent"
med_df = pd.read_sql(med_query, engine_1)

# All rows of `import_control`, the table that save_to_database() appends to.
control_query = "SELECT * FROM import_control"
import_control_df = pd.read_sql(control_query, engine)

# All rows of `response_import`; later cells read its 'res_message_id' column.
res_query = "SELECT * FROM response_import"
response_import_df = pd.read_sql(res_query, engine)

In [None]:
# Lookup table: sent file name -> message id.
ms_filename_to_id = med_df.set_index('fileName')['messageId'].to_dict()

# Drop a trailing ".sgn" once and reuse the result for both columns.
ic_filename_stem = import_control_df['ic_filename'].str.replace(r'\.sgn$', '', regex=True)

# The lookup is keyed on the name without the ".sgn" suffix.
import_control_df['ic_ms_messageid'] = ic_filename_stem.map(ms_filename_to_id)

# Normalise the stored name so it ends with exactly one ".sgn".
import_control_df['ic_filename'] = ic_filename_stem + '.sgn'


In [None]:
# Step 1: Ensure 'ic_created_date' is in datetime format.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
import_control_df['ic_created_date'] = pd.to_datetime(import_control_df['ic_created_date'], errors='coerce')

# Step 2: Initial flag value - True exactly where 'ic_ms_messageid' is present.
import_control_df['ic_is_use'] = import_control_df['ic_ms_messageid'].notnull()

# Step 3: Process rows where 'ic_ms_messageid' is empty
empty_ic_ms_messageid = import_control_df[import_control_df['ic_ms_messageid'].isnull()]

# NOTE(review): groupby skips rows whose 'ic_reference_number' is null by
# default, so such rows keep the False assigned in Step 2 - confirm intent.
for ic_reference_number, group in empty_ic_ms_messageid.groupby('ic_reference_number'):
    if len(group) == 1:
        # Case 3.1: only one row without a message id has this reference number
        import_control_df.loc[group.index, 'ic_is_use'] = True
    else:
        # Case 3.2: several rows without a message id share this reference number
        # Find the most recent 'ic_created_date' in the group
        # NOTE(review): if every date in the group is NaT (possible after the
        # coercion in Step 1) idxmax has no valid label to return; what happens
        # then depends on the pandas version - verify.
        most_recent_index = group['ic_created_date'].idxmax()
        # Switch the whole group off, then switch the most recent row back on.
        import_control_df.loc[group.index, 'ic_is_use'] = False
        import_control_df.loc[most_recent_index, 'ic_is_use'] = True

# Convert boolean values to 't' and 'f'
# From here on the column holds strings, so it must not be tested for truthiness.
import_control_df['ic_is_use'] = import_control_df['ic_is_use'].replace({True: 't', False: 'f'})

# Step 4: For reference numbers that occur on more than one row, switch off
# the rows without a message id whenever a sibling row has one.
duplicates = import_control_df[import_control_df.duplicated('ic_reference_number', keep=False)]

for ic_reference_number, group in duplicates.groupby('ic_reference_number'):
    lacks_message_id = group['ic_ms_messageid'].isnull()
    # Mixed group: at least one row lacks the id and at least one row has it.
    if lacks_message_id.any() and not lacks_message_id.all():
        import_control_df.loc[group.index[lacks_message_id.to_numpy()], 'ic_is_use'] = 'f'

# Step 5: Match 'ic_ms_messageid' of 'import_control_df' against
# 'res_message_id' of 'response_import_df'
# Select only the rows with a non-empty message id in each dataframe
# NOTE(review): 'non_empty_doc' is not used anywhere in the visible cells.
non_empty_doc = import_control_df[import_control_df['ic_ms_messageid'].notnull()]
non_empty_xml = response_import_df[response_import_df['res_message_id'].notnull()]

# Create a set of message_ids from response_import_df
xml_message_ids = set(non_empty_xml['res_message_id'])

# Set 'ic_is_use' to 't' where the row's message id also appears in response_import_df
# NOTE(review): every row with a non-null 'ic_ms_messageid' was already marked
# in Step 2 and is not touched by Steps 3-4, so this assignment does not appear
# able to change any value - confirm what Step 5 is meant to achieve.
import_control_df.loc[import_control_df['ic_ms_messageid'].isin(xml_message_ids), 'ic_is_use'] = 't'


In [None]:
# Spot check: show every row recorded for one specific reference number.
import_control_df.loc[import_control_df['ic_reference_number'].eq('DACP000018631')]


In [None]:
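# Disabled export steps, kept for reference.
# NOTE(review): `import_control_df` was loaded from the `import_control` table,
# so re-enabling the append below would insert its rows a second time - confirm
# before use.
# import_control_df.to_csv('import_control_df_3.csv', index=False)
# import_control_df.to_sql('import_control', engine, if_exists='append', index=False)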


In [None]:
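# Disabled helper that writes two columns back to a table, keyed on a third.
# NOTE(review) before re-enabling:
#   * `ic_is_use` already holds the strings 't'/'f' at this point, and both are
#     truthy, so `'t' if cancel_row[columns_1] else 'f'` always yields 't'.
#   * the dropna on both columns skips every row without a message id, so those
#     rows would never be updated.
# from sqlalchemy import text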

# def update_database(cancel_df, table_name, columns_1, columns_2, pk_col, engine):
#     def update_target(conn, cancel_row):
#         update_query = text(f"""
#             UPDATE {table_name}
#             SET {columns_1} = :columns_1,
#                 {columns_2} = :columns_2
#             WHERE {pk_col} = :pk_col
#         """)
#         # Convert boolean to 't' or 'f'
#         columns_1_value = 't' if cancel_row[columns_1] else 'f'
#         conn.execute(update_query, {
#             'columns_1': columns_1_value,
#             'columns_2': cancel_row[columns_2],
#             'pk_col': cancel_row[pk_col]
#         })

#     # Remove NaN values from columns_1 and columns_2
#     cancel_df = cancel_df.dropna(subset=[columns_1, columns_2])

#     with engine.connect() as conn:
#         trans = conn.begin()
#         try:
#             for _, cancel_row in cancel_df.iterrows():
#                 update_target(conn, cancel_row)
#             trans.commit()
#             print(f"Data successfully updated in {table_name}.")
#         except Exception as e:
#             trans.rollback()
#             print(f"An error occurred while updating {table_name}: {e}")

# update_database(import_control_df, 'import_control', 'ic_is_use', 'ic_ms_messageid', 'ic_filename', engine)