In [None]:
# main.py
import os
import pandas as pd
import psycopg2
import shutil
from Functions.utils import ConfigLoader, setup_logging
from Functions.ckan_manager import CKANManager
import json
import logging

def sanitize_column_name(name):
    """Return ``name`` as a lower-case identifier usable in PostgreSQL.

    Every character that is neither alphanumeric nor an underscore is
    replaced by ``_``; the result is then lower-cased.
    """
    def _keep_or_underscore(ch):
        # Underscores and alphanumerics pass through; all else becomes '_'.
        if ch == '_' or ch.isalnum():
            return ch
        return '_'

    return ''.join(map(_keep_or_underscore, name)).lower()

def infer_postgres_type(dtype):
    """Return the PostgreSQL column type used for a pandas dtype.

    The dtype is matched by its string form. Anything other than the
    names handled explicitly below is stored as ``TEXT``.
    """
    dtype_name = str(dtype)
    if dtype_name == 'int64':
        return 'BIGINT'
    if dtype_name == 'float64':
        return 'FLOAT'
    if dtype_name == 'datetime64[ns]':
        return 'TIMESTAMP'
    if dtype_name == 'bool':
        return 'BOOLEAN'
    # 'object' and every unrecognised dtype fall back to text.
    return 'TEXT'

def load_data_to_postgresql(conn_params, table_name, df):
    """Create ``table_name`` if it does not exist and bulk-load ``df`` into it.

    Column names are passed through ``sanitize_column_name`` and column types
    come from ``infer_postgres_type``. Rows are streamed with ``COPY`` in CSV
    format, so values containing delimiters, quotes, newlines or backslashes
    survive intact. Empty (unquoted) fields are loaded as NULL.

    Args:
        conn_params: Keyword arguments forwarded to ``psycopg2.connect``.
        table_name: Target table, treated as a single unqualified identifier.
        df: The pandas DataFrame to load.

    Returns:
        True when the data was committed; False on any failure (the error is
        logged and the transaction is rolled back).
    """
    # Function-scope imports keep this block self-contained.
    # psycopg2.sql requires psycopg2 >= 2.7.
    from io import StringIO
    from psycopg2 import sql

    logger = logging.getLogger(__name__)
    conn = None
    try:
        # str() lets non-string labels (e.g. integer headers) be sanitized.
        column_idents = [
            sql.Identifier(sanitize_column_name(str(col))) for col in df.columns
        ]
        # Quoted identifiers keep reserved words ("order", "user") and names
        # starting with a digit valid. Type names come from a fixed mapping.
        column_defs = [
            sql.SQL("{} {}").format(ident, sql.SQL(infer_postgres_type(dtype)))
            for ident, dtype in zip(column_idents, df.dtypes)
        ]
        table_ident = sql.Identifier(table_name)
        create_table_sql = sql.SQL("CREATE TABLE IF NOT EXISTS {} ({})").format(
            table_ident, sql.SQL(", ").join(column_defs)
        )
        # An explicit column list makes the load independent of the column
        # order of a table that already existed.
        copy_sql = sql.SQL("COPY {} ({}) FROM STDIN WITH (FORMAT csv)").format(
            table_ident, sql.SQL(", ").join(column_idents)
        )

        # to_csv emits CSV-quoted fields, so the server must parse it as CSV
        # rather than as PostgreSQL's backslash-escaped text format.
        buffer = StringIO()
        df.to_csv(buffer, header=False, index=False)
        buffer.seek(0)

        conn = psycopg2.connect(**conn_params)
        # `with conn` commits on success and rolls back on error, but it does
        # not close the connection; that happens in `finally` below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(create_table_sql)
                if not df.empty:
                    cur.copy_expert(copy_sql.as_string(conn), buffer)
        return True
    except Exception as e:
        logger.error("Failed to load data to PostgreSQL: %s", e)
        return False
    finally:
        if conn is not None:
            conn.close()


def process_file(filename, config, metadata_template=None):
    """Load one pending file into PostgreSQL, publish it to CKAN and archive it.

    Steps, in order: read the CSV/XLSX file, load it into a
    ``datastore_<sanitized stem>`` table, create or update the CKAN dataset
    and resource, then move the data file (and its metadata file, if one is
    found) into the completed directories.

    Args:
        filename: Name of a file inside ``config.pending_file_dir``.
        config: Object exposing the directory, database and CKAN settings
            read below.
        metadata_template: Optional dict with ``"dataset"`` and ``"resource"``
            payload sections. It is never modified.

    Returns:
        A ``(success, notes)`` tuple; ``notes`` is a short status message.
    """
    logger = logging.getLogger(__name__)
    file_path = os.path.join(config.pending_file_dir, filename)
    completed_file_path = os.path.join(config.completed_file_dir, filename)
    stem = os.path.splitext(filename)[0]

    try:
        # Read the file
        if filename.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif filename.endswith('.xlsx'):
            df = pd.read_excel(file_path)
        else:
            logger.error(f"Unsupported file type: {filename}")
            return False, "Unsupported file type"

        # Load data into PostgreSQL under a sanitized table name
        table_name = f"datastore_{sanitize_column_name(stem)}"
        if not load_data_to_postgresql(config.db_params, table_name, df):
            return False, "Failed to load data to PostgreSQL"

        # Initialize CKAN manager
        ckan_manager = CKANManager(config.ckan_api_url, config.ckan_api_key)

        # Work on copies of the template sections so the caller's dict is not
        # mutated; `or {}` also tolerates a section that is present but null.
        template = metadata_template or {}

        dataset_payload = dict(template.get("dataset") or {})
        if not dataset_payload.get("name"):
            dataset_payload["name"] = sanitize_column_name(stem)
        if not dataset_payload.get("title"):
            dataset_payload["title"] = stem

        # Create or update dataset
        dataset_id = ckan_manager.create_or_update_dataset(dataset_payload)
        if not dataset_id:
            return False, "Failed to create/update dataset"

        resource_payload = dict(template.get("resource") or {})
        resource_payload["name"] = filename

        # Create or update resource
        if not ckan_manager.create_or_update_resource(dataset_id, resource_payload, file_path):
            return False, "Failed to create/update resource"

        # Move processed file to completed directory
        shutil.move(file_path, completed_file_path)

        # `is not None` (rather than truthiness) so that an empty template
        # loaded from an existing metadata file still gets that file archived.
        if metadata_template is not None:
            # NOTE(review): prefix matching also picks up metadata belonging
            # to a file whose stem merely starts with this stem (e.g. `data`
            # vs `data2`) - confirm the metadata naming convention.
            prefix = f"metadata_{stem}"
            metadata_matches = [
                f for f in os.listdir(config.pending_metadata_dir)
                if f.startswith(prefix)
            ]
            if metadata_matches:
                latest_metadata = max(metadata_matches)
                metadata_path = os.path.join(config.pending_metadata_dir, latest_metadata)
                completed_metadata_path = os.path.join(config.completed_metadata_dir, latest_metadata)
                shutil.move(metadata_path, completed_metadata_path)
                logger.info(f"Moved metadata file {latest_metadata} to {completed_metadata_path}")
            else:
                logger.warning(f"No metadata file found for {filename}")
        else:
            logger.warning("No metadata template provided. Skipping metadata file movement.")

        logger.info(f"Successfully processed {filename}")
        return True, "Successfully processed"

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception(f"{error_msg} while processing {filename}")
        return False, error_msg


def run():
    """Process every pending CSV/XLSX file and write the final report.

    For each file, the newest matching metadata JSON (if any) is loaded and
    handed to ``process_file``; the outcome is recorded on the CKAN manager's
    report, which is generated once all files have been attempted.
    """
    logger = setup_logging()
    config = ConfigLoader()
    ckan_manager = CKANManager(config.ckan_api_url, config.ckan_api_key)

    logger.info("Starting file processing...")

    # Collect regular files with a supported extension.
    files = []
    for entry in os.listdir(config.pending_file_dir):
        if not os.path.isfile(os.path.join(config.pending_file_dir, entry)):
            continue
        if entry.endswith((".csv", ".xlsx")):
            files.append(entry)

    logger.info(f"Found {len(files)} files to process")

    for filename in files:
        try:
            # Metadata files are matched by the "metadata_<stem>" prefix.
            prefix = f"metadata_{os.path.splitext(filename)[0]}"
            candidates = [
                name for name in os.listdir(config.pending_metadata_dir)
                if name.startswith(prefix)
            ]

            metadata_template = None
            if candidates:
                # The lexicographically greatest name is taken as the latest.
                newest = os.path.join(config.pending_metadata_dir, max(candidates))
                with open(newest, "r", encoding="utf-8") as handle:
                    metadata_template = json.load(handle)

            logger.info(f"Processing file: {filename}")
            outcome = process_file(filename, config, metadata_template)
            success, notes = outcome
            ckan_manager.add_to_report(filename, success, notes)

        except Exception as exc:
            error_msg = f"An unexpected error occurred: {exc}"
            logger.error(f"{error_msg} while processing {filename}")
            ckan_manager.add_to_report(filename, False, error_msg)

    # Generate final report
    ckan_manager.generate_report(config.completed_report_dir)
    
# Start the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()