In [4]:
import os
import pandas as pd
import psycopg2
import requests
import shutil
import logging
import json
from datetime import datetime
import configparser

class CKANDatastoreLoader:
    """Load pending CSV/XLSX files into PostgreSQL and register them with CKAN.

    Settings come from an INI file with three sections:

    * ``Database``: ``dbname``, ``user``, ``password``, ``host``
    * ``CKAN``: ``api_url``, ``api_key``
    * ``Paths``: ``pending_dir``, ``completed_dir``

    Metadata templates are looked up in ``<pending_dir>/metadata``.
    """

    def __init__(self, config_path='config.ini'):
        """Read configuration, configure logging and create working directories.

        Args:
            config_path: Path of the INI file to read.

        Raises:
            configparser.Error: If a required section or option is missing
                (this is also what happens when ``config_path`` does not
                exist, because ``ConfigParser.read`` skips missing files).
        """
        self.config = configparser.ConfigParser()
        self.config.read(config_path)

        # Log both to a file and to the console. basicConfig only takes effect
        # the first time the root logger is configured in a process.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s: %(message)s',
            handlers=[
                logging.FileHandler('datastore_loader.log', encoding='utf-8'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        self.db_params = {
            'dbname': self.config.get('Database', 'dbname'),
            'user': self.config.get('Database', 'user'),
            'password': self.config.get('Database', 'password'),
            'host': self.config.get('Database', 'host')
        }

        self.ckan_api_url = self.config.get('CKAN', 'api_url')
        self.ckan_api_key = self.config.get('CKAN', 'api_key')

        self.pending_dir = self.config.get('Paths', 'pending_dir')
        self.completed_dir = self.config.get('Paths', 'completed_dir')
        self.metadata_dir = os.path.join(self.pending_dir, 'metadata')

        for path in [self.pending_dir, self.completed_dir, self.metadata_dir]:
            os.makedirs(path, exist_ok=True)

    def _sanitize_column_name(self, name):
        """Return ``name`` lower-cased with every non-alphanumeric char as ``_``.

        ``name`` is passed through ``str()`` first so that non-string column
        labels (for example integer headers) are handled instead of raising
        ``TypeError``.
        """
        return ''.join(
            c if c.isalnum() or c == '_' else '_' for c in str(name)
        ).lower()

    def _infer_postgres_type(self, dtype):
        """Map a pandas/numpy dtype (or its string name) to a PostgreSQL type.

        Unknown dtypes fall back to ``TEXT``.
        """
        type_mapping = {
            'int64': 'BIGINT',
            'float64': 'FLOAT',
            'object': 'TEXT',
            'datetime64[ns]': 'TIMESTAMP',
            'bool': 'BOOLEAN'
        }
        dtype_name = str(dtype)
        if dtype_name in type_mapping:
            return type_mapping[dtype_name]
        # Naive datetimes at other resolutions (datetime64[s], [ms], [us]) are
        # timestamps too; timezone-aware dtypes contain a comma and keep the
        # TEXT fallback.
        if dtype_name.startswith('datetime64[') and ',' not in dtype_name:
            return 'TIMESTAMP'
        return 'TEXT'

    @staticmethod
    def _normalize_tags(tags):
        """Return ``tags`` as a list of CKAN tag dicts (``{"name": ...}``).

        Entries that are already dicts are kept as they are; an empty or
        missing tag list yields ``[]`` instead of raising ``IndexError``.
        """
        return [
            tag if isinstance(tag, dict) else {"name": tag}
            for tag in (tags or [])
        ]

    def _load_dataframe(self, df, table_name):
        """Create ``table_name`` if needed and bulk-append ``df`` to it.

        Raises:
            psycopg2.Error: On any database failure; the transaction is
                rolled back in that case.
        """
        from io import StringIO
        from psycopg2 import sql

        # Quote every identifier so reserved words and digit-leading names
        # are valid column names.
        table_ident = sql.Identifier(table_name)
        column_defs = [
            sql.SQL("{} {}").format(
                sql.Identifier(self._sanitize_column_name(col)),
                sql.SQL(self._infer_postgres_type(dtype)),
            )
            for col, dtype in df.dtypes.items()
        ]
        create_table_sql = sql.SQL("CREATE TABLE IF NOT EXISTS {} ({})").format(
            table_ident, sql.SQL(", ").join(column_defs)
        )
        copy_sql = sql.SQL("COPY {} FROM STDIN WITH (FORMAT csv)").format(
            table_ident
        )

        # pandas applies CSV quoting to awkward values (delimiters, quotes,
        # newlines), so the server must parse the stream as CSV as well. In
        # CSV mode an unquoted empty field is read as NULL, which is how
        # to_csv writes missing values.
        buffer = StringIO()
        df.to_csv(buffer, header=False, index=False)
        buffer.seek(0)

        conn = psycopg2.connect(**self.db_params)
        try:
            # "with conn" commits or rolls back the transaction but does not
            # close the connection, hence the explicit close() below.
            with conn, conn.cursor() as cur:
                cur.execute(create_table_sql)
                cur.copy_expert(copy_sql.as_string(cur), buffer)
        finally:
            conn.close()

    def _publish_to_ckan(self, filename, file_path, metadata_template):
        """Create/update the CKAN dataset, upload the file and archive it.

        Args:
            filename: Name of the file inside ``pending_dir``.
            file_path: Full path of that file.
            metadata_template: Dict with optional ``"dataset"`` and
                ``"resource"`` entries, or ``None``.

        Returns:
            bool: True when the resource was created and the file was moved
            to ``completed_dir``; False when the metadata is unusable.

        Raises:
            requests.exceptions.RequestException: On HTTP/network failures.
        """
        if not metadata_template:
            self.logger.error(
                f"No metadata template available for {filename}; "
                "cannot register it with CKAN"
            )
            return False

        # Work on copies so the caller's template is not modified.
        dataset_payload = dict(metadata_template.get("dataset") or {})
        resource_payload = dict(metadata_template.get("resource") or {})

        if "tags" in dataset_payload:
            dataset_payload["tags"] = self._normalize_tags(dataset_payload["tags"])

        dataset_name = dataset_payload.get("name")
        if not dataset_name:
            self.logger.error(
                f"Metadata for {filename} has no dataset name; "
                "cannot register it with CKAN"
            )
            return False

        auth_headers = {"Authorization": self.ckan_api_key}
        json_headers = {
            "Authorization": self.ckan_api_key,
            "Content-Type": "application/json"
        }

        existing_dataset_response = requests.get(
            f"{self.ckan_api_url}/package_show",
            params={"id": dataset_name},
            headers=auth_headers
        )

        # Any status other than 200 is treated as "dataset does not exist yet".
        if existing_dataset_response.status_code == 200:
            dataset_payload["id"] = existing_dataset_response.json()["result"]["id"]
            dataset_action = "package_update"
        else:
            dataset_action = "package_create"

        create_dataset_response = requests.post(
            f"{self.ckan_api_url}/{dataset_action}",
            json=dataset_payload,
            headers=json_headers
        )
        create_dataset_response.raise_for_status()
        dataset_id = create_dataset_response.json()["result"]["id"]

        # NOTE(review): resource_payload may still carry the nested "schema"
        # dict, which is unlikely to survive form encoding intact - confirm
        # whether it should be stripped or JSON-encoded before the upload.
        resource_payload["package_id"] = dataset_id
        with open(file_path, "rb") as upload:
            create_resource_response = requests.post(
                f"{self.ckan_api_url}/resource_create",
                data=resource_payload,
                files={"upload": (filename, upload)},
                headers=auth_headers
            )
        # The file handle is closed here, so the move below is safe.
        create_resource_response.raise_for_status()
        resource_id = create_resource_response.json()["result"]["id"]

        completed_path = os.path.join(self.completed_dir, filename)
        shutil.move(file_path, completed_path)
        self.logger.info(f"Moved {filename} to {completed_path}")

        # Push field descriptions to the datastore when the template has any.
        schema_fields = (resource_payload.get("schema") or {}).get("fields") or []
        if schema_fields:
            schema_update_payload = {
                "resource_id": resource_id,
                "fields": schema_fields,
                "force": True  # Override read-only restriction
            }
            schema_update_response = requests.post(
                f"{self.ckan_api_url}/datastore_create",
                json=schema_update_payload,
                headers=auth_headers
            )
            # A rejected schema update is logged but does not fail the file.
            if schema_update_response.status_code == 200:
                self.logger.info(f"Updated datastore schema for {filename} with field descriptions.")
            else:
                self.logger.error(f"Failed to update schema: {schema_update_response.text}")

        return True

    def process_file(self, filename, metadata_template=None):
        """Load one pending file into PostgreSQL, then register it with CKAN.

        The rows are appended to ``datastore_<sanitized file stem>``. If the
        CKAN step fails afterwards the rows stay in the database and the file
        stays in ``pending_dir``, so a later retry appends the rows again.

        Args:
            filename: Name of a ``.csv`` or ``.xlsx`` file in ``pending_dir``.
            metadata_template: Dict with ``"dataset"`` / ``"resource"``
                entries used for the CKAN calls. Without it the data is
                still loaded but the file is reported as failed.

        Returns:
            bool: True on full success, False otherwise. Errors are logged,
            never raised.
        """
        try:
            file_path = os.path.join(self.pending_dir, filename)

            if filename.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif filename.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                self.logger.error(f"Unsupported file type: {filename}")
                return False

            table_name = f"datastore_{self._sanitize_column_name(os.path.splitext(filename)[0])}"
            self._load_dataframe(df, table_name)
            self.logger.info(f"Successfully loaded {filename} to {table_name}")

            try:
                return self._publish_to_ckan(filename, file_path, metadata_template)
            except requests.exceptions.RequestException as e:
                self.logger.error(f"API request failed: {e}")
                # The attribute always exists but is None for errors raised
                # before any response arrived (e.g. connection failures).
                response = getattr(e, "response", None)
                if response is not None:
                    self.logger.error(f"Response content: {response.text}")
                return False
            except Exception as e:
                self.logger.error(f"Unexpected error during CKAN registration: {str(e)}")
                return False

        except Exception as e:
            self.logger.error(f"An error occurred while processing {filename}: {str(e)}")
            return False

    def run(self, use_metadata_templates=True):
        """Process every ``.csv`` / ``.xlsx`` file found in ``pending_dir``.

        For each file the metadata template is the lexicographically greatest
        file in ``metadata_dir`` whose name starts with
        ``metadata_<file stem>``; it is parsed as JSON.

        NOTE(review): the prefix match also accepts templates of files whose
        stem merely starts with this stem (``foo`` vs ``foo2``) - confirm the
        naming convention of the metadata files.

        Args:
            use_metadata_templates: When False no template is looked up and
                ``process_file`` is called with ``None``.
        """
        self.logger.info("Starting file processing...")
        files = [
            entry for entry in os.listdir(self.pending_dir)
            if os.path.isfile(os.path.join(self.pending_dir, entry))
            and entry.endswith((".csv", ".xlsx"))
        ]

        self.logger.info(f"Found {len(files)} files to process")

        for filename in files:
            try:
                metadata_template = None
                if use_metadata_templates:
                    prefix = f"metadata_{os.path.splitext(filename)[0]}"
                    metadata_matches = [
                        entry for entry in os.listdir(self.metadata_dir)
                        if entry.startswith(prefix)
                    ]

                    if metadata_matches:
                        latest_metadata = max(metadata_matches)
                        metadata_path = os.path.join(self.metadata_dir, latest_metadata)
                        with open(metadata_path, "r", encoding="utf-8") as handle:
                            metadata_template = json.load(handle)

                self.logger.info(f"Processing file: {filename}")
                if self.process_file(filename, metadata_template):
                    self.logger.info(f"Successfully processed {filename}")
                else:
                    self.logger.error(f"Failed to process {filename}")

            except Exception as e:
                # Keep going with the remaining files.
                self.logger.error(f"An unexpected error occurred while processing {filename}: {str(e)}")

def update_schema_dictionary(self, resource_id, schema_fields):
    """
    Send the described columns of a resource to CKAN as dictionary terms.

    Only fields with a non-empty "description" are sent; when there are none,
    no request is made. The outcome of the request is printed.

    self: any object exposing ``ckan_api_url`` and ``ckan_api_key``.
    resource_id: id of the CKAN resource the terms belong to.
    schema_fields: iterable of field dicts with "id" and, optionally,
        "description".

    NOTE(review): this function takes ``self`` but is defined at module level
    and nothing in the visible source calls it - it looks like it was meant
    to be a method of the loader class; confirm.
    NOTE(review): confirm that the target CKAN instance exposes a
    ``datastore_dictionary`` action.
    """
    described_terms = [
        {
            "resource_id": resource_id,
            "term": field["id"],  # Column name
            "definition": field["description"],  # Description
        }
        for field in schema_fields
        if "description" in field and field["description"]
    ]

    # Nothing described, nothing to send.
    if not described_terms:
        return

    response = requests.post(
        f"{self.ckan_api_url}/datastore_dictionary",
        json={"resource_id": resource_id, "terms": described_terms},
        headers={"Authorization": self.ckan_api_key},
    )

    if response.status_code != 200:
        print(f"❌ Failed to update dictionary: {response.text}")
    else:
        print("✅ Data dictionary updated successfully")


if __name__ == "__main__":
    # Script entry point: build a loader from the default config file
    # ('config.ini') and process every pending file, looking up a metadata
    # template for each one.
    loader = CKANDatastoreLoader()
    loader.run(use_metadata_templates=True)


2025-02-07 13:25:14,578 - INFO: Starting file processing...
2025-02-07 13:25:14,578 - INFO: Found 1 files to process
2025-02-07 13:25:14,578 - INFO: Processing file: CDP_-__Economic_Development2.csv
2025-02-07 13:25:14,779 - INFO: Successfully loaded CDP_-__Economic_Development2.csv to datastore_cdp____economic_development2
2025-02-07 13:25:14,783 - ERROR: Failed to process CDP_-__Economic_Development2.csv
