In [33]:
import os
import pandas as pd
import psycopg2
import requests
import shutil
import logging
import json
from datetime import datetime
import configparser

class CKANDatastoreLoader:
    def __init__(self, config_path='config.ini'):
        # Load configuration
        self.config = configparser.ConfigParser()
        self.config.read(config_path)
        
        # Setup logging with UTF-8 encoding to handle special characters
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s: %(message)s',
            handlers=[
                logging.FileHandler('datastore_loader.log', encoding='utf-8'),
                logging.StreamHandler()  # Also print to console
            ]
        )
        self.logger = logging.getLogger(__name__)
        
        # Database connection parameters
        self.db_params = {
            'dbname': self.config.get('Database', 'dbname'),
            'user': self.config.get('Database', 'user'),
            'password': self.config.get('Database', 'password'),
            'host': self.config.get('Database', 'host')
        }
        
        # CKAN API parameters
        self.ckan_api_url = self.config.get('CKAN', 'api_url')
        self.ckan_api_key = self.config.get('CKAN', 'api_key')
        
        # Paths
        self.pending_dir = self.config.get('Paths', 'pending_dir')
        self.completed_dir = self.config.get('Paths', 'completed_dir')
        self.metadata_dir = os.path.join(self.pending_dir, 'metadata')
        
        # Create directories if they don't exist
        for path in [self.pending_dir, self.completed_dir, self.metadata_dir]:
            os.makedirs(path, exist_ok=True)

    def _sanitize_column_name(self, name):
        """Sanitize column names for PostgreSQL and CKAN"""
        return ''.join(c if c.isalnum() or c == '_' else '_' for c in name).lower()

    def _infer_postgres_type(self, dtype):
        """Convert pandas dtype to PostgreSQL type"""
        type_mapping = {
            'int64': 'BIGINT',
            'float64': 'FLOAT',
            'object': 'TEXT',
            'datetime64[ns]': 'TIMESTAMP',
            'bool': 'BOOLEAN'
        }
        return type_mapping.get(str(dtype), 'TEXT')

    def generate_metadata_template(self, filename):
        """
        Generate a comprehensive metadata template for a file with intelligent defaults
        """
        try:
            dataset_path = os.path.join(self.pending_dir, filename)
            
            # Determine file type and load data
            if filename.endswith('.csv'):
                df = pd.read_csv(dataset_path)
                file_format = 'csv'
            elif filename.endswith('.xlsx'):
                df = pd.read_excel(dataset_path)
                file_format = 'xlsx'
            else:
                self.logger.error(f"Unsupported file type: {filename}")
                return None

            # Generate a unique dataset name
            dataset_name = f"dataset_{os.path.splitext(filename)[0]}_{datetime.now().strftime('%Y%m%d')}"

            # Intelligent defaults for dataset and resource
            template = {
                "dataset": {
                    "name": dataset_name,  # Unique dataset identifier
                    "title": f"Dataset from {filename}",
                    "notes": f"Automatically generated dataset from {filename} on {datetime.now().isoformat()}",
                    "owner_org": self.config.get('CKAN', 'owner_org', fallback=''),
                    "tags": [file_format],
                    "extras": {
                        "source": "Automated Import",
                        "import_date": datetime.now().isoformat(),
                        "file_origin": filename
                    }
                },
                "resource": {
                    "package_id": "",  # Will be filled during upload if adding to existing dataset
                    "name": filename,
                    "description": f"Resource imported from {filename}",
                    "format": file_format.upper(),
                    "mimetype": "text/csv" if file_format == 'csv' else "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    "url_type": "upload",
                    "resource_type": "file.upload",
                    "schema": {
                        "fields": [
                            {
                                "id": self._sanitize_column_name(col),
                                "name": col,  # Original column name
                                "type": str(df[col].dtype),
                                "description": "[Enter column description]",
                                "sample_values": df[col].sample(min(3, len(df))).tolist()
                            } for col in df.columns
                        ]
                    }
                }
            }

            # Save template
            template_filename = f"metadata_{os.path.splitext(filename)[0]}_{datetime.now().strftime('%Y%m%d')}.json"
            template_path = os.path.join(self.metadata_dir, template_filename)
            
            with open(template_path, 'w', encoding='utf-8') as json_file:
                json.dump(template, json_file, indent=4)
            
            self.logger.info(f"Metadata template generated: {template_filename}")
            return template_path

        except Exception as e:
            self.logger.error(f"Error generating metadata template for {filename}: {e}")
            return None

    def load_metadata_template(self, template_path):
        """
        Load and validate a metadata template
        """
        try:
            with open(template_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            self.logger.error(f"Error loading metadata template: {e}")
            return None

    def push_to_datastore(self, resource_id, df):
        """Push the rows of *df* to the CKAN DataStore for *resource_id*.

        Field ids are the sanitized column names and the records are keyed by
        those same ids.  Field types are PostgreSQL type names rather than
        pandas dtype names, which the DataStore does not know.

        Args:
            resource_id: id of the CKAN resource that owns the table.
            df: the DataFrame to upload.

        Returns:
            ``True`` when CKAN answered with HTTP 200, ``False`` otherwise
            (the failure is logged).
        """
        try:
            field_ids = [self._sanitize_column_name(str(col)) for col in df.columns]
            if len(set(field_ids)) != len(field_ids):
                self.logger.error(
                    "Failed to push to datastore: column names are not unique after sanitizing"
                )
                return False

            fields = [
                {'id': field_id, 'type': self._infer_postgres_type(dtype).lower()}
                for field_id, dtype in zip(field_ids, df.dtypes)
            ]

            # Round-trip through pandas' JSON encoder: NaN/NaT become null,
            # timestamps become ISO strings and numpy scalars become plain
            # Python values, so the payload is always valid JSON.
            renamed = df.copy()
            renamed.columns = field_ids
            records = json.loads(
                renamed.to_json(orient='records', date_format='iso', double_precision=15)
            )

            datastore_payload = {
                'resource_id': resource_id,
                'fields': fields,
                'records': records,
            }

            # Only declare the first column as primary key when its values
            # can actually serve as one; duplicates or nulls would make the
            # whole request fail.
            if fields:
                first_column = df.iloc[:, 0]
                if first_column.notna().all() and first_column.is_unique:
                    datastore_payload['primary_key'] = fields[0]['id']

            datastore_response = requests.post(
                f"{self.ckan_api_url}/datastore_create",
                json=datastore_payload,
                headers={
                    'Authorization': self.ckan_api_key,
                    'Content-Type': 'application/json'
                },
                timeout=300  # never hang forever on an unresponsive server
            )

            if datastore_response.status_code == 200:
                self.logger.info(f"Successfully pushed data to datastore for resource {resource_id}")
                return True

            self.logger.error(f"Failed to push to datastore: {datastore_response.text}")
            return False
        except Exception as e:
            self.logger.error(f"Error pushing to datastore: {str(e)}")
            return False

    def process_file(self, filename, metadata_template=None):
        """
        Process a single file: load to database, register in CKAN, move file
        Supports optional metadata template
        """
        try:
            # Full file path
            file_path = os.path.join(self.pending_dir, filename)
            
            # Read file (supports multiple formats)
            if filename.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif filename.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                self.logger.error(f"Unsupported file type: {filename}")
                return False

            # If no metadata template provided, generate one
            if metadata_template is None:
                metadata_template_path = self.generate_metadata_template(filename)
                if metadata_template_path:
                    metadata_template = self.load_metadata_template(metadata_template_path)
                else:
                    self.logger.error(f"Could not generate metadata for {filename}")
                    return False

            # Generate table name (use filename without extension)
            table_name = f"datastore_{self._sanitize_column_name(os.path.splitext(filename)[0])}"
            
            # Database operations
            with psycopg2.connect(**self.db_params) as conn:
                with conn.cursor() as cur:
                    # Create table with inferred schema
                    columns = [
                        f"{self._sanitize_column_name(col)} {self._infer_postgres_type(df[col].dtype)}"
                        for col in df.columns
                    ]
                    create_table_sql = f"""
                    CREATE TABLE IF NOT EXISTS {table_name} (
                        {', '.join(columns)}
                    )
                    """
                    cur.execute(create_table_sql)
                    
                    # Prepare and execute bulk insert
                    from io import StringIO
                    output = StringIO()
                    df.to_csv(output, sep='\t', header=False, index=False)
                    output.seek(0)
                    cur.copy_from(output, table_name, null="")
                    
                    conn.commit()
                    self.logger.info(f"Successfully loaded {filename} to {table_name}")

            # CKAN Dataset and Resource Registration
            dataset_metadata = metadata_template.get('dataset', {})
            resource_metadata = metadata_template.get('resource', {})

            try:
                # Create or update dataset
                dataset_payload = {
                    'name': dataset_metadata.get('name'),
                    'title': dataset_metadata.get('title'),
                    'notes': dataset_metadata.get('notes'),
                    'owner_org': dataset_metadata.get('owner_org'),
                    'extras': dataset_metadata.get('extras', []),
                    'tags': [{'name': tag} for tag in dataset_metadata.get('tags', [])]
                }

                # Check if dataset already exists
                existing_dataset_response = requests.get(
                    f"{self.ckan_api_url}/package_show",
                    params={'id': dataset_metadata.get('name')},
                    headers={'Authorization': self.ckan_api_key}
                )

                if existing_dataset_response.status_code == 200:
                    # Update existing dataset
                    dataset_payload['id'] = existing_dataset_response.json()['result']['id']
                    create_dataset_response = requests.post(
                        f"{self.ckan_api_url}/package_update", 
                        json=dataset_payload,
                        headers={
                            'Authorization': self.ckan_api_key,
                            'Content-Type': 'application/json'
                        }
                    )
                else:
                    # Create new dataset
                    create_dataset_response = requests.post(
                        f"{self.ckan_api_url}/package_create", 
                        json=dataset_payload,
                        headers={
                            'Authorization': self.ckan_api_key,
                            'Content-Type': 'application/json'
                        }
                    )

                create_dataset_response.raise_for_status()
                dataset_result = create_dataset_response.json()
                dataset_id = dataset_result['result']['id']

                # Create resource with file upload
                with open(file_path, 'rb') as f:
                    resource_payload = {
                        'package_id': dataset_id,
                        'name': resource_metadata.get('name', filename),
                        'description': resource_metadata.get('description', f'Imported from {filename}'),
                        'resource_type': resource_metadata.get('resource_type', 'file.upload'),
                        'url_type': 'upload',
                        'format': resource_metadata.get('format', os.path.splitext(filename)[1][1:].upper()),
                        'mimetype': resource_metadata.get('mimetype', 'text/csv')
                    }
                    
                    create_resource_response = requests.post(
                        f"{self.ckan_api_url}/resource_create", 
                        data=resource_payload,
                        files={'upload': (filename, f)},
                        headers={'Authorization': self.ckan_api_key}
                    )
                
                create_resource_response.raise_for_status()
                resource_result = create_resource_response.json()
                resource_id = resource_result['result']['id']

                # Push data to datastore
                self.push_to_datastore(resource_id, df)

                # Move file to completed directory
                completed_path = os.path.join(self.completed_dir, filename)
                shutil.move(file_path, completed_path)
                self.logger.info(f"Moved {filename} to {completed_path}")

                return True

            except requests.exceptions.RequestException as e:
                self.logger.error(f"API request failed: {e}")
                if hasattr(e, 'response'):
                    self.logger.error(f"Response content: {e.response.text}")
                return False
            except Exception as e:
                self.logger.error(f"Unexpected error during CKAN registration: {str(e)}")
                return False

        except Exception as e:
            self.logger.error(f"An error occurred while processing {filename}: {str(e)}")
            return False

    def run(self, use_metadata_templates=True):
        """
        Main processing loop with optional metadata template support
        """
        self.logger.info("Starting file processing...")
        files = [f for f in os.listdir(self.pending_dir) 
                 if os.path.isfile(os.path.join(self.pending_dir, f)) 
                 and f.endswith(('.csv', '.xlsx'))]
        
        self.logger.info(f"Found {len(files)} files to process")
        
        for filename in files:
            try:
                # Look for corresponding metadata template
                metadata_template = None
                if use_metadata_templates:
                    metadata_filename = f"metadata_{os.path.splitext(filename)[0]}_*.json"
                    metadata_matches = [
                        f for f in os.listdir(self.metadata_dir) 
                        if f.startswith(f"metadata_{os.path.splitext(filename)[0]}")
                    ]
                    
                    if metadata_matches:
                        latest_metadata = max(metadata_matches)
                        metadata_path = os.path.join(self.metadata_dir, latest_metadata)
                        metadata_template = self.load_metadata_template(metadata_path)

                self.logger.info(f"Processing file: {filename}")
                if self.process_file(filename, metadata_template):
                    self.logger.info(f"Successfully processed {filename}")
                else:
                    self.logger.error(f"Failed to process {filename}")
            
            except Exception as e:
                self.logger.error(f"An unexpected error occurred while processing {filename}: {str(e)}")

if __name__ == '__main__':
    # Script entry point: build a loader from the default config file and
    # process every pending file, using metadata templates where present.
    # (Command-line argument parsing could be added here.)
    CKANDatastoreLoader().run(use_metadata_templates=True)

2025-01-31 14:09:48,178 - INFO: Starting file processing...
2025-01-31 14:09:48,179 - INFO: Found 1 files to process
2025-01-31 14:09:48,179 - INFO: Processing file: GDP-Test zz4.csv
2025-01-31 14:09:48,512 - INFO: Successfully loaded GDP-Test zz4.csv to datastore_gdp_test_zz4
2025-01-31 14:09:48,512 - DEBUG: Starting new HTTP connection (1): 35.177.24.156:5000
2025-01-31 14:09:49,290 - DEBUG: http://35.177.24.156:5000 "POST /api/3/action/package_create HTTP/11" 200 1216
2025-01-31 14:09:49,307 - DEBUG: Starting new HTTP connection (1): 35.177.24.156:5000
2025-01-31 14:09:50,010 - DEBUG: http://35.177.24.156:5000 "POST /api/3/action/resource_create HTTP/11" 200 848
2025-01-31 14:09:50,045 - DEBUG: Starting new HTTP connection (1): 35.177.24.156:5000
2025-01-31 14:09:50,262 - DEBUG: http://35.177.24.156:5000 "POST /api/3/action/datastore_create HTTP/11" 409 332
2025-01-31 14:09:50,268 - INFO: Moved GDP-Test zz4.csv to ./Completed\GDP-Test zz4.csv
2025-01-31 14:09:50,268 - INFO: Successf