In [26]:
import os
import pandas as pd
import psycopg2
import requests
import shutil
import logging
from datetime import datetime
import configparser
import json

class CKANDatastoreLoader:
    def __init__(self, config_path='config.ini'):
        # Load configuration
        self.config = configparser.ConfigParser()
        self.config.read(config_path)
        
        # Setup logging with UTF-8 encoding to handle special characters
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s: %(message)s',
            handlers=[
                logging.FileHandler('datastore_loader.log', encoding='utf-8'),
                logging.StreamHandler()  # Also print to console
            ]
        )
        self.logger = logging.getLogger(__name__)
        
        # Database connection parameters
        self.db_params = {
            'dbname': self.config.get('Database', 'dbname'),
            'user': self.config.get('Database', 'user'),
            'password': self.config.get('Database', 'password'),
            'host': self.config.get('Database', 'host')
        }
        
        # CKAN API parameters
        self.ckan_api_url = self.config.get('CKAN', 'api_url')
        self.ckan_api_key = self.config.get('CKAN', 'api_key')
        
        # Paths
        self.pending_dir = self.config.get('Paths', 'pending_dir')
        self.completed_dir = self.config.get('Paths', 'completed_dir')
        
        # Create directories if they don't exist
        os.makedirs(self.pending_dir, exist_ok=True)
        os.makedirs(self.completed_dir, exist_ok=True)

    def _sanitize_column_name(self, name):
        """Sanitize column names for PostgreSQL and CKAN"""
        return ''.join(c if c.isalnum() or c == '_' else '_' for c in name).lower()

    def _infer_postgres_type(self, dtype):
        """Convert pandas dtype to PostgreSQL type"""
        type_mapping = {
            'int64': 'BIGINT',
            'float64': 'FLOAT',
            'object': 'TEXT',
            'datetime64[ns]': 'TIMESTAMP',
            'bool': 'BOOLEAN'
        }
        return type_mapping.get(str(dtype), 'TEXT')

    def push_to_datastore(self, resource_id, df):
        """Push data to CKAN datastore"""
        try:
            # Prepare the data for datastore
            records = df.to_dict('records')
            fields = [
                {'id': self._sanitize_column_name(col), 'type': str(df[col].dtype)} 
                for col in df.columns
            ]

            datastore_payload = {
                'resource_id': resource_id,
                'fields': fields,
                'records': records,
                'primary_key': fields[0]['id'] if fields else None
            }

            datastore_response = requests.post(
                f"{self.ckan_api_url}/datastore_create",
                json=datastore_payload,
                headers={
                    'Authorization': self.ckan_api_key,
                    'Content-Type': 'application/json'
                }
            )
            
            if datastore_response.status_code == 200:
                self.logger.info(f"Successfully pushed data to datastore for resource {resource_id}")
                return True
            else:
                self.logger.error(f"Failed to push to datastore: {datastore_response.text}")
                return False
        except Exception as e:
            self.logger.error(f"Error pushing to datastore: {str(e)}")
            return False

    def process_file(self, filename):
        """
        Process a single file: load to database, register in CKAN, move file
        """
        try:
            # Full file path
            file_path = os.path.join(self.pending_dir, filename)
            
            # Read file (supports multiple formats)
            if filename.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif filename.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                self.logger.error(f"Unsupported file type: {filename}")
                return False

            # Generate table name (use filename without extension)
            table_name = f"datastore_{self._sanitize_column_name(os.path.splitext(filename)[0])}"
            
            # Database operations
            with psycopg2.connect(**self.db_params) as conn:
                with conn.cursor() as cur:
                    # Create table with inferred schema
                    columns = [
                        f"{self._sanitize_column_name(col)} {self._infer_postgres_type(df[col].dtype)}"
                        for col in df.columns
                    ]
                    create_table_sql = f"""
                    CREATE TABLE IF NOT EXISTS {table_name} (
                        {', '.join(columns)}
                    )
                    """
                    cur.execute(create_table_sql)
                    
                    # Prepare and execute bulk insert
                    from io import StringIO
                    output = StringIO()
                    df.to_csv(output, sep='\t', header=False, index=False)
                    output.seek(0)
                    cur.copy_from(output, table_name, null="")
                    
                    conn.commit()
                    self.logger.info(f"Successfully loaded {filename} to {table_name}")

            # CKAN API Registration
            dataset_metadata = {
                'name': table_name,
                'title': f"Dataset from {filename}",
                'notes': f"Automatically imported on {datetime.now().isoformat()}",
                'owner_org': self.config.get('CKAN', 'owner_org')
            }

            # Create dataset
            try:
                create_dataset_response = requests.post(
                    f"{self.ckan_api_url}/package_create", 
                    json=dataset_metadata,
                    headers={
                        'Authorization': self.ckan_api_key,
                        'Content-Type': 'application/json'
                    }
                )
                
                # Add explicit error checking
                create_dataset_response.raise_for_status()
                dataset_result = create_dataset_response.json()
                
                # Explicit check for dataset ID
                if 'result' not in dataset_result or 'id' not in dataset_result['result']:
                    self.logger.error(f"No dataset ID returned: {json.dumps(dataset_result)}")
                    return False
                
                dataset_id = dataset_result['result']['id']

                # Create resource with file upload
                with open(file_path, 'rb') as f:
                    resource_metadata = {
                        'package_id': dataset_id,
                        'name': filename,
                        'description': f'Automatically imported from {filename}',
                        'resource_type': 'file.upload',
                        'url_type': 'upload',
                        'format': os.path.splitext(filename)[1][1:].upper(),
                        'mimetype': 'text/csv'
                    }
                    
                    create_resource_response = requests.post(
                        f"{self.ckan_api_url}/resource_create", 
                        data=resource_metadata,
                        files={'upload': (filename, f)},
                        headers={'Authorization': self.ckan_api_key}
                    )
                
                # Add explicit error checking for resource creation
                create_resource_response.raise_for_status()
                resource_result = create_resource_response.json()
                
                # Get the resource ID and push to datastore
                resource_id = resource_result['result']['id']
                self.push_to_datastore(resource_id, df)

                # Move file to completed directory
                completed_path = os.path.join(self.completed_dir, filename)
                shutil.move(file_path, completed_path)
                self.logger.info(f"Moved {filename} to {completed_path}")

                return True

            except requests.exceptions.RequestException as e:
                self.logger.error(f"API request failed: {e}")
                # Print full response content if available
                if hasattr(e, 'response'):
                    self.logger.error(f"Response content: {e.response.text}")
                return False
            except Exception as e:
                self.logger.error(f"Unexpected error during CKAN registration: {str(e)}")
                return False

        except Exception as e:
            self.logger.error(f"An error occurred while processing {filename}: {str(e)}")
            return False

    def run(self):
        """Main processing loop"""
        self.logger.info("Starting file processing...")
        files = [f for f in os.listdir(self.pending_dir) if os.path.isfile(os.path.join(self.pending_dir, f))]
        self.logger.info(f"Found {len(files)} files to process")
        
        for filename in files:
            try:
                self.logger.info(f"Processing file: {filename}")
                if self.process_file(filename):
                    self.logger.info(f"Successfully processed {filename}")
                else:
                    self.logger.error(f"Failed to process {filename}")
            except Exception as e:
                self.logger.error(f"An error occurred while processing {filename}: {str(e)}")

if __name__ == '__main__':
    loader = CKANDatastoreLoader()
    loader.run()

2025-01-31 13:49:12,901 - INFO: Starting file processing...
2025-01-31 13:49:12,902 - INFO: Found 1 files to process
2025-01-31 13:49:12,903 - INFO: Processing file: GDP-Test.csv
2025-01-31 13:49:13,227 - INFO: Successfully loaded GDP-Test.csv to datastore_gdp_test
2025-01-31 13:49:13,227 - DEBUG: Starting new HTTP connection (1): 35.177.24.156:5000
2025-01-31 13:49:14,542 - DEBUG: http://35.177.24.156:5000 "POST /api/3/action/package_create HTTP/11" 200 1208
2025-01-31 13:49:14,556 - DEBUG: Starting new HTTP connection (1): 35.177.24.156:5000
2025-01-31 13:49:15,060 - DEBUG: http://35.177.24.156:5000 "POST /api/3/action/resource_create HTTP/11" 200 690
2025-01-31 13:49:15,091 - DEBUG: Starting new HTTP connection (1): 35.177.24.156:5000
2025-01-31 13:49:15,308 - DEBUG: http://35.177.24.156:5000 "POST /api/3/action/datastore_create HTTP/11" 409 332
2025-01-31 13:49:15,310 - INFO: Moved GDP-Test.csv to ./Completed\GDP-Test.csv
2025-01-31 13:49:15,312 - INFO: Successfully processed GDP-T