In [1]:
import os
import pandas as pd
import chardet
import logging
import json
from typing import Tuple, Optional, Dict
from Functions.utils import ConfigLoader, sanitize_name, setup_logging
from Functions.ckan_manager import CKANManager
from typing import List, Dict, Optional, Any  # Add this import
import csv
import psycopg2

class FileProcessor:
    """Load pending data files into PostgreSQL and publish them through CKAN.

    For each file found in ``config.pending_file_dir`` the processor reads the
    data into a DataFrame, loads and validates the matching metadata JSON,
    copies the rows into a PostgreSQL table, creates/updates the CKAN dataset
    and resource via ``self.ckan``, and finally moves the data and metadata
    files to the completed directories. The outcome of every file is recorded
    with ``self.ckan.add_to_report``.
    """

    # Encodings tried for CSV files, in order. 'latin1' can decode any byte
    # sequence, so it must stay last; otherwise later entries are unreachable.
    _CSV_ENCODINGS = ('utf-8', 'cp1252', 'latin1')

    # What may directly follow ``metadata_<base name>`` in a metadata file
    # name for it to count as belonging to that data file ('' = end of name).
    _METADATA_BOUNDARIES = ('', '_', '-', '.')

    def __init__(self):
        self.config = ConfigLoader()
        self.ckan = CKANManager(
            self.config.ckan_api_url,
            self.config.ckan_api_key
        )
        self.logger = setup_logging()

    def process_files(self) -> None:
        """Process every pending file, then generate the summary report.

        Each file's (success, message) result is added to the CKAN report.
        An exception escaping the per-file handling is logged with its
        traceback and reported as "Critical error".
        """
        for filename in self._get_pending_files():
            try:
                success, message = self._process_single_file(filename)
                self.ckan.add_to_report(filename, success, message)
            except Exception as e:
                # logger.exception keeps the traceback for post-mortem analysis.
                self.logger.exception(f"Critical failure processing {filename}: {str(e)}")
                self.ckan.add_to_report(filename, False, "Critical error")

        self.ckan.generate_report(self.config.completed_report_dir)

    def _process_single_file(self, filename: str) -> Tuple[bool, str]:
        """Run the full pipeline for one pending file.

        Args:
            filename: Name of a file inside ``config.pending_file_dir``.

        Returns:
            ``(True, "Success")`` when every step succeeded, otherwise
            ``(False, reason)``. Exceptions are caught, logged and returned
            as the reason; they are never propagated.

        Note:
            The steps are not transactional as a whole: rows already copied
            into PostgreSQL stay there if a later CKAN step fails.
        """
        file_path = os.path.join(self.config.pending_file_dir, filename)

        try:
            # Step 1: Load data
            df = self._load_data(file_path)

            # Step 2: Prepare metadata (validated before any external side effect)
            metadata = self._load_metadata(filename)
            self._validate_metadata(metadata)

            # Step 3: Database operations
            table_name = self._sanitize_table_name(filename)
            self._load_to_postgres(table_name, df)

            # Step 4: CKAN operations
            dataset_id = self.ckan.create_update_dataset(metadata['dataset'])
            if not dataset_id:
                return False, "Dataset creation failed"

            resource_success = self.ckan.manage_resource(
                dataset_id,
                file_path,
                metadata['resource']
            )
            if not resource_success:
                return False, "Resource creation failed"

            # Step 5: Finalize
            self._move_processed_files(filename)
            return True, "Success"

        except Exception as e:
            self.logger.error(f"Processing failed: {str(e)}")
            return False, str(e)

    def _detect_encoding(self, file_path: str) -> str:
        """Guess a file's text encoding with chardet.

        Only the first 10000 bytes are sampled. Falls back to 'utf-8' when
        chardet returns no encoding, and maps 'windows-1252' / 'iso-8859-1'
        guesses to 'cp1252'.

        Note: nothing in this class calls this helper; ``_load_data`` uses a
        fixed fallback list instead.
        """
        with open(file_path, 'rb') as f:
            raw = f.read(10000)  # Sample for detection
            result = chardet.detect(raw)

        encoding = result['encoding'] or 'utf-8'
        # Handle common misdetections
        if encoding.lower() in ['windows-1252', 'iso-8859-1']:
            return 'cp1252'
        return encoding

    def _load_data(self, file_path: str) -> pd.DataFrame:
        """Read a pending data file into an all-string DataFrame.

        ``.xlsx`` files are read with ``pandas.read_excel``; everything else
        is read as CSV, trying each encoding in ``_CSV_ENCODINGS`` until one
        decodes the file.

        Args:
            file_path: Path of the file to read.

        Returns:
            DataFrame whose columns all use the pandas 'string' dtype.

        Raises:
            ValueError: If the file cannot be read or parsed.
        """
        if file_path.lower().endswith('.xlsx'):
            return self._load_excel(file_path)

        for encoding in self._CSV_ENCODINGS:
            try:
                self.logger.info(f"Attempting to load {file_path} with encoding: {encoding}")
                df = pd.read_csv(
                    file_path,
                    encoding=encoding,
                    on_bad_lines='warn',  # Warn about and skip malformed lines
                    dtype='string'        # Avoid type inference altering values
                )
            except UnicodeDecodeError:
                self.logger.warning(f"Failed to load {file_path} with encoding: {encoding}")
                continue
            except Exception as e:
                self.logger.error(f"Unexpected error loading {file_path}: {str(e)}")
                raise ValueError(f"Failed to load file: {str(e)}") from e
            self.logger.info(f"Successfully loaded {file_path} with encoding: {encoding}")
            return df

        # If no encoding works, raise an error
        raise ValueError(
            f"Failed to load {file_path} with encodings: {', '.join(self._CSV_ENCODINGS)}"
        )

    def _load_excel(self, file_path: str) -> pd.DataFrame:
        """Read the first sheet of an Excel workbook as an all-string DataFrame.

        Raises:
            ValueError: If pandas cannot read the workbook (this includes a
                missing Excel engine in the environment).
        """
        try:
            self.logger.info(f"Attempting to load {file_path} as an Excel workbook")
            df = pd.read_excel(file_path, dtype='string')
        except Exception as e:
            self.logger.error(f"Unexpected error loading {file_path}: {str(e)}")
            raise ValueError(f"Failed to load file: {str(e)}") from e
        self.logger.info(f"Successfully loaded {file_path} as an Excel workbook")
        return df

    def _validate_metadata(self, metadata: Dict) -> None:
        """Check the metadata structure and fill in the default license.

        Requires a ``dataset`` mapping with non-empty ``owner_org`` and
        ``title`` and a ``resource`` entry. When ``license_id`` is missing or
        empty it is set to 'uk-ogl' in place.

        Raises:
            ValueError: If a section or a required field is missing.
        """
        if not isinstance(metadata, dict):
            raise ValueError("Metadata must be a JSON object")

        dataset = metadata.get('dataset')
        if not isinstance(dataset, dict):
            raise ValueError("Metadata is missing the 'dataset' section")
        # Checked here so the failure happens before the database is touched.
        if 'resource' not in metadata:
            raise ValueError("Metadata is missing the 'resource' section")

        required = ['owner_org', 'title']
        missing = [field for field in required if not dataset.get(field)]
        if missing:
            raise ValueError(f"Missing required fields: {missing}")

        # Auto-fill common issues
        if not dataset.get('license_id'):
            dataset['license_id'] = 'uk-ogl'

    def _sanitize_table_name(self, filename: str) -> str:
        """Return ``datastore_`` plus the sanitized file name without extension."""
        base_name = os.path.splitext(filename)[0]
        return f"datastore_{sanitize_name(base_name)}"

    def _load_to_postgres(self, table_name: str, df: pd.DataFrame) -> None:
        """Create the table if needed and append the DataFrame rows to it.

        Rows are streamed with ``COPY ... FROM STDIN`` in CSV format so that
        values containing delimiters, quotes, newlines or backslashes survive
        unchanged. Missing values (written as empty fields) become NULL.

        Args:
            table_name: Target table; interpolated into SQL unquoted, so it
                must already be a safe identifier.
            df: Data to load; column names are passed through
                ``sanitize_name``.

        Raises:
            ValueError: On any failure; the transaction is rolled back.
        """
        from io import StringIO

        try:
            column_names = [sanitize_name(col) for col in df.columns]
            column_defs = [
                f"{name} {self._infer_postgres_type(dtype)}"
                for name, dtype in zip(column_names, df.dtypes)
            ]
            create_sql = f"""
                    CREATE TABLE IF NOT EXISTS {table_name} (
                        {', '.join(column_defs)}
                    )
                    """
            # Explicit column list: do not rely on the column order of a
            # table that may already exist.
            copy_sql = (
                f"COPY {table_name} ({', '.join(column_names)}) "
                "FROM STDIN WITH (FORMAT csv)"
            )

            conn = psycopg2.connect(**self.config.db_params)
            try:
                # `with conn` commits or rolls back the transaction but does
                # NOT close the connection; that is done in `finally`.
                with conn:
                    with conn.cursor() as cur:
                        cur.execute(create_sql)
                        if not df.empty:
                            buffer = StringIO()
                            df.to_csv(buffer, header=False, index=False)
                            buffer.seek(0)
                            cur.copy_expert(copy_sql, buffer)
            finally:
                conn.close()
        except Exception as e:
            raise ValueError(f"PostgreSQL load failed: {str(e)}") from e

    def _infer_postgres_type(self, dtype) -> str:
        """Map a pandas dtype (or its string name) to a PostgreSQL type.

        Unknown dtypes map to 'text'.
        """
        type_mapping = {
            'int64': 'bigint',
            'Int64': 'bigint',
            'float64': 'double precision',
            'Float64': 'double precision',
            'object': 'text',
            'string': 'text',
            'datetime64[ns]': 'timestamp',
            'bool': 'boolean',
            'boolean': 'boolean'
        }
        return type_mapping.get(str(dtype), 'text')

    def _find_latest_metadata(self, filename: str) -> Optional[str]:
        """Return the name of the metadata file for *filename*, or None.

        Candidates are the files in ``config.pending_metadata_dir`` whose
        name starts with ``metadata_<base name>``. Names where that prefix is
        followed by the end of the name or by '_', '-' or '.' are preferred,
        so ``data.csv`` does not pick up ``metadata_data2.json``. If no such
        name exists, any prefix match is accepted (the previous behaviour).
        Among the remaining names the lexicographically greatest wins.
        """
        prefix = f"metadata_{os.path.splitext(filename)[0]}"
        candidates = [
            f for f in os.listdir(self.config.pending_metadata_dir)
            if f.startswith(prefix)
        ]
        bounded = [
            f for f in candidates
            if f[len(prefix):len(prefix) + 1] in self._METADATA_BOUNDARIES
        ]
        matches = bounded or candidates
        return max(matches) if matches else None

    def _load_metadata(self, filename: str) -> Dict:
        """Load the metadata JSON that belongs to *filename*.

        Raises:
            ValueError: If no metadata file matches or it cannot be parsed.
        """
        latest_metadata = self._find_latest_metadata(filename)
        if latest_metadata is None:
            raise ValueError("No matching metadata file found")

        metadata_path = os.path.join(self.config.pending_metadata_dir, latest_metadata)
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            raise ValueError(f"Metadata load failed: {str(e)}") from e

    def _move_processed_files(self, filename: str) -> None:
        """Move the data file and its metadata file to the completed directories.

        ``shutil.move`` is used instead of ``os.rename`` so the move also
        works across drives/filesystems and when a file of the same name
        already exists in the destination on Windows.

        Raises:
            ValueError: If a file cannot be moved.
        """
        import shutil

        try:
            # Move data file
            os.makedirs(self.config.completed_file_dir, exist_ok=True)
            src_file = os.path.join(self.config.pending_file_dir, filename)
            dest_file = os.path.join(self.config.completed_file_dir, filename)
            shutil.move(src_file, dest_file)

            # Move metadata file
            latest_metadata = self._find_latest_metadata(filename)
            if latest_metadata is not None:
                os.makedirs(self.config.completed_metadata_dir, exist_ok=True)
                src_meta = os.path.join(self.config.pending_metadata_dir, latest_metadata)
                dest_meta = os.path.join(self.config.completed_metadata_dir, latest_metadata)
                shutil.move(src_meta, dest_meta)
        except Exception as e:
            raise ValueError(f"File move failed: {str(e)}") from e

    def _get_pending_files(self) -> List[str]:
        """Return the sorted names of pending ``.csv`` / ``.xlsx`` files.

        The extension check is case-insensitive; directories are skipped.
        """
        pending_dir = self.config.pending_file_dir
        return sorted(
            f for f in os.listdir(pending_dir)
            if os.path.isfile(os.path.join(pending_dir, f))
            and f.lower().endswith((".csv", ".xlsx"))
        )

if __name__ == "__main__":
    # Script / notebook-cell entry point: build the processor (loads the
    # configuration, creates the CKAN client and the logger) and run one pass
    # over all pending files.
    processor = FileProcessor()
    processor.process_files()

2025-02-12 20:38:20,606 - ckan_loader - INFO - Attempting to load ./Pending\files\Bikeability Scotland - Schools delivering Level 1 and_or Level 2 - 2019_20.csv with encoding: utf-8
2025-02-12 20:38:20,606 - ckan_loader - INFO - Attempting to load ./Pending\files\Bikeability Scotland - Schools delivering Level 1 and_or Level 2 - 2019_20.csv with encoding: utf-8
2025-02-12 20:38:20,617 - ckan_loader - INFO - Successfully loaded ./Pending\files\Bikeability Scotland - Schools delivering Level 1 and_or Level 2 - 2019_20.csv with encoding: utf-8
2025-02-12 20:38:20,617 - ckan_loader - INFO - Successfully loaded ./Pending\files\Bikeability Scotland - Schools delivering Level 1 and_or Level 2 - 2019_20.csv with encoding: utf-8
2025-02-12 20:38:20,618 - ckan_loader - ERROR - Processing failed: PostgreSQL load failed: name 'psycopg2' is not defined
2025-02-12 20:38:20,618 - ckan_loader - ERROR - Processing failed: PostgreSQL load failed: name 'psycopg2' is not defined
2025-02-12 20:38:20,618 - 