In [1]:
import os
import pandas as pd
import chardet
import logging
import json
from typing import Tuple, Optional, Dict, List
from Functions.utils import ConfigLoader, sanitize_name, setup_logging, load_metadata
from Functions.ckan_manager import CKANManager
import csv
import shutil

class FileProcessor:
    def __init__(self):
        self.config = ConfigLoader()
        self.ckan = CKANManager(
            self.config.ckan_api_url,
            self.config.ckan_api_key
        )
        self.logger = setup_logging()

    def _detect_encoding(self, file_path: str) -> str:
        """Detect file encoding using chardet."""
        try:
            with open(file_path, 'rb') as file:
                raw_data = file.read(100000)  # Read first 100KB for detection
                result = chardet.detect(raw_data)
                encoding = result['encoding']
                confidence = result['confidence']
                
                self.logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2f}) for {file_path}")
                
                # If confidence is low, try common encodings
                if confidence < 0.7:
                    self.logger.warning(f"Low confidence in encoding detection. Trying fallback encodings.")
                    return self._try_fallback_encodings(file_path)
                
                return encoding
        except Exception as e:
            self.logger.error(f"Error detecting encoding for {file_path}: {str(e)}")
            return self._try_fallback_encodings(file_path)

    def _try_fallback_encodings(self, file_path: str) -> str:
        """Try common encodings when detection fails or has low confidence."""
        encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']
        
        for encoding in encodings_to_try:
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    file.read(1000)  # Try to read first 1000 characters
                self.logger.info(f"Successfully validated encoding: {encoding} for {file_path}")
                return encoding
            except UnicodeDecodeError:
                continue
        
        # If all else fails, use latin-1 (it can decode any byte sequence)
        self.logger.warning(f"Using latin-1 as fallback encoding for {file_path}")
        return 'latin-1'

    def _read_csv_with_encoding(self, file_path: str) -> pd.DataFrame:
        """Read CSV file with proper encoding detection and handling."""
        encoding = self._detect_encoding(file_path)
        
        try:
            # Try reading with detected encoding
            df = pd.read_csv(file_path, encoding=encoding)
            self.logger.info(f"Successfully read CSV with {encoding} encoding")
            return df
        except UnicodeDecodeError as e:
            self.logger.warning(f"Failed to read with {encoding}, trying fallback methods: {str(e)}")
            
            # Try with error handling
            for error_handling in ['ignore', 'replace']:
                try:
                    df = pd.read_csv(file_path, encoding=encoding, encoding_errors=error_handling)
                    self.logger.info(f"Successfully read CSV with {encoding} encoding and {error_handling} error handling")
                    return df
                except Exception as e:
                    continue
            
            # Last resort: try latin-1
            try:
                df = pd.read_csv(file_path, encoding='latin-1')
                self.logger.info("Successfully read CSV with latin-1 encoding as last resort")
                return df
            except Exception as e:
                raise Exception(f"Failed to read CSV with any encoding method: {str(e)}")

    def process_files(self) -> None:
        """Main processing loop with error containment."""
        files = self._get_pending_files()

        for filename in files:
            try:
                success, message = self._process_single_file(filename)
                self.ckan.add_to_report(filename, success, message)
            except Exception as e:
                self.logger.error(f"Critical failure processing {filename}: {str(e)}")
                self.ckan.add_to_report(filename, False, "Critical error")

        self.ckan.generate_report(self.config.completed_report_dir)

    def _process_single_file(self, filename: str) -> Tuple[bool, str]:
        """Process individual file with atomic operations."""
        file_path = os.path.join(self.config.pending_file_dir, filename)
        completed_file_path = os.path.join(self.config.completed_file_dir, filename)

        try:
            # Step 1: Load data with proper encoding handling
            if filename.endswith('.csv'):
                df = self._read_csv_with_encoding(file_path)
            elif filename.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                self.logger.error(f"Unsupported file type: {filename}")
                return False, "Unsupported file type"

            # Step 2: Load metadata
            metadata_template = load_metadata(self, filename)

            # Step 3: Prepare dataset payload
            dataset_payload = metadata_template.get("dataset", {})
            if not dataset_payload.get("name"):
                dataset_payload["name"] = sanitize_name(os.path.splitext(filename)[0])
            if not dataset_payload.get("title"):
                dataset_payload["title"] = os.path.splitext(filename)[0]

            # Step 4: Create or update dataset
            dataset_id = self.ckan.create_or_update_dataset(dataset_payload)
            if not dataset_id:
                return False, "Failed to create/update dataset"

            # Step 5: Prepare resource payload
            resource_payload = metadata_template.get("resource", {})
            resource_payload["name"] = filename

            # Step 6: Create or update resource
            if not self.ckan.create_or_update_resource(dataset_id, resource_payload, file_path):
                return False, "Failed to create/update resource"

            # Step 7: Move processed file to completed directory
            shutil.move(file_path, completed_file_path)

            # Step 8: Move metadata file if it exists
            if metadata_template:
                metadata_matches = [
                    f for f in os.listdir(self.config.pending_metadata_dir)
                    if f.startswith(f"metadata_{os.path.splitext(filename)[0]}")
                ]
                if metadata_matches:
                    latest_metadata = max(metadata_matches)
                    metadata_path = os.path.join(self.config.pending_metadata_dir, latest_metadata)
                    completed_metadata_path = os.path.join(self.config.completed_metadata_dir, latest_metadata)
                    shutil.move(metadata_path, completed_metadata_path)
                    self.logger.info(f"Moved metadata file {latest_metadata} to {completed_metadata_path}")
                else:
                    self.logger.warning(f"No metadata file found for {filename}")
            else:
                self.logger.warning("No metadata template provided. Skipping metadata file movement.")

            self.logger.info(f"Successfully processed {filename}")
            return True, "Success"
        except Exception as e:
            error_msg = f"Unexpected error: {str(e)}"
            self.logger.error(f"{error_msg} while processing {filename}")
            return False, error_msg

    def _get_pending_files(self) -> List[str]:
        """Get list of files to process."""
        return [
            f for f in os.listdir(self.config.pending_file_dir)
            if os.path.isfile(os.path.join(self.config.pending_file_dir, f))
            and f.endswith((".csv", ".xlsx"))
        ]

def run():
    """Run the loader to process all pending files."""
    logger = setup_logging()
    config = ConfigLoader()
    ckan_manager = CKANManager(config.ckan_api_url, config.ckan_api_key)

    logger.info("Starting file processing...")
    files = [
        f for f in os.listdir(config.pending_file_dir)
        if os.path.isfile(os.path.join(config.pending_file_dir, f)) and f.endswith((".csv", ".xlsx"))
    ]
    logger.info(f"Found {len(files)} files to process")

    processor = FileProcessor()
    processor.process_files()

if __name__ == "__main__":
    run()

2025-05-29 10:00:30,482 - INFO - Starting file processing...
2025-05-29 10:00:30,482 - INFO - Starting file processing...
2025-05-29 10:00:30,484 - INFO - Found 1 files to process
2025-05-29 10:00:30,484 - INFO - Found 1 files to process
2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\files\Data Zone Lookup - Archived Geographies.csv
2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\files\Data Zone Lookup - Archived Geographies.csv
2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\files\Data Zone Lookup - Archived Geographies.csv
2025-05-29 10:00:30,506 - INFO - Detected encoding: ascii (confidence: 1.00) for ./Pending\files\Data Zone Lookup - Archived Geographies.csv
2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handling
2025-05-29 10:00:30,627 - INFO - Successfully read CSV with ascii encoding and ignore error handl