In [1]:
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [8]:
import os
import pandas as pd
import numpy as np
import logging
import traceback
from typing import List, Dict, Tuple, Optional, Any, Union
from dataclasses import dataclass, field
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import multiprocessing
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("predictive_policing")

# Configuration class using dataclass for type hints and defaults
@dataclass
class Config:
    # Paths
    base_dir: str = "/content/drive/MyDrive/predictive_policing"
    data_dir: str = field(default="data")
    raw_dir: str = field(default="raw")
    processed_dir: str = field(default="processed")
    models_dir: str = field(default="models")
    results_dir: str = field(default="results")

    # Data files
    chicago_crime_file: str = "chicago_crime.csv"
    uci_communities_file: str = "uci_communities.csv"

    # Model parameters
    test_size: float = 0.2
    random_state: int = 42
    target_column: str = "crime_rate"

    # Fairness parameters
    # These are potential protected attributes - will be validated against available columns
    protected_attributes: List[str] = field(default_factory=lambda: [
        "racepctblack", "racePctWhite", "racePctAsian",
        "racePctHisp", "pctWPubAsst", "medIncome"
    ])
    # Alternative demographic proxies if primary attributes aren't available
    demographic_proxies: List[str] = field(default_factory=lambda: [
        "community_area", "community_name", "area_code", "zipcode"
    ])

    # Processing
    n_cores: int = field(default_factory=lambda: max(1, multiprocessing.cpu_count() - 1))

    def __post_init__(self):
        """Initialize derived paths after construction"""
        # Create full paths by joining with base_dir
        self.data_dir = os.path.join(self.base_dir, self.data_dir)
        self.raw_dir = os.path.join(self.data_dir, self.raw_dir)
        self.processed_dir = os.path.join(self.data_dir, self.processed_dir)
        self.models_dir = os.path.join(self.data_dir, self.models_dir)
        self.results_dir = os.path.join(self.data_dir, self.results_dir)

        # Make sure directories exist
        for directory in [self.data_dir, self.raw_dir, self.processed_dir,
                          self.models_dir, self.results_dir]:
            os.makedirs(directory, exist_ok=True)


class DataProcessor:
    """Handles data loading, cleaning, and preprocessing."""

    def __init__(self, config: Config):
        self.config = config
        self.logger = logging.getLogger(f"{__name__}.DataProcessor")

    def load_csv(self, filepath: str) -> pd.DataFrame:
        """Load a CSV file with error handling."""
        try:
            df = pd.read_csv(filepath)
            logger.info(f"Loaded {filepath} with shape {df.shape}")
            return df
        except Exception as e:
            logger.error(f"Error loading {filepath}: {e}")
            # Return an empty DataFrame with a default structure if file not found
            if isinstance(e, FileNotFoundError):
                logger.warning(f"Creating empty DataFrame as {filepath} was not found")
                return pd.DataFrame()
            raise

    def clean_chicago_crime(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and preprocess Chicago crime data."""
        logger.info(f"Cleaning Chicago crime data from {os.path.join(self.config.raw_dir, self.config.chicago_crime_file)}")

        # Check if DataFrame is empty
        if df.empty:
            logger.warning("Empty crime data, creating minimal synthetic data for testing")
            # Create minimal synthetic data
            synthetic_df = pd.DataFrame({
                'id': range(1000),
                'community_area': np.random.randint(1, 78, 1000),
                'primary_type': np.random.choice(['THEFT', 'BATTERY', 'ASSAULT'], 1000),
                'date': pd.date_range(start='2020-01-01', periods=1000)
            })
            return synthetic_df

        # Make a copy to avoid modifying the original
        df = df.copy()

        # Basic cleaning steps
        # 1. Drop duplicates
        df = df.drop_duplicates()

        # 2. Convert date columns if present
        date_columns = [col for col in df.columns if 'date' in col.lower()]
        for col in date_columns:
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
            except:
                pass

        # 3. Handle missing values based on column type
        for col in df.columns:
            # Skip date columns already handled
            if col in date_columns:
                continue

            if df[col].dtype == 'object':
                # For categorical columns, fill with 'Unknown'
                df[col] = df[col].fillna('Unknown')
            else:
                # For numeric columns, fill with median
                df[col] = df[col].fillna(df[col].median() if not pd.isna(df[col].median()) else 0)

        logger.info(f"Cleaned Chicago data shape: {df.shape}")
        return df

    def clean_uci_communities(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean and preprocess UCI Communities data."""
        logger.info(f"Cleaning UCI Communities data from {os.path.join(self.config.raw_dir, self.config.uci_communities_file)}")

        # Check if DataFrame is empty
        if df.empty:
            logger.warning("Empty communities data, creating minimal synthetic data for testing")
            # Create minimal synthetic data
            synthetic_df = pd.DataFrame({
                'community_area': range(1, 101),
                'population': np.random.randint(5000, 100000, 100),
                'median_income': np.random.randint(30000, 100000, 100),
                'community_name': [f"Area_{i}" for i in range(1, 101)]
            })
            return synthetic_df

        # Make a copy to avoid modifying the original
        df = df.copy()

        # Basic cleaning steps
        # 1. Drop duplicates
        df = df.drop_duplicates()

        # 2. Handle missing values
        for col in df.columns:
            if df[col].dtype == 'object':
                # For categorical columns, fill with 'Unknown'
                df[col] = df[col].fillna('Unknown')
            else:
                # For numeric columns, fill with median
                df[col] = df[col].fillna(df[col].median() if not pd.isna(df[col].median()) else 0)

        logger.info(f"Cleaned UCI data shape: {df.shape}")
        return df

    def create_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create features from crime data for modeling."""
        logger.info("Creating features from crime data")

        # Check if DataFrame is empty
        if df.empty:
            logger.warning("Empty DataFrame, cannot create features")
            return df

        # Make a copy to avoid modifying the original
        df = df.copy()

        # Example feature engineering - adjust based on available columns
        # If date column exists, extract time-based features
        date_columns = [col for col in df.columns if 'date' in col.lower()]
        if date_columns:
            date_col = date_columns[0]
            if pd.api.types.is_datetime64_dtype(df[date_col]):
                df['month'] = df[date_col].dt.month
                df['day_of_week'] = df[date_col].dt.dayofweek
                if hasattr(df[date_col].dt, 'hour'):  # Check if time component exists
                    df['hour'] = df[date_col].dt.hour
                df['is_weekend'] = df[date_col].dt.dayofweek.isin([5, 6]).astype(int)

        # If crime type exists, create dummy variables
        if 'primary_type' in df.columns:
            # Get top 5 crime types for dummy variables
            top_crimes = df['primary_type'].value_counts().nlargest(5).index
            for crime in top_crimes:
                crime_col_name = f'is_{crime.lower().replace(" ", "_")}'
                # Ensure no name conflicts
                if crime_col_name not in df.columns:
                    df[crime_col_name] = (df['primary_type'] == crime).astype(int)

        logger.info(f"Created features, new shape: {df.shape}")
        return df

    def merge_datasets(self, crime_df: pd.DataFrame, communities_df: pd.DataFrame) -> pd.DataFrame:
        """Merge crime and communities datasets."""
        logger.info("Merging datasets")

        # Check if either DataFrame is empty
        if crime_df.empty:
            logger.warning("Crime DataFrame is empty, using communities data only")
            # Add a synthetic crime count
            if not communities_df.empty:
                communities_df['crime_count'] = np.random.randint(10, 1000, size=len(communities_df))
                communities_df['crime_rate'] = communities_df['crime_count'] / communities_df['crime_count'].max()
            return communities_df

        if communities_df.empty:
            logger.warning("Communities DataFrame is empty, using crime data only")
            # Aggregate crime data to provide a minimal merged dataset
            if 'community_area' in crime_df.columns:
                # Group by community area and count crimes
                crime_counts = crime_df.groupby('community_area').size().reset_index(name='crime_count')
                crime_counts['crime_rate'] = crime_counts['crime_count'] / crime_counts['crime_count'].max()
                return crime_counts
            return crime_df

        # Identify potential join columns
        possible_join_columns = [
            'community_area', 'community_id', 'area_code',
            'neighborhood', 'district', 'zip_code', 'community'
        ]

        # Find columns present in both datasets
        join_columns = [col for col in possible_join_columns
                        if col in crime_df.columns and col in communities_df.columns]

        # If no common columns found, try to create a join
        if not join_columns:
            logger.warning("No common join columns found. Creating synthetic join.")

            # Create a new unique ID for both datasets
            crime_df = crime_df.copy()
            communities_df = communities_df.copy()

            # If community_area exists in crime data but not communities data,
            # create it in communities data
            if 'community_area' in crime_df.columns and 'community_area' not in communities_df.columns:
                # Get unique community areas from crime data
                unique_areas = crime_df['community_area'].unique()

                # If communities DataFrame has fewer rows than unique areas, expand it
                if len(communities_df) < len(unique_areas):
                    # Duplicate rows to match number of unique areas
                    communities_df = pd.concat([communities_df] * (len(unique_areas) // len(communities_df) + 1))
                    communities_df = communities_df.iloc[:len(unique_areas)].reset_index(drop=True)

                # Assign community areas to communities DataFrame
                communities_df['community_area'] = unique_areas[:len(communities_df)]

                # Now merge on community_area
                merged_df = communities_df.merge(
                    crime_df.groupby('community_area').size().reset_index(name='crime_count'),
                    on='community_area',
                    how='left'
                )
                merged_df['crime_count'] = merged_df['crime_count'].fillna(0)

                # Calculate crime rate
                merged_df['crime_rate'] = merged_df['crime_count'] / merged_df['crime_count'].max()

                logger.info(f"Created synthetic join on community_area, merged shape: {merged_df.shape}")
                return merged_df

            # Otherwise, create a completely synthetic join
            logger.warning("Creating fully synthetic join - for development only")

            # Create a join key in both DataFrames
            n_communities = min(len(communities_df), 100)  # Limit to at most 100 communities
            crime_df['_join_key'] = np.random.randint(0, n_communities, size=len(crime_df))
            communities_df['_join_key'] = range(n_communities)
            communities_df = communities_df.iloc[:n_communities]

            # Group crime data by join key to get crime counts
            crime_counts = crime_df.groupby('_join_key').size().reset_index(name='crime_count')

            # Merge
            merged_df = communities_df.merge(crime_counts, on='_join_key', how='left')

            # Fill missing crime counts with 0
            merged_df['crime_count'] = merged_df['crime_count'].fillna(0)

            # Calculate crime rate
            merged_df['crime_rate'] = merged_df['crime_count'] / merged_df['crime_count'].max()

            # Remove join key
            merged_df = merged_df.drop('_join_key', axis=1)

            logger.info(f"Created fully synthetic join, merged shape: {merged_df.shape}")
            return merged_df

        # Use the first available join column
        join_col = join_columns[0]
        logger.info(f"Joining datasets on '{join_col}'")

        # Check if the join column has unique values in the communities DataFrame
        if not communities_df[join_col].is_unique:
            logger.warning(f"Join column '{join_col}' is not unique in communities data - this may cause data duplication")

        # Aggregate crime data by join column
        crime_grouped = crime_df.groupby(join_col).size().reset_index(name='crime_count')

        # Merge with communities data
        merged_df = communities_df.merge(crime_grouped, on=join_col, how='left')

        # Fill missing crime counts with 0
        merged_df['crime_count'] = merged_df['crime_count'].fillna(0)

        # Calculate crime rate
        if 'population' in merged_df.columns:
            # Ensure population is never 0 to avoid division by zero
            merged_df['population'] = merged_df['population'].replace(0, 1)
            merged_df['crime_rate'] = merged_df['crime_count'] / merged_df['population']
        else:
            # No population data, normalize by maximum crime count
            merged_df['crime_rate'] = merged_df['crime_count'] / merged_df['crime_count'].max()

        logger.info(f"Merged data shape: {merged_df.shape}")
        return merged_df

    def validate_protected_attributes(self, df: pd.DataFrame) -> List[str]:
        """Validate and find protected attributes that exist in the data."""
        # Check which configured protected attributes are in the dataframe
        available_protected = [attr for attr in self.config.protected_attributes
                              if attr in df.columns]

        # If none found, check for demographic proxies
        if not available_protected:
            available_protected = [attr for attr in self.config.demographic_proxies
                                  if attr in df.columns]

            if available_protected:
                logger.info(f"Using proxy demographic columns: {available_protected}")
            else:
                logger.warning("No protected attributes or proxies found in the data")
                # If no attributes found, try to find any numeric column that could serve as proxy
                numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
                if numeric_cols:
                    proxy_col = numeric_cols[0]
                    logger.warning(f"Using '{proxy_col}' as fallback demographic proxy")
                    available_protected = [proxy_col]
        else:
            logger.info(f"Using protected attributes: {available_protected}")

        return available_protected


class ModelTrainer:
    """Handles model training, evaluation, and fairness analysis."""

    def __init__(self, config: Config):
        self.config = config
        self.logger = logging.getLogger(f"{__name__}.ModelTrainer")

    def prepare_data(self, df: pd.DataFrame, target_col: str) -> Tuple[pd.DataFrame, pd.Series]:
        """Prepare features and target for modeling."""
        if df.empty:
            logger.error("Cannot prepare data - DataFrame is empty")
            raise ValueError("Empty DataFrame provided to prepare_data")

        # Create target if it doesn't exist
        if target_col not in df.columns:
            logger.warning(f"Target column '{target_col}' not found in data - creating synthetic target")
            df[target_col] = np.random.random(len(df))

        # Select features - exclude obvious non-predictive columns
        exclude_cols = ['id', 'case_number', 'date', 'block', 'iucr', 'community_name',
                       'latitude', 'longitude', 'beat', 'district', 'ward',
                       target_col]
        feature_cols = [col for col in df.columns if col not in exclude_cols]

        # Handle categorical columns if any
        categorical_cols = df[feature_cols].select_dtypes(include=['object', 'category']).columns.tolist()
        if categorical_cols:
            logger.info(f"Dropping {len(categorical_cols)} categorical columns (would need one-hot encoding)")
            feature_cols = [col for col in feature_cols if col not in categorical_cols]
            if not feature_cols:
                # If all features were categorical, create simple numeric features
                logger.warning("No numeric features available - creating simple index feature")
                df['index_feature'] = range(len(df))
                feature_cols = ['index_feature']

        # Make sure we have some features
        if not feature_cols:
            logger.warning("No usable features found - creating simple index feature")
            df['index_feature'] = range(len(df))
            feature_cols = ['index_feature']

        # Extract features and target
        X = df[feature_cols]
        y = df[target_col]

        logger.info(f"Prepared data with {X.shape[1]} features and {y.shape[0]} samples")
        return X, y

    def train_model(self, X: pd.DataFrame, y: pd.Series, model_type: str = 'xgboost') -> Dict:
        """Train a predictive model."""
        if X.empty or len(y) == 0:
            logger.error("Cannot train model - empty feature matrix or target vector")
            raise ValueError("Empty data provided to train_model")

        logger.info(f"Training {model_type} model")

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.config.test_size, random_state=self.config.random_state
        )

        # Normalize features - handle edge cases and errors
        try:
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
        except Exception as e:
            logger.warning(f"Error scaling features: {e} - using unscaled features")
            X_train_scaled = X_train.to_numpy()
            X_test_scaled = X_test.to_numpy()

        # Train model with error handling
        try:
            if model_type.lower() == 'xgboost':
                model = xgb.XGBRegressor(
                    n_estimators=100,
                    max_depth=5,
                    learning_rate=0.1,
                    random_state=self.config.random_state
                )
            else:
                # Default to Random Forest
                model = RandomForestRegressor(
                    n_estimators=100,
                    max_depth=10,
                    random_state=self.config.random_state
                )

            # Fit model
            model.fit(X_train_scaled, y_train)

            # Evaluate
            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)

            logger.info(f"Model performance - RMSE: {rmse:.4f}, R²: {r2:.4f}")

        except Exception as e:
            logger.error(f"Error training model: {e}")
            logger.info("Switching to backup simple model")

            # Create a simple fallback model (mean predictor)
            class SimpleMeanPredictor:
                def __init__(self, mean_value):
                    self.mean_value = mean_value

                def predict(self, X):
                    return np.full(X.shape[0], self.mean_value)

                def get_feature_importances(self):
                    return np.ones(X.shape[1]) / X.shape[1]

            model = SimpleMeanPredictor(y_train.mean())
            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = 0  # R² for mean predictor is 0

            logger.info(f"Fallback model performance - RMSE: {rmse:.4f}, R²: {r2:.4f}")

        # Attempt to get feature importances
        try:
            if hasattr(model, 'feature_importances_'):
                feature_importances = model.feature_importances_
            elif hasattr(model, 'get_feature_importances'):
                feature_importances = model.get_feature_importances()
            else:
                feature_importances = np.ones(X.shape[1]) / X.shape[1]  # Equal importance
        except:
            feature_importances = np.ones(X.shape[1]) / X.shape[1]  # Equal importance fallback

        # Return model and evaluation metrics
        return {
            'model': model,
            'scaler': scaler if 'scaler' in locals() else None,
            'metrics': {
                'mse': mse,
                'rmse': rmse,
                'r2': r2
            },
            'test_data': {
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred
            },
            'feature_names': X.columns.tolist(),
            'feature_importances': feature_importances
        }

    def analyze_fairness(self, model_results: Dict, protected_attributes: List[str],
                         df: pd.DataFrame) -> Dict:
        """Analyze model fairness across protected attributes."""
        fairness_results = {}

        # If no protected attributes or no model results, return empty results
        if not protected_attributes or not model_results:
            logger.warning("No protected attributes or model results for fairness analysis")
            return fairness_results

        # Get test data
        try:
            X_test = model_results['test_data']['X_test']
            y_test = model_results['test_data']['y_test']
            y_pred = model_results['test_data']['y_pred']
        except KeyError as e:
            logger.error(f"Missing test data for fairness analysis: {e}")
            return fairness_results

        # For each protected attribute
        for attr in protected_attributes:
            # Check if attribute is in test data
            if attr not in X_test.columns:
                logger.warning(f"Protected attribute '{attr}' not in test data")
                continue

            try:
                # Get attribute values
                attr_values = X_test[attr].copy()

                # Discretize continuous attributes for analysis
                if attr_values.dtype.kind in 'fc':  # float or complex
                    # Create categories (quintiles)
                    try:
                        attr_categories = pd.qcut(attr_values, q=5, duplicates='drop')
                    except ValueError:
                        # Fall back to fewer bins if data isn't diverse enough
                        unique_values = attr_values.unique()
                        if len(unique_values) < 5:
                            # Just use the unique values as categories
                            attr_categories = attr_values
                        else:
                            # Try quantiles with fewer bins
                            try:
                                attr_categories = pd.qcut(attr_values, q=3, duplicates='drop')
                            except:
                                # Last resort - just use the raw values
                                attr_categories = attr_values
                else:
                    # Use as-is for categorical
                    attr_categories = attr_values

                # Calculate error by group
                group_errors = {}
                overall_abs_error = np.abs(y_test - y_pred).mean()

                for category in attr_categories.unique():
                    # Get indices for this category
                    idx = attr_categories == category
                    if idx.sum() == 0:
                        continue

                    # Calculate error for this group
                    group_error = np.abs(y_test[idx] - y_pred[idx]).mean()
                    group_errors[str(category)] = group_error

                # Calculate disparity metrics
                if len(group_errors) > 1:
                    # Difference between highest and lowest error
                    error_disparity = max(group_errors.values()) - min(group_errors.values())
                    # Ratio of highest to lowest error (avoid division by zero)
                    min_error = max(min(group_errors.values()), 0.001)
                    error_ratio = max(group_errors.values()) / min_error

                    fairness_results[attr] = {
                        'group_errors': group_errors,
                        'error_disparity': error_disparity,
                        'error_ratio': error_ratio,
                        'overall_error': overall_abs_error
                    }

                    logger.info(f"Fairness analysis for {attr}:")
                    logger.info(f"  Error disparity: {error_disparity:.4f}")
                    logger.info(f"  Error ratio: {error_ratio:.4f}")

                    # Check for unfairness
                    if error_ratio > 1.5:  # Arbitrary threshold
                        logger.warning(f"Potential unfairness detected for {attr} - error ratio: {error_ratio:.2f}")
            except Exception as e:
                logger.error(f"Error analyzing fairness for attribute {attr}: {e}")
                continue

        return fairness_results

    def plot_fairness_results(self, fairness_results: Dict) -> None:
        """Plot fairness analysis results."""
        if not fairness_results:
            logger.info("No fairness results to plot")
            return

        # Create a plot for each protected attribute
        for attr, results in fairness_results.items():
            try:
                # Bar plot of errors by group
                group_errors = results['group_errors']

                plt.figure(figsize=(10, 6))
                groups = list(group_errors.keys())
                errors = list(group_errors.values())

                # Sort by error value
                sorted_pairs = sorted(zip(groups, errors), key=lambda x: x[1])
                groups, errors = zip(*sorted_pairs)

                # Create bar plot
                ax = sns.barplot(x=list(groups), y=list(errors))
                plt.axhline(y=results['overall_error'], color='r', linestyle='--', label='Overall Error')
                plt.xlabel(f'{attr} Group')
                plt.ylabel('Mean Absolute Error')
                plt.title(f'Error by {attr} Group')
                plt.legend()

                # Rotate x-axis labels if there are many groups
                if len(groups) > 4:
                    plt.xticks(rotation=45, ha='right')

                plt.tight_layout()

                # Save plot if needed
                plot_file = os.path.join(self.config.results_dir, f'fairness_{attr}.png')
                plt.savefig(plot_file)
                logger.info(f"Saved fairness plot to {plot_file}")

                # Display in notebook
                plt.show()

            except Exception as e:
                logger.error(f"Error plotting fairness results for {attr}: {e}")

    def plot_feature_importance(self, model_results: Dict) -> None:
        """Plot feature importance."""
        if not model_results or 'feature_importances' not in model_results:
            logger.info("No feature importance to plot")
            return

        try:
            # Get feature names and importances
            feature_names = model_results['feature_names']
            importances = model_results['feature_importances']

            # Sort by importance
            sorted_idx = np.argsort(importances)

            # Plot only top 20 features if there are many
            if len(feature_names) > 20:
                sorted_idx = sorted_idx[-20:]

            plt.figure(figsize=(10, 8))
            plt.barh(range(len(sorted_idx)), importances[sorted_idx])
            plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
            plt.xlabel('Feature Importance')
            plt.title('Top Features by Importance')
            plt.tight_layout()

            # Save plot
            plot_file = os.path.join(self.config.results_dir, 'feature_importance.png')
            plt.savefig(plot_file)
            logger.info(f"Saved feature importance plot to {plot_file}")

            # Display in notebook
            plt.show()

        except Exception as e:
            logger.error(f"Error plotting feature importance: {e}")



def main():
    """Main function to run the pipeline."""
    # Initialize configuration
    config = Config()
    logger.info(f"Using {config.n_cores} CPU cores for processing")
    logger.info("Starting predictive policing pipeline")

    # Initialize processors
    data_processor = DataProcessor(config)

    # Load and process data
    try:
        logger.info("Loading input data")
        # Check which methods are available in the DataProcessor class
        # Based on the available methods, we'll use the appropriate one
        if hasattr(data_processor, 'import_data'):
            raw_data = data_processor.import_data(config.input_path)
        elif hasattr(data_processor, 'get_data'):
            raw_data = data_processor.get_data(config.input_path)
        else:
            # If no suitable method exists, we'll need to implement one
            logger.info("No data loading method found, implementing custom loader")
            raw_data = load_data_from_path(config.input_path)

        logger.info("Preprocessing data")
        processed_data = data_processor.preprocess(raw_data)

        # Initialize model
        logger.info("Initializing prediction model")
        model = PredictionModel(config)

        # Train or load model
        if config.train_model:
            logger.info("Training predictive model")
            model.train(processed_data)
            logger.info(f"Saving model to {config.model_path}")
            model.save(config.model_path)
        else:
            logger.info(f"Loading model from {config.model_path}")
            model.load(config.model_path)

        # Generate predictions
        logger.info("Generating predictions")
        predictions = model.predict(processed_data)

        # Generate reports
        logger.info("Generating analysis reports")
        report_generator = ReportGenerator(config)
        report_generator.create_reports(processed_data, predictions)

        # Export results
        logger.info("Exporting results")
        exporter = ResultExporter(config)
        exporter.export(predictions, config.output_path)

        logger.info("Pipeline completed successfully")
        return True

    except Exception as e:
        logger.error(f"Pipeline failed: {str(e)}")
        logger.debug(traceback.format_exc())
        return False

# Implement a custom data loading function
def load_data_from_path(path):
    """Load data from the specified path.

    Args:
        path (str): Path to the data file

    Returns:
        data: Loaded data object
    """
    logger.info(f"Loading data from {path}")
    # Determine file type based on extension
    if path.endswith('.csv'):
        import pandas as pd
        return pd.read_csv(path)
    elif path.endswith('.json'):
        import json
        with open(path, 'r') as f:
            return json.load(f)
    elif path.endswith('.xlsx') or path.endswith('.xls'):
        import pandas as pd
        return pd.read_excel(path)
    else:
        # Generic file reading
        with open(path, 'r') as f:
            return f.read()

if __name__ == "__main__":
    main()

ERROR:predictive_policing:Pipeline failed: 'Config' object has no attribute 'input_path'
