In [6]:
"""
Medical Data Preprocessing System
"""
import numpy as np
import pandas as pd
import logging
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Configure the root logger once for the whole script: emit INFO and above,
# with each record rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# One entry per dataset to process. Each entry provides:
#   'path'       - location of the CSV file,
#   'target'     - name of the label column inside that CSV,
#   'target_map' - optional mapping from raw label values to 0/1
#                  (None when the column is already encoded as 0/1).
#
# The paths are raw strings on purpose: in an ordinary literal the backslash
# starts an escape sequence, so 'Datasets\breastcancer.csv' would contain a
# backspace character (\b) instead of a path separator, and '\h', '\k', '\d',
# '\l' are invalid escapes that only work by accident (and emit warnings).
# NOTE(review): backslash separators are Windows-style; confirm whether this
# script also has to run on other platforms.
DATASET_CONFIG = [
    {
        'path': r'Datasets\heart.csv',
        # NOTE(review): a recorded run failed on this file with
        # "['target'] not found in axis" -- verify the label column name
        # against the CSV header.
        'target': 'target',
        'target_map': None  # Binary: 0=healthy, 1=disease
    },
    {
        'path': r'Datasets\kidney.csv',
        'target': 'classification',
        'target_map': {'ckd': 1, 'notckd': 0}
    },
    {
        'path': r'Datasets\diabetes.csv',
        'target': 'Outcome',
        'target_map': None  # Binary: 0=negative, 1=positive
    },
    {
        'path': r'Datasets\breastcancer.csv',
        'target': 'diagnosis',
        'target_map': {'M': 1, 'B': 0}
    },
    {
        'path': r'Datasets\liver.csv',
        'target': 'Dataset',
        'target_map': {1: 1, 2: 0}
    }
]

def load_dataset(path: str) -> pd.DataFrame:
    """Read the CSV file at ``path`` into a DataFrame, logging the outcome.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If the file does not exist. The error is logged
            and then re-raised unchanged.
        Exception: Any other error raised while reading is logged and
            re-raised unchanged.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        # A missing file gets its own message so it is easy to spot in logs.
        logging.error(f"Dataset file {path} not found")
        raise
    except Exception as err:
        logging.error(f"Error loading {path}: {str(err)}")
        raise
    else:
        logging.info(f"Loaded {path} with shape {frame.shape}")
        return frame

def preprocess_features(df: pd.DataFrame, target_col: str) -> np.ndarray:
    """Turn every non-target column of ``df`` into a numeric feature matrix.

    Columns are split by dtype and handled by two sub-pipelines:

    * numeric columns: missing values filled with the column median, then
      scaled with ``MinMaxScaler``;
    * all other columns: missing values filled with the most frequent value,
      then one-hot encoded (dense output).

    The transformer is fitted on ``df`` itself and immediately applied to it.

    Args:
        df: Dataset containing the feature columns and ``target_col``.
        target_col: Name of the label column to exclude from the features.

    Returns:
        The result of ``ColumnTransformer.fit_transform`` on the feature
        columns (numeric block listed first, categorical block second).

    Raises:
        KeyError: If ``target_col`` is not a column of ``df`` (raised by
            ``DataFrame.drop``).
    """
    feature_frame = df.drop(columns=[target_col])

    # Partition the columns by dtype; each group gets its own pipeline.
    num_names = feature_frame.select_dtypes(include=np.number).columns.tolist()
    cat_names = feature_frame.select_dtypes(exclude=np.number).columns.tolist()

    num_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ]
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    column_transformer = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=num_steps), num_names),
        ('cat', Pipeline(steps=cat_steps), cat_names),
    ])

    transformed = column_transformer.fit_transform(feature_frame)
    return transformed

def process_target(y: pd.Series, mapping: dict) -> np.ndarray:
    """Encode a label column as an integer 0/1 numpy array.

    Args:
        y: Raw label values.
        mapping: Lookup from raw label value to 0/1, or ``None`` when the
            labels need no translation.

    Returns:
        The labels as an integer numpy array.

    Raises:
        ValueError: If ``mapping`` leaves any label untranslated, or if the
            distinct labels after integer conversion are not exactly
            ``{0, 1}``.
    """
    translated = y if mapping is None else y.map(mapping)

    # Series.map yields NaN for every value absent from the mapping.
    if mapping is not None and translated.isna().any():
        raise ValueError("Invalid target values in mapping")

    encoded = translated.astype(int)
    distinct_labels = set(np.unique(encoded))
    if distinct_labels != {0, 1}:
        raise ValueError("Target must be binary (0/1)")

    return encoded.to_numpy()

def combine_arrays(arrays: list) -> np.ndarray:
    """Stack arrays along axis 0 after checking their trailing dimensions.

    Args:
        arrays: Non-empty list of numpy arrays. Every array must have the
            same shape apart from its first axis.

    Returns:
        A single array containing the inputs concatenated along axis 0, in
        list order.

    Raises:
        ValueError: If ``arrays`` is empty, or if any array's shape beyond
            the first axis differs from that of the first array.
    """
    if len(arrays) == 0:
        raise ValueError("No arrays to combine")

    # The reference shape comes from the first element; the list itself has
    # no ``shape`` attribute.
    base_shape = arrays[0].shape[1:]
    if any(arr.shape[1:] != base_shape for arr in arrays[1:]):
        raise ValueError("All feature arrays must have same dimensions")

    return np.concatenate(arrays, axis=0)

if __name__ == "__main__":
    # Script entry point: process every configured dataset, write one
    # compressed .npz archive per dataset, then write one combined archive.
    all_features, all_targets = [], []

    for config in DATASET_CONFIG:
        try:
            df = load_dataset(config['path'])
            target_name = config['target']

            # Feature matrix and 0/1 label vector for this dataset.
            X = preprocess_features(df, target_name)
            y = process_target(df[target_name], config['target_map'])

            all_features.append(X)
            all_targets.append(y)

            # The per-dataset archive sits next to the source CSV.
            output_path = config['path'].replace('.csv', '_processed.npz')
            np.savez_compressed(output_path, features=X, target=y)

        except Exception as e:
            # Record which dataset failed, then abort the whole run.
            logging.error(f"Failed processing {config['path']}: {str(e)}")
            raise

    # Merge the per-dataset results into one feature matrix / label vector.
    try:
        X_combined = combine_arrays(all_features)
        y_combined = np.concatenate(all_targets)

        np.savez_compressed(
            'combined_medical_data.npz',
            features=X_combined,
            target=y_combined,
        )
    except ValueError as e:
        logging.error(f"Data combination failed: {str(e)}")
        raise
    else:
        logging.info(f"Combined dataset shape: Features {X_combined.shape}, Targets {y_combined.shape}")

  'path': 'Datasets\heart.csv',
  'path': 'Datasets\kidney.csv',
  'path': 'Datasets\diabetes.csv',
  'path': 'Datasets\liver.csv',
2025-05-03 18:05:55,329 - INFO - Loaded Datasets\heart.csv with shape (918, 12)
2025-05-03 18:05:55,330 - ERROR - Failed processing Datasets\heart.csv: "['target'] not found in axis"
  'path': 'Datasets\heart.csv',
  'path': 'Datasets\kidney.csv',
  'path': 'Datasets\diabetes.csv',
  'path': 'Datasets\liver.csv',


KeyError: "['target'] not found in axis"