In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
"""
Evolutionary EDA & Feature Engineering - FIXED
Dataset: Adult Income (has missing values, mixed types, needs feature engineering)
"""

import copy
import random
from collections import namedtuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from ucimlrepo import fetch_ucirepo

# ----------------------------
# EDA Configuration Space
# ----------------------------

def random_eda_config():
    """Draw one random point from the EDA/preprocessing search space.

    Every key is sampled independently and uniformly from its option list.
    'drop' is not offered for imputation (it caused index issues) and
    one-hot is not offered for encoding (it caused inconsistency issues).
    """
    # (key, options) pairs; the order fixes both the order of the random
    # draws and the key order of the returned dict.
    search_space = (
        # Missing value handling
        ('numeric_impute', ['mean', 'median', 'mode']),
        ('categorical_impute', ['mode', 'constant']),
        # Outlier handling
        ('outlier_method', ['none', 'iqr', 'zscore', 'clip']),
        ('outlier_threshold', [1.5, 2.0, 3.0]),
        # Feature engineering
        ('create_interactions', [True, False]),
        ('create_ratios', [True, False]),
        ('create_binning', [True, False]),
        ('n_bins', [3, 5, 10]),
        # Encoding
        ('categorical_encoding', ['label', 'frequency']),
        # Scaling
        ('scaling', ['standard', 'minmax', 'none']),
        # Feature selection after engineering
        ('feature_selection', [True, False]),
        ('selection_threshold', [0.01, 0.05, 0.1]),
    )
    return {key: random.choice(options) for key, options in search_space}

def config_to_str(config):
    """Render a configuration as a compact, '|'-separated one-line summary."""

    def flag(key, letter):
        # Feature-engineering switches show as a single letter when enabled
        # and as an empty field when disabled.
        return letter if config[key] else ''

    fields = (
        f"NumImp:{config['numeric_impute'][:3]}",
        f"CatImp:{config['categorical_impute'][:3]}",
        f"Out:{config['outlier_method']}",
        f"FE:{flag('create_interactions', 'I')}",
        flag('create_ratios', 'R'),
        flag('create_binning', 'B'),
        f"Enc:{config['categorical_encoding'][:3]}",
        f"Scl:{config['scaling'][:3]}",
        f"FS:{config['feature_selection']}",
    )
    return '|'.join(fields)

# ----------------------------
# EDA Pipeline
# ----------------------------

class EDA_Pipeline:
    """Config-driven preprocessing pipeline for a mixed-type DataFrame.

    The steps run in this order: missing-value imputation, outlier clipping,
    categorical encoding, feature engineering, scaling and (optional)
    feature selection.

    Every statistic the steps depend on (imputation values, outlier bounds,
    encoders, bin edges, scaler parameters, selected feature names) is
    learned in ``fit_transform`` and re-applied unchanged in ``transform``,
    so validation/test rows are processed exactly like training rows.
    """

    def __init__(self, config):
        """Store *config* (a dict as produced by ``random_eda_config``)."""
        self.config = config
        self.numeric_imputer = None
        self.categorical_imputer = None
        self.scaler = None
        # col -> LabelEncoder ('label' encoding) or col -> frequency dict
        # ('frequency' encoding).
        self.label_encoders = {}
        self.feature_names = []
        self.selected_features = None
        self.numeric_cols = []
        self.categorical_cols = []
        self.rf_model = None
        # col -> (lower, upper) clip bounds learned on the fit data.
        self.outlier_bounds = {}
        # col -> array of bin edges learned on the fit data.
        self.bin_edges = {}

    def identify_column_types(self, X):
        """Record which columns of *X* are numeric and which are not."""
        self.numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    def handle_missing_values(self, X, fit=True):
        """Impute missing values in numeric and categorical columns.

        With ``fit=True`` new imputers are fitted on *X*; otherwise the
        previously fitted imputers are applied. Returns a modified copy.
        """
        X = X.copy()

        # Numeric imputation
        if len(self.numeric_cols) > 0:
            if fit:
                strategy = self.config['numeric_impute']
                if strategy == 'mode':
                    # SimpleImputer's name for the mode strategy.
                    strategy = 'most_frequent'
                self.numeric_imputer = SimpleImputer(strategy=strategy)
                X[self.numeric_cols] = self.numeric_imputer.fit_transform(X[self.numeric_cols])
            else:
                X[self.numeric_cols] = self.numeric_imputer.transform(X[self.numeric_cols])

        # Categorical imputation
        if len(self.categorical_cols) > 0:
            if fit:
                if self.config['categorical_impute'] == 'mode':
                    self.categorical_imputer = SimpleImputer(strategy='most_frequent')
                else:  # constant
                    self.categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
                X[self.categorical_cols] = self.categorical_imputer.fit_transform(X[self.categorical_cols])
            else:
                X[self.categorical_cols] = self.categorical_imputer.transform(X[self.categorical_cols])

        return X

    def _compute_outlier_bounds(self, series):
        """Return ``(lower, upper)`` clip bounds for *series*, or ``None`` to skip it."""
        method = self.config['outlier_method']

        if method == 'iqr':
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            spread = self.config['outlier_threshold'] * (q3 - q1)
            return q1 - spread, q3 + spread

        if method == 'zscore':
            mean = series.mean()
            std = series.std()
            # A zero (or NaN) standard deviation gives no usable band.
            if not std > 0:
                return None
            spread = self.config['outlier_threshold'] * std
            return mean - spread, mean + spread

        if method == 'clip':
            return series.quantile(0.01), series.quantile(0.99)

        return None

    def handle_outliers(self, X, fit=True):
        """Clip numeric columns to per-column outlier bounds.

        With ``fit=True`` the bounds are computed from *X* and remembered.
        With ``fit=False`` the remembered bounds are reused, so new data is
        clipped to the range learned on the fit data instead of to its own
        statistics. Returns a modified copy.
        """
        X = X.copy()

        if self.config['outlier_method'] == 'none' or len(self.numeric_cols) == 0:
            return X

        if fit:
            self.outlier_bounds = {}

        for col in self.numeric_cols:
            if fit:
                bounds = self._compute_outlier_bounds(X[col])
                if bounds is not None:
                    self.outlier_bounds[col] = bounds
            else:
                bounds = self.outlier_bounds.get(col)

            if bounds is None:
                continue
            lower, upper = bounds
            X[col] = X[col].clip(lower, upper)

        return X

    def _bin_column(self, values, col, fit):
        """Return equal-width bin indices for *values*, or ``None`` if binning is skipped.

        Edges are learned when ``fit`` is true and reused otherwise. Values
        outside the learned range are clipped into the outermost bins.
        """
        try:
            if fit:
                binned, edges = pd.cut(
                    values,
                    bins=self.config['n_bins'],
                    labels=False,
                    retbins=True,
                    duplicates='drop',
                )
                self.bin_edges[col] = edges
                return binned

            edges = self.bin_edges.get(col)
            if edges is None:
                # Binning was skipped for this column at fit time.
                return None
            bounded = values.clip(edges[0], edges[-1])
            return pd.cut(bounded, bins=edges, labels=False, include_lowest=True, duplicates='drop')
        except (ValueError, TypeError):
            # Best effort: a column that cannot be binned is left out.
            return None

    def feature_engineering(self, X, fit=True):
        """Add interaction, ratio and binned features according to the config.

        Only the first five numeric columns take part (first three for
        binning) to keep the number of derived features bounded. With
        ``fit=False`` the bin edges learned during fitting are reused.
        Returns a modified copy.
        """
        X = X.copy()

        # Get current numeric columns (after encoding)
        current_numeric = X.select_dtypes(include=[np.number]).columns.tolist()

        if len(current_numeric) < 2:
            return X

        # Limit to first few columns to avoid explosion
        cols_to_use = current_numeric[:5]
        pairs = [
            (first, second)
            for idx, first in enumerate(cols_to_use)
            for second in cols_to_use[idx + 1:]
        ]

        # Interaction features
        if self.config['create_interactions']:
            for col1, col2 in pairs:
                X[f'{col1}_x_{col2}'] = X[col1] * X[col2]

        # Ratio features
        if self.config['create_ratios']:
            for col1, col2 in pairs:
                # The small offset keeps the denominator away from zero.
                X[f'{col1}_div_{col2}'] = X[col1] / (X[col2].abs() + 1e-5)

        # Binning
        if self.config['create_binning']:
            if fit:
                self.bin_edges = {}
            for col in cols_to_use[:3]:
                binned = self._bin_column(X[col], col, fit)
                if binned is not None:
                    X[f'{col}_binned'] = binned

        return X

    def encode_categorical(self, X, fit=True):
        """Encode categorical columns as numbers ('label' or 'frequency').

        With ``fit=False`` the stored encoders are reused. For label
        encoding an unseen category is mapped to the encoder's first class;
        for frequency encoding it is mapped to 0. Returns a modified copy.
        """
        X = X.copy()

        if len(self.categorical_cols) == 0:
            return X

        if self.config['categorical_encoding'] == 'label':
            for col in self.categorical_cols:
                if fit:
                    le = LabelEncoder()
                    X[col] = le.fit_transform(X[col].astype(str))
                    self.label_encoders[col] = le
                    continue

                le = self.label_encoders.get(col)
                if le is None:
                    continue
                # Handle unseen categories; a set makes the per-row
                # membership test O(1).
                known = set(le.classes_)
                fallback = le.classes_[0]
                as_text = X[col].astype(str).map(
                    lambda value: value if value in known else fallback
                )
                X[col] = le.transform(as_text)

        elif self.config['categorical_encoding'] == 'frequency':
            for col in self.categorical_cols:
                if fit:
                    freq_map = X[col].value_counts(normalize=True).to_dict()
                    self.label_encoders[col] = freq_map
                else:
                    freq_map = self.label_encoders.get(col, {})
                X[col] = X[col].map(freq_map).fillna(0)

        return X

    def scale_features(self, X, fit=True):
        """Scale all numeric columns with the configured scaler.

        With ``fit=True`` a new scaler is fitted on *X*; otherwise the
        stored scaler is applied. Returns a modified copy.
        """
        X = X.copy()

        mode = self.config['scaling']
        if mode == 'none':
            return X

        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        if len(numeric_cols) == 0:
            return X

        if mode == 'standard':
            scaler_cls = StandardScaler
        elif mode == 'minmax':
            scaler_cls = MinMaxScaler
        else:
            # Unknown scaling mode: leave the data untouched.
            return X

        if fit:
            self.scaler = scaler_cls()
            X[numeric_cols] = self.scaler.fit_transform(X[numeric_cols])
        else:
            X[numeric_cols] = self.scaler.transform(X[numeric_cols])

        return X

    def select_features(self, X, y, fit=True):
        """Keep only features a Random Forest considers important.

        With ``fit=True`` a forest is trained on ``(X, y)`` and every
        feature whose importance reaches ``selection_threshold`` is kept.
        With ``fit=False`` the remembered selection is applied to whichever
        of those columns are present in *X*.
        """
        X = X.copy()

        if not self.config['feature_selection']:
            return X

        if fit:
            # Train RF to get feature importances
            self.rf_model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42, n_jobs=-1)
            self.rf_model.fit(X, y)

            importances = pd.Series(self.rf_model.feature_importances_, index=X.columns)
            threshold = self.config['selection_threshold']
            self.selected_features = importances[importances >= threshold].index.tolist()

            # If fewer than 5 features pass the threshold, fall back to the
            # (up to) 10 most important ones.
            if len(self.selected_features) < 5:
                self.selected_features = importances.nlargest(10).index.tolist()

            return X[self.selected_features]

        # Use only selected features
        if self.selected_features:
            available_features = [f for f in self.selected_features if f in X.columns]
            if len(available_features) > 0:
                return X[available_features]
        return X

    @staticmethod
    def _replace_non_finite(X):
        """Return *X* with +/-inf and NaN replaced by 0."""
        return X.replace([np.inf, -np.inf], np.nan).fillna(0)

    def fit_transform(self, X, y):
        """Fit every step on the training data and return ``(X_array, y_array)``."""
        # Ensure X is a DataFrame
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)

        X = X.reset_index(drop=True)
        y = pd.Series(y).reset_index(drop=True)

        self.identify_column_types(X)
        X = self.handle_missing_values(X, fit=True)
        X = self.handle_outliers(X, fit=True)
        X = self.encode_categorical(X, fit=True)
        X = self.feature_engineering(X, fit=True)
        X = self.scale_features(X, fit=True)
        X = self.select_features(X, y.values, fit=True)

        X = self._replace_non_finite(X)

        return X.values, y.values

    def transform(self, X):
        """Apply the fitted steps to validation/test data and return an array."""
        # Ensure X is a DataFrame
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)

        X = X.reset_index(drop=True)

        X = self.handle_missing_values(X, fit=False)
        X = self.handle_outliers(X, fit=False)
        X = self.encode_categorical(X, fit=False)
        X = self.feature_engineering(X, fit=False)
        X = self.scale_features(X, fit=False)

        # Handle feature selection
        if self.config['feature_selection'] and self.selected_features:
            # Add any selected feature that is missing here as zeros so the
            # output has exactly the columns the model was trained on.
            for f in self.selected_features:
                if f not in X.columns:
                    X[f] = 0

            X = X[self.selected_features]

        X = self._replace_non_finite(X)

        return X.values

# ----------------------------
# Data Loading
# ----------------------------

def load_adult_income_data():
    """Load the Adult Income dataset and split it 70/15/15.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the X
        parts are feature DataFrames and the y parts are 0/1 arrays
        (1 means income above 50K). Both splits are stratified on the target.
    """
    adult = fetch_ucirepo(id=2)
    X = adult.data.features
    y = adult.data.targets

    # Convert target to binary. Part of the dataset spells the labels with a
    # trailing period ('>50K.' / '<=50K.'), so normalize before comparing;
    # otherwise those positive rows are silently counted as negatives.
    income = y['income'].astype(str).str.strip().str.rstrip('.')
    y = (income == '>50K').astype(int).values

    # Split data: 70% train, then the remaining 30% halved into val/test.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    return X_train, X_val, X_test, y_train, y_val, y_test

# ----------------------------
# Model Training & Evaluation
# ----------------------------

def train_and_evaluate(X_train, y_train, X_val, y_val):
    """Fit a class-balanced logistic regression and score it on validation data.

    Returns ``0.7 * accuracy + 0.3 * F1`` on the validation set, or ``0.0``
    (after printing the error) if training or scoring raises.
    """
    try:
        classifier = LogisticRegression(
            max_iter=1000,
            random_state=42,
            n_jobs=-1,
            class_weight='balanced',
        )
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_val)

        # Blend accuracy and F1 into a single fitness value.
        accuracy = accuracy_score(y_val, predictions)
        f1 = f1_score(y_val, predictions, zero_division=0)
        return 0.7 * accuracy + 0.3 * f1
    except Exception as e:
        print(f"  Error in training: {e}")
        return 0.0

# ----------------------------
# Evolutionary Algorithm
# ----------------------------

# One member of the population: the EDA config dict, the fitness score
# returned by train_and_evaluate for it, and the EDA_Pipeline built from it.
Individual = namedtuple('Individual', ['config', 'fitness', 'pipeline'])

def mutate_config(config):
    """Return a deep copy of *config* with one randomly chosen field re-drawn.

    Boolean switches are flipped; every other field is re-sampled from its
    option list (so it may end up with the same value it already had).
    """
    # Candidate fields, in the order they are offered to the random pick.
    fields = [
        'numeric_impute', 'categorical_impute', 'outlier_method', 'outlier_threshold',
        'create_interactions', 'create_ratios', 'create_binning', 'n_bins',
        'categorical_encoding', 'scaling', 'feature_selection', 'selection_threshold'
    ]
    # Fields that are re-sampled; any field not listed here is a boolean toggle.
    options = {
        'numeric_impute': ['mean', 'median', 'mode'],
        'categorical_impute': ['mode', 'constant'],
        'outlier_method': ['none', 'iqr', 'zscore', 'clip'],
        'outlier_threshold': [1.5, 2.0, 3.0],
        'n_bins': [3, 5, 10],
        'categorical_encoding': ['label', 'frequency'],
        'scaling': ['standard', 'minmax', 'none'],
        'selection_threshold': [0.01, 0.05, 0.1],
    }

    mutated = copy.deepcopy(config)
    target = random.choice(fields)

    if target in options:
        mutated[target] = random.choice(options[target])
    else:
        mutated[target] = not mutated[target]

    return mutated

def evolve(population, X_train, y_train, X_val, y_val, args):
    """Produce the next generation from *population*.

    The top ``args.elitism`` fraction (at least one individual) is carried
    over unchanged. The rest is filled with mutated copies of tournament
    winners, each evaluated by fitting a fresh pipeline and model. When a
    child cannot be processed, its parent takes the slot instead.
    """

    def score(individual):
        # Individuals without a fitness rank as 0.0.
        return 0.0 if individual.fitness is None else individual.fitness

    ranked = sorted(population, key=score, reverse=True)

    # Elitism
    n_elite = max(1, int(args.elitism * len(ranked)))
    offspring = list(ranked[:n_elite])

    # Generate children
    while len(offspring) < args.pop_size:
        # Tournament selection
        contenders = random.sample(ranked, k=min(args.tournament_k, len(ranked)))
        parent = max(contenders, key=score)

        # Mutate
        child_config = mutate_config(parent.config)

        # Apply EDA pipeline, then train and evaluate
        try:
            pipeline = EDA_Pipeline(child_config)
            X_train_proc, y_train_proc = pipeline.fit_transform(X_train.copy(), y_train.copy())
            X_val_proc = pipeline.transform(X_val.copy())

            fitness = train_and_evaluate(X_train_proc, y_train_proc, X_val_proc, y_val)

            offspring.append(Individual(config=child_config, fitness=fitness, pipeline=pipeline))

        except Exception as e:
            print(f"  Error processing child: {e}")
            # Add parent instead to maintain population size
            if len(offspring) < args.pop_size:
                offspring.append(parent)

    return offspring[:args.pop_size]

# ----------------------------
# Main Evolution
# ----------------------------

def run_evo_eda(args):
    """Run the evolutionary search and report the best configuration.

    Args:
        args: object with ``pop_size``, ``generations``, ``elitism`` and
            ``tournament_k`` attributes.

    Returns:
        The best ``Individual`` found (config, fitness, pipeline).

    Raises:
        RuntimeError: if no individual of the initial population could be
            built and evaluated.
    """

    def score(individual):
        # Individuals without a fitness rank as 0.0.
        return individual.fitness if individual.fitness is not None else 0.0

    print("="*60)
    print("Evolutionary EDA & Feature Engineering")
    print("Dataset: Adult Income (Census)")
    print("="*60)

    # Load data
    X_train, X_val, X_test, y_train, y_val, y_test = load_adult_income_data()
    print(f"\nData loaded:")
    print(f"  Train: {len(X_train)} samples, {X_train.shape[1]} features")
    print(f"  Val: {len(X_val)} samples")
    print(f"  Test: {len(X_test)} samples")
    print(f"  Missing values: {X_train.isnull().sum().sum()}")
    print(f"  Class distribution: {np.bincount(y_train)}")

    # Initialize population
    population = []
    print(f"\nInitializing population (size={args.pop_size})...")
    for i in range(args.pop_size):
        config = random_eda_config()

        try:
            pipeline = EDA_Pipeline(config)
            X_train_proc, y_train_proc = pipeline.fit_transform(X_train.copy(), y_train.copy())
            X_val_proc = pipeline.transform(X_val.copy())

            fitness = train_and_evaluate(X_train_proc, y_train_proc, X_val_proc, y_val)

            population.append(Individual(config=config, fitness=fitness, pipeline=pipeline))
            print(f"  Init {i+1}/{args.pop_size}: Score={fitness:.4f} | {config_to_str(config)}")

        except Exception as e:
            print(f"  Init {i+1}/{args.pop_size}: FAILED - {e}")

    # Without at least one working individual there is nothing to evolve
    # or to evaluate; fail here with a clear message.
    if not population:
        raise RuntimeError("Initialization failed for every individual; nothing to evolve")

    # Start from the best initial individual so the final evaluation also
    # works when args.generations is 0.
    best = max(population, key=score)

    # Evolution
    for gen in range(1, args.generations + 1):
        print(f"\n{'='*60}")
        print(f"Generation {gen}/{args.generations}")
        print(f"{'='*60}")

        population = evolve(population, X_train, y_train, X_val, y_val, args)
        population = sorted(population, key=score, reverse=True)

        best = population[0]
        print(f"  Best Score: {best.fitness:.4f}")
        print(f"  Config: {config_to_str(best.config)}")

    # Final test
    print(f"\n{'='*60}")
    print("FINAL EVALUATION")
    print(f"{'='*60}")

    # Refit the winning pipeline on the training data, then apply it to the
    # held-out sets.
    X_train_final, y_train_final = best.pipeline.fit_transform(X_train.copy(), y_train.copy())
    X_test_final = best.pipeline.transform(X_test.copy())

    model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
    model.fit(X_train_final, y_train_final)

    y_pred_val = model.predict(best.pipeline.transform(X_val.copy()))
    y_pred_test = model.predict(X_test_final)

    val_acc = accuracy_score(y_val, y_pred_val)
    val_f1 = f1_score(y_val, y_pred_val, zero_division=0)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test, zero_division=0)

    print(f"\nBest Configuration:")
    for key, value in best.config.items():
        print(f"  {key}: {value}")

    print(f"\nFinal Performance:")
    print(f"  Validation Accuracy: {val_acc:.4f}")
    print(f"  Validation F1: {val_f1:.4f}")
    print(f"  Test Accuracy: {test_acc:.4f}")
    print(f"  Test F1: {test_f1:.4f}")
    print(f"  Final Features: {X_train_final.shape[1]}")

    return best

# ----------------------------
# Run
# ----------------------------

class Args:
    """Settings for the evolutionary search (read by run_evo_eda and evolve)."""
    pop_size = 8        # number of individuals per generation
    generations = 5     # number of evolve() rounds after initialization
    elitism = 0.25      # fraction of the population carried over unchanged
    tournament_k = 3    # individuals sampled per tournament selection

args = Args()
# NOTE: run_evo_eda returns the best Individual (config, fitness, pipeline),
# not only its config dict.
best_config = run_evo_eda(args)

Evolutionary EDA & Feature Engineering
Dataset: Adult Income (Census)

Data loaded:
  Train: 34189 samples, 14 features
  Val: 7326 samples
  Test: 7327 samples
  Missing values: 1587
  Class distribution: [28700  5489]

Initializing population (size=8)...
  Init 1/8: Score=0.6717 | NumImp:med|CatImp:con|Out:none|FE:||B|Enc:fre|Scl:min|FS:True
  Init 2/8: Score=0.6549 | NumImp:med|CatImp:con|Out:zscore|FE:I|R||Enc:lab|Scl:sta|FS:True
  Init 3/8: Score=0.6872 | NumImp:med|CatImp:con|Out:zscore|FE:I|||Enc:fre|Scl:sta|FS:True
  Init 4/8: Score=0.6762 | NumImp:med|CatImp:con|Out:zscore|FE:I||B|Enc:fre|Scl:sta|FS:True
  Init 5/8: Score=0.6630 | NumImp:mea|CatImp:mod|Out:clip|FE:||B|Enc:lab|Scl:sta|FS:False
  Init 6/8: Score=0.5607 | NumImp:mod|CatImp:con|Out:iqr|FE:|R|B|Enc:fre|Scl:non|FS:False
  Init 7/8: Score=0.6370 | NumImp:med|CatImp:con|Out:none|FE:I||B|Enc:lab|Scl:sta|FS:True
  Init 8/8: Score=0.6678 | NumImp:mea|CatImp:con|Out:none|FE:|R||Enc:fre|Scl:min|FS:True

Generation 1/5
  Be