# Baseline Experiments & GA Feature Selection Test

This notebook:
1. Loads the cleaned diabetes dataset
2. Runs baseline classification with all features
3. Tests the GA feature selection implementation
4. Compares results

In [31]:
# Import required libraries
import numpy as np
import sys
import os
import importlib

# Add project root to path
sys.path.insert(0, os.path.abspath('..'))

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Import and reload modules to pick up any changes
from src.models import classifier
from src import ga

# Reload to pick up changes
importlib.reload(classifier)
importlib.reload(ga.fitness)
importlib.reload(ga.operators)
importlib.reload(ga.genetic_algorithm)
importlib.reload(ga)

from src.models.classifier import get_decision_tree
from src.ga import GeneticAlgorithm

print("All imports successful!")

All imports successful!


## 1. Load Cleaned Diabetes Dataset

**Fixed data loading to prevent data leakage:**
- ✅ Excludes `PatientID` (non-predictive identifier)
- ✅ Excludes `Diagnosis` (target variable - prevents leakage!)
- ✅ Excludes `DoctorInCharge` (non-predictive identifier)
- ✅ Properly stores feature names for CSV export
- **Result: 48 valid predictive features** (not 51)

In [32]:
import pandas as pd

# Load with pandas to properly handle columns
data_path = '../data/processed/cleaned_diabetes_data.csv'
df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
print(f"All columns ({len(df.columns)}): {list(df.columns)}")

# Define columns to EXCLUDE (non-predictive or target)
exclude_cols = ['PatientID', 'Diagnosis', 'DoctorInCharge']

# Separate features and target
X = df.drop(columns=exclude_cols).values
y = df['Diagnosis'].values

# Get feature names for later use (IMPORTANT: store globally)
feature_names = df.drop(columns=exclude_cols).columns.tolist()

print(f"\nExcluded columns: {exclude_cols}")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"Unique classes: {np.unique(y)}")
print(f"Class distribution: {np.bincount(y.astype(int))}")
print(f"\nFeature names ({len(feature_names)}): {feature_names}")

Dataset shape: (1879, 52)
All columns (52): ['PatientID', 'Age', 'Gender', 'Ethnicity', 'SocioeconomicStatus', 'EducationLevel', 'BMI', 'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'FamilyHistoryDiabetes', 'GestationalDiabetes', 'PolycysticOvarySyndrome', 'PreviousPreDiabetes', 'Hypertension', 'SystolicBP', 'DiastolicBP', 'FastingBloodSugar', 'HbA1c', 'SerumCreatinine', 'BUNLevels', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'AntihypertensiveMedications', 'Statins', 'AntidiabeticMedications', 'FrequentUrination', 'ExcessiveThirst', 'UnexplainedWeightLoss', 'FatigueLevels', 'BlurredVision', 'SlowHealingSores', 'TinglingHandsFeet', 'QualityOfLifeScore', 'HeavyMetalsExposure', 'OccupationalExposureChemicals', 'WaterQuality', 'MedicalCheckupsFrequency', 'MedicationAdherence', 'HealthLiteracy', 'Diagnosis', 'DoctorInCharge', 'Total_Symptoms', 'Polyuria_Polydipsia', 'Skin_Issues', 'Neuro_Symptoms', 'High_Risk_Com

In [33]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Train class distribution: {np.bincount(y_train.astype(int))}")
print(f"Test class distribution: {np.bincount(y_test.astype(int))}")

Training set: 1503 samples
Test set: 376 samples
Train class distribution: [901 602]
Test class distribution: [226 150]


## 2. Baseline: Decision Tree with All Features

In [34]:
# Train baseline classifier with all features
baseline_clf = get_decision_tree(max_depth=10, min_samples_split=5, random_state=42)
baseline_clf.fit(X_train, y_train)

# Predictions
y_pred_baseline = baseline_clf.predict(X_test)

# Metrics
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_f1 = f1_score(y_test, y_pred_baseline, average='weighted')  # weighted for multiclass

print("=" * 60)
print("BASELINE: All Features")
print("=" * 60)
print(f"Number of features used: {X_train.shape[1]}")
print(f"Test Accuracy: {baseline_accuracy:.4f}")
print(f"Test F1-Score (weighted): {baseline_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_baseline))

BASELINE: All Features
Number of features used: 49
Test Accuracy: 0.9043
Test F1-Score (weighted): 0.9045

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       226
           1       0.87      0.89      0.88       150

    accuracy                           0.90       376
   macro avg       0.90      0.90      0.90       376
weighted avg       0.90      0.90      0.90       376



## 3. GA Feature Selection (Small Test Run)

In [35]:
# Create classifier factory for GA
classifier_factory = lambda: get_decision_tree(max_depth=10, min_samples_split=5, random_state=42)

# Configure GA (small test run)
ga = GeneticAlgorithm(
    n_features=X_train.shape[1],
    population_size=20,  # Small for quick test
    n_generations=10,    # Short test run
    classifier_factory=classifier_factory,
    selection_method='tournament',
    tournament_size=3,
    crossover_method='uniform',
    crossover_prob=0.8,
    mutation_method='bit_flip',
    mutation_prob=0.01,
    adaptive_mutation=False,
    elitism_rate=0.1,
    fitness_cfg={
        'accuracy_weight': 0.7,
        'feature_reduction_weight': 0.3,
        'penalty_threshold': 0.95
    },
    eval_cfg={
        'cv_folds': 3,  # Reduced for speed
        'random_state': 42,
        'metrics': ['accuracy', 'f1_score']
    },
    random_state=42
)

print("GA Configuration:")
print(f"  Population size: {ga.population_size}")
print(f"  Generations: {ga.n_generations}")
print(f"  Selection: {ga.selection_method}")
print(f"  Crossover: {ga.crossover_method}")
print(f"  Mutation: {ga.mutation_method}")
print(f"  Elitism rate: {ga.elitism_rate}")
print(f"  CV folds: {ga.eval_cfg['cv_folds']}")

GA Configuration:
  Population size: 20
  Generations: 10
  Selection: tournament
  Crossover: uniform
  Mutation: bit_flip
  Elitism rate: 0.1
  CV folds: 3


In [36]:
# Run GA feature selection
print("\nRunning GA Feature Selection...")
print("=" * 60)

best_mask, history = ga.evolve(X_train, y_train)

print("=" * 60)
print("\nGA Evolution Complete!")
print(f"Best feature subset: {np.sum(best_mask)} features selected out of {len(best_mask)}")
print(f"Feature reduction: {(1 - np.sum(best_mask)/len(best_mask)) * 100:.1f}%")
print(f"Final best fitness: {history['best_fitness'][-1]:.4f}")


Running GA Feature Selection...
Generation 1/10: Best Fitness = 0.7572, Avg Fitness = 0.6025, Features = 23/49
Generation 1/10: Best Fitness = 0.7572, Avg Fitness = 0.6025, Features = 23/49
Generation 10/10: Best Fitness = 0.8254, Avg Fitness = 0.8171, Features = 13/49
Generation 10/10: Best Fitness = 0.8254, Avg Fitness = 0.8171, Features = 13/49

GA Evolution Complete!
Best feature subset: 13 features selected out of 49
Feature reduction: 73.5%
Final best fitness: 0.8254

GA Evolution Complete!
Best feature subset: 13 features selected out of 49
Feature reduction: 73.5%
Final best fitness: 0.8254


In [37]:
# Enforce minimum number of selected features (at least 20)
min_features = 20
best_mask_adjusted = best_mask.copy()
selected_count = int(np.sum(best_mask_adjusted))

if selected_count < min_features:
    all_indices = np.arange(len(best_mask_adjusted))
    not_selected = all_indices[best_mask_adjusted == 0]
    need_to_add = min_features - selected_count
    best_mask_adjusted[not_selected[:need_to_add]] = 1
    print(f"Adjusted mask from {selected_count} to {np.sum(best_mask_adjusted)} selected features to meet minimum.")
else:
    print(f"Mask already has {selected_count} selected features (>= {min_features}).")

Adjusted mask from 13 to 20 selected features to meet minimum.


## 4. Train Final Model with GA-Selected Features

In [38]:
# Apply adjusted feature mask to data
X_train_selected = X_train[:, best_mask_adjusted == 1]
X_test_selected = X_test[:, best_mask_adjusted == 1]

print(f"Selected features shape: {X_train_selected.shape}")

# Train classifier on adjusted selected features
ga_clf = get_decision_tree(max_depth=10, min_samples_split=5, random_state=42)
ga_clf.fit(X_train_selected, y_train)

# Predictions
y_pred_ga = ga_clf.predict(X_test_selected)

# Metrics (multiclass, weighted F1)
ga_accuracy = accuracy_score(y_test, y_pred_ga)
ga_f1 = f1_score(y_test, y_pred_ga, average='weighted')

print("\n" + "=" * 60)
print("GA-SELECTED FEATURES (ADJUSTED, MIN 20)")
print("=" * 60)
print(f"Number of features used: {X_train_selected.shape[1]} (vs {X_train.shape[1]} baseline)")
print(f"Feature reduction: {(1 - X_train_selected.shape[1]/X_train.shape[1]) * 100:.1f}%")
print(f"Test Accuracy: {ga_accuracy:.4f} (vs {baseline_accuracy:.4f} baseline)")
print(f"Test F1-Score (weighted): {ga_f1:.4f} (vs {baseline_f1:.4f} baseline)")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_ga))

Selected features shape: (1503, 20)

GA-SELECTED FEATURES (ADJUSTED, MIN 20)
Number of features used: 20 (vs 49 baseline)
Feature reduction: 59.2%
Test Accuracy: 0.8564 (vs 0.9043 baseline)
Test F1-Score (weighted): 0.8559 (vs 0.9045 baseline)

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       226
           1       0.83      0.80      0.82       150

    accuracy                           0.86       376
   macro avg       0.85      0.85      0.85       376
weighted avg       0.86      0.86      0.86       376



## 5. Summary Comparison

In [39]:
print("\n" + "=" * 70)
print("FINAL COMPARISON: BASELINE vs GA FEATURE SELECTION")
print("=" * 70)
print(f"{'Metric':<30} {'Baseline':<20} {'GA-Selected':<20}")
print("-" * 70)
print(f"{'Number of Features':<30} {X_train.shape[1]:<20} {X_train_selected.shape[1]:<20}")
print(f"{'Feature Reduction (%)':<30} {0:<20} {(1 - X_train_selected.shape[1]/X_train.shape[1]) * 100:<20.1f}")
print(f"{'Test Accuracy':<30} {baseline_accuracy:<20.4f} {ga_accuracy:<20.4f}")
print(f"{'Test F1-Score':<30} {baseline_f1:<20.4f} {ga_f1:<20.4f}")
print(f"{'Accuracy Change':<30} {'':<20} {(ga_accuracy - baseline_accuracy):<+20.4f}")
print("=" * 70)

print("\n✓ GA Implementation Test Complete!")
print(f"✓ Successfully selected {np.sum(best_mask)} features")
print(f"✓ Achieved {(1 - np.sum(best_mask)/len(best_mask)) * 100:.1f}% feature reduction")
print(f"✓ Final fitness: {history['best_fitness'][-1]:.4f}")


FINAL COMPARISON: BASELINE vs GA FEATURE SELECTION
Metric                         Baseline             GA-Selected         
----------------------------------------------------------------------
Number of Features             49                   20                  
Feature Reduction (%)          0                    59.2                
Test Accuracy                  0.9043               0.8564              
Test F1-Score                  0.9045               0.8559              
Accuracy Change                                     -0.0479             

✓ GA Implementation Test Complete!
✓ Successfully selected 13 features
✓ Achieved 73.5% feature reduction
✓ Final fitness: 0.8254


## 6. Save Filtered Training Data to CSV

In [40]:
import pandas as pd

# Ensure output directory exists
output_dir = os.path.join('..', 'data', 'results')
os.makedirs(output_dir, exist_ok=True)

# Load original dataset
data_path = os.path.join('..', 'data', 'processed', 'cleaned_diabetes_data.csv')
original_data = pd.read_csv(data_path)

# Get the names of selected features using the adjusted mask
selected_feature_names = [feature_names[i] for i in range(len(best_mask_adjusted)) if best_mask_adjusted[i] == 1]

print(f"Selected features ({len(selected_feature_names)}): {selected_feature_names}")

# Create filtered dataset: selected features + Diagnosis target
filtered_columns = selected_feature_names + ['Diagnosis']
filtered_data = original_data[filtered_columns]

# Save the training data
training_data_path = os.path.join(output_dir, 'ga_selected_training_data.csv')
filtered_data.to_csv(training_data_path, index=False)

print(f"\n✓ Saved filtered training dataset to: {training_data_path}")
print(f"✓ Dataset shape: {filtered_data.shape[0]} rows × {filtered_data.shape[1]} columns")
print(f"✓ Columns: {len(selected_feature_names)} selected features + 1 target (Diagnosis)")

Selected features (20): ['Age', 'Gender', 'Ethnicity', 'SocioeconomicStatus', 'EducationLevel', 'BMI', 'Smoking', 'AlcoholConsumption', 'GestationalDiabetes', 'Hypertension', 'FastingBloodSugar', 'HbA1c', 'CholesterolTotal', 'AntihypertensiveMedications', 'Statins', 'AntidiabeticMedications', 'OccupationalExposureChemicals', 'WaterQuality', 'Polyuria_Polydipsia', 'BMI_Category']

✓ Saved filtered training dataset to: ..\data\results\ga_selected_training_data.csv
✓ Dataset shape: 1879 rows × 21 columns
✓ Columns: 20 selected features + 1 target (Diagnosis)
