# Part B - Regression with Default Hyperparameters

This notebook addresses questions 4-7 from the assignment:

4. **Baseline Model**: What is the simplest baseline model we should aim to beat?
5. **Model Training**: Train the 4 models with default hyperparameters for both pipelines
6. **Pipeline Comparison**: Which pipeline performed the best?
7. **Autograder Submission**: Submit work to check progress so far


In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Enable experimental features BEFORE importing IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning raised by later cells
# is hidden as well.
warnings.filterwarnings('ignore')

# Set random state for reproducibility
# RANDOM_STATE is passed to the estimators/transformers below that accept a
# random_state; the call below seeds NumPy's global RNG with the same value.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [None]:
# Read the training set, then separate the target ('whrswk') from the predictors
data = pd.read_csv('health_insurance_train.csv')
y = data['whrswk']
X = data.drop(columns=['whrswk'])

print(f"Dataset shape: {data.shape}")
print("Target variable statistics:")
print(y.describe())

# Column groups consumed by the preprocessing pipelines defined further down
numerical_feats = ['experience', 'kidslt6', 'kids618', 'husby']
categorical_feats = [
    'hhi', 'whi', 'hhi2', 'education', 'race', 'hispanic', 'region',
]

print(
    f"\nNumerical features: {numerical_feats}",
    f"Categorical features: {categorical_feats}",
    sep="\n",
)


## Question 4: What is the simplest baseline model we should aim to beat?

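**Answer:** The simplest baseline model is a **DummyRegressor** that always predicts the mean of the training targets. This represents what we would guess if we had to predict hours worked without knowing anything about the woman. (Note: because the metric is MAE, the constant that minimizes the error is the median rather than the mean, so a median-predicting dummy would be a slightly stronger constant baseline; the code below uses the mean.)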

**Why this is the right baseline:**
- It's the most naive possible prediction
- Any model that performs worse than this is essentially useless
- It provides a concrete performance floor to beat
- It's commonly used in machine learning as a sanity check


In [None]:
# Question 4: Calculate baseline performance
print("Question 4: Baseline Model Performance")
print("=" * 50)

# Constant predictor configured with the 'mean' strategy
dummy_regressor = DummyRegressor(strategy='mean')

# Score the baseline with 5-fold cross-validation on MAE -- the same protocol
# the real models are evaluated with
cv_scores_baseline = cross_val_score(
    dummy_regressor,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
# The scorer returns negated MAE, so flip the sign to get a positive error
baseline_mae = -cv_scores_baseline.mean()
baseline_std = cv_scores_baseline.std()

print(f"Baseline (DummyRegressor) MAE: {baseline_mae:.4f} (+/- {baseline_std * 2:.4f})")
print(f"Individual CV scores: {[-score for score in cv_scores_baseline]}")

# Mean of the full target column, reported for reference
mean_prediction = y.mean()
print(f"\nMean of target variable: {mean_prediction:.4f} hours/week")
print("This is what the dummy regressor always predicts")

print(f"\n✅ Any model with MAE < {baseline_mae:.4f} is better than random guessing")
print(f"✅ Any model with MAE >= {baseline_mae:.4f} is essentially useless")


## Question 5: Train the 4 models with default hyperparameters for both pipelines

**Models to train:**
1. KNN Regression
2. SGD Regression  
3. Random Forest Regression
4. Decision Tree Regression

**Pipelines to compare:**
1. **Pipeline 1 (Basic)**: SimpleImputer + StandardScaler + OneHotEncoder
2. **Pipeline 2 (Advanced)**: Feature Engineering + IterativeImputer + RobustScaler + QuantileTransformer + OneHotEncoder + Feature Selection (SelectKBest, k=15)

**Fair Performance Estimation:**
- **Cross-validation (5-fold)** for robust performance estimation
- **Same random state** for reproducible results
- **Same cross-validation folds** for every model and pipeline, for fair comparison
- **No hyperparameter tuning** - only default parameters


In [None]:
# Create Pipeline 1 (Basic Preprocessing)
def create_pipeline1():
    """Build the basic preprocessing transformer (Pipeline 1).

    Columns listed in ``numerical_feats`` are median-imputed and then
    standardized. Columns listed in ``categorical_feats`` are imputed with
    their most frequent value and one-hot encoded into a dense array;
    categories not seen while fitting are ignored at transform time.

    Returns:
        An unfitted ``ColumnTransformer`` combining both branches.
    """
    num_steps = [
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()),
    ]
    cat_steps = [
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=num_steps), numerical_feats),
            ('cat', Pipeline(steps=cat_steps), categorical_feats),
        ]
    )

# Create Pipeline 2 (Advanced Preprocessing) with Feature Engineering
def create_engineered_features(X):
    """Return a copy of ``X`` with hand-crafted features appended.

    The input frame is left untouched. It must contain the columns
    ``kidslt6``, ``kids618``, ``husby``, ``experience``, ``hhi``, ``whi``,
    ``hhi2``, ``education`` and ``region``.

    Added columns:
        total_kids: ``kidslt6 + kids618``.
        has_kids / has_young_kids: 0/1 flags for ``total_kids > 0`` and
            ``kidslt6 > 0``.
        husby_per_kid / experience_per_kid: ``husby`` and ``experience``
            divided by ``total_kids + 1`` (the +1 avoids division by zero).
        insurance_coverage: how many of ``hhi``, ``whi``, ``hhi2`` equal
            ``'yes'`` (0-3).
        education_encoded: ordinal code of ``education``; labels missing from
            the mapping become NaN.
        region_economic: hand-assigned score per ``region``; labels missing
            from the mapping become NaN.
        experience_category: ``experience`` binned into
            entry [0, 5], mid (5, 15], senior (15, 25], expert (25, 100];
            values outside [0, 100] become NaN.
        experience_education / husby_education: products with
            ``education_encoded``.

    Args:
        X: DataFrame holding the raw feature columns listed above.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    X_eng = X.copy()

    # Family structure features
    X_eng['total_kids'] = X_eng['kidslt6'] + X_eng['kids618']
    X_eng['has_kids'] = (X_eng['total_kids'] > 0).astype(int)
    X_eng['has_young_kids'] = (X_eng['kidslt6'] > 0).astype(int)

    # Work-life balance indicators (the +1 keeps the denominator non-zero)
    X_eng['husby_per_kid'] = X_eng['husby'] / (X_eng['total_kids'] + 1)
    X_eng['experience_per_kid'] = X_eng['experience'] / (X_eng['total_kids'] + 1)

    # Insurance coverage combinations: count of 'yes' answers across the three flags
    X_eng['insurance_coverage'] = (
        (X_eng['hhi'] == 'yes').astype(int)
        + (X_eng['whi'] == 'yes').astype(int)
        + (X_eng['hhi2'] == 'yes').astype(int)
    )

    # Education level encoding (ordinal).
    # NOTE(review): '<9years' was added as the lowest level on the assumption
    # that the data contains it (any unmapped label turns into NaN) -- confirm
    # against the dataset's actual education levels.
    education_mapping = {
        '<9years': 0,
        '9-11years': 1,
        '12years': 2,
        '13-15years': 3,
        '16years': 4,
        '>16years': 5,
    }
    X_eng['education_encoded'] = X_eng['education'].map(education_mapping)

    # Regional economic indicators (hand-assigned scores)
    region_economic = {
        'northeast': 4, 'northcentral': 3, 'south': 2, 'west': 3, 'other': 2
    }
    X_eng['region_economic'] = X_eng['region'].map(region_economic)

    # Experience categories. include_lowest=True closes the first bin on the
    # left; without it pd.cut uses (0, 5] and experience == 0 becomes NaN
    # instead of 'entry'.
    X_eng['experience_category'] = pd.cut(
        X_eng['experience'],
        bins=[0, 5, 15, 25, 100],
        labels=['entry', 'mid', 'senior', 'expert'],
        include_lowest=True,
    )

    # Interaction features
    X_eng['experience_education'] = X_eng['experience'] * X_eng['education_encoded']
    X_eng['husby_education'] = X_eng['husby'] * X_eng['education_encoded']

    return X_eng

def create_pipeline2():
    """Build the advanced preprocessing + feature-selection pipeline (Pipeline 2).

    Feature engineering is NOT a pipeline step: it is applied here, up front,
    to the module-level training frame ``X``. The returned pipeline therefore
    expects input that has already been passed through
    ``create_engineered_features``.

    Steps of the returned pipeline:
        1. ``preprocessor`` -- numerical columns go through IterativeImputer,
           RobustScaler and a QuantileTransformer mapping to a normal
           distribution; categorical columns are imputed with the most
           frequent value and one-hot encoded (dense output, unknown
           categories ignored).
        2. ``feature_selection`` -- keeps the 15 columns with the highest
           mutual information with the target.

    Returns:
        tuple: ``(full_pipeline, X_engineered)`` where ``full_pipeline`` is the
        unfitted preprocessing/selection ``Pipeline`` and ``X_engineered`` is
        ``X`` with the engineered columns added.
    """
    from functools import partial

    # Apply feature engineering
    X_engineered = create_engineered_features(X)

    # Update feature lists to include the engineered columns
    categorical_features_eng = ['hhi', 'whi', 'hhi2', 'education', 'race', 'hispanic', 'region', 'experience_category']
    numerical_features_eng = ['experience', 'kidslt6', 'kids618', 'husby', 'total_kids', 'has_kids', 'has_young_kids',
                             'husby_per_kid', 'experience_per_kid', 'insurance_coverage', 'education_encoded',
                             'region_economic', 'experience_education', 'husby_education']

    # Numerical preprocessing pipeline
    numerical_pipeline = Pipeline([
        ('imputer', IterativeImputer(random_state=RANDOM_STATE, max_iter=10)),
        ('scaler', RobustScaler()),
        ('quantile', QuantileTransformer(output_distribution='normal', random_state=RANDOM_STATE))
    ])

    # Categorical preprocessing pipeline
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine preprocessing
    preprocessor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features_eng),
        ('cat', categorical_pipeline, categorical_features_eng)
    ])

    # mutual_info_regression is randomized; pin its random_state so the set of
    # selected features does not change between runs or cell re-executions.
    seeded_mutual_info = partial(mutual_info_regression, random_state=RANDOM_STATE)

    # Full pipeline with feature selection
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=seeded_mutual_info, k=15)),
    ])

    return full_pipeline, X_engineered

print("Pipelines created successfully!")


In [None]:
# Question 5: Train and evaluate all models with both pipelines
print("Question 5: Training and Evaluating Models")
print("=" * 60)

# Instantiate both preprocessing variants (Pipeline 2 also returns the
# feature-engineered training frame it must be fed with)
pipeline1 = create_pipeline1()
pipeline2, X_engineered = create_pipeline2()

# The four regressors, all left at their default hyperparameters
models = {
    'KNN': KNeighborsRegressor(),
    'SGD': SGDRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
}

# Maps "<model>_<pipeline>" -> {'MAE', 'std', 'scores'}
results = {}

# One entry per preprocessing variant:
# (banner to print, results-key suffix, preprocessor, feature frame)
evaluation_runs = [
    ("Training models with Pipeline 1 (Basic)...", "Pipeline1", pipeline1, X),
    ("\nTraining models with Pipeline 2 (Advanced)...", "Pipeline2", pipeline2, X_engineered),
]

for run_banner, run_suffix, run_preprocessor, run_features in evaluation_runs:
    print(run_banner)
    for name, model in models.items():
        print(f"  Training {name}...")

        # Preprocessing + regressor, evaluated as a single estimator
        full_pipeline = Pipeline([
            ('preprocessor', run_preprocessor),
            ('regressor', model),
        ])

        # 5-fold cross-validation; the scorer yields negated MAE
        cv_scores = cross_val_score(
            full_pipeline, run_features, y, cv=5, scoring='neg_mean_absolute_error'
        )
        mae = -cv_scores.mean()
        std = cv_scores.std()

        results[f"{name}_{run_suffix}"] = {'MAE': mae, 'std': std, 'scores': cv_scores}

        print(f"    MAE: {mae:.4f} (+/- {std * 2:.4f})")


In [None]:
# Display comprehensive results
print("\n" + "=" * 80)
print("COMPREHENSIVE RESULTS: ALL MODELS AND PIPELINES")
print("=" * 80)
print(f"{'Model':<15} | {'Pipeline':<10} | {'MAE':<12} | {'Std':<12} | {'vs Baseline':<12}")
print("-" * 80)

# Sort results by MAE (ascending) so the best combination comes first
sorted_results = sorted(results.items(), key=lambda x: x[1]['MAE'])

for model_pipeline, metrics in sorted_results:
    # Keys look like "<model>_<pipeline>". Split on the LAST underscore only,
    # so a model name that itself contains "_" still unpacks into two parts.
    model_name, pipeline_name = model_pipeline.rsplit('_', 1)
    mae = metrics['MAE']
    std = metrics['std']

    # Relative MAE reduction versus the dummy baseline (positive = better)
    vs_baseline = ((baseline_mae - mae) / baseline_mae * 100)
    improvement = f"{vs_baseline:+.1f}%"

    print(f"{model_name:<15} | {pipeline_name:<10} | {mae:<12.4f} | {std:<12.4f} | {improvement:<12}")

print(f"\nBaseline (Dummy): {baseline_mae:.4f}")

# Find best overall model: first entry of the MAE-sorted list
best_model_pipeline = sorted_results[0][0]
best_mae = sorted_results[0][1]['MAE']
best_model, best_pipeline = best_model_pipeline.rsplit('_', 1)

print(f"\n🏆 BEST OVERALL: {best_model} with {best_pipeline}")
print(f"Best MAE: {best_mae:.4f}")
print(f"Improvement over baseline: {((baseline_mae - best_mae) / baseline_mae * 100):+.1f}%")


## Question 6: Which pipeline performed the best?

**Answer:** Based on the results above, we can determine which pipeline performed best by comparing:

1. **Best overall performance** - which model-pipeline combination achieved the lowest MAE
2. **Pipeline comparison** - which pipeline generally performed better across all models
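3. **Statistical significance** - whether the differences are meaningful, judged informally against the standard deviation of the cross-validation scores (no formal significance test is run here)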

**Analysis:**
- Look at the sorted results table above
- Compare Pipeline1 vs Pipeline2 performance for each model type
- Consider both the MAE values and the standard deviations
- The best pipeline should consistently outperform the other across multiple models


In [None]:
# Question 6: Detailed pipeline comparison
print("Question 6: Pipeline Comparison Analysis")
print("=" * 50)

# Compare pipelines for each model
pipeline1_wins = 0
pipeline2_wins = 0

print("Head-to-head comparison:")
print(f"{'Model':<15} | {'Pipeline1 MAE':<15} | {'Pipeline2 MAE':<15} | {'Winner':<10}")
print("-" * 70)

for model in models:
    p1_key = f"{model}_Pipeline1"
    p2_key = f"{model}_Pipeline2"

    p1_mae = results[p1_key]['MAE']
    p2_mae = results[p2_key]['MAE']

    # Lower MAE wins; an exact tie is counted for Pipeline2
    if p1_mae < p2_mae:
        winner = "Pipeline1"
        pipeline1_wins += 1
    else:
        winner = "Pipeline2"
        pipeline2_wins += 1

    print(f"{model:<15} | {p1_mae:<15.4f} | {p2_mae:<15.4f} | {winner:<10}")

# Derive the denominator from the model dict instead of hard-coding 4, so the
# tally stays correct if models are added or removed.
n_models = len(models)
print(f"\nPipeline1 wins: {pipeline1_wins}/{n_models}")
print(f"Pipeline2 wins: {pipeline2_wins}/{n_models}")

# Calculate average improvement
pipeline1_models = [f"{model}_Pipeline1" for model in models]
pipeline2_models = [f"{model}_Pipeline2" for model in models]

avg_p1_mae = np.mean([results[model]['MAE'] for model in pipeline1_models])
avg_p2_mae = np.mean([results[model]['MAE'] for model in pipeline2_models])

print(f"\nAverage MAE across all models:")
print(f"Pipeline1: {avg_p1_mae:.4f}")
print(f"Pipeline2: {avg_p2_mae:.4f}")

# The improvement is expressed relative to the losing pipeline's average MAE.
# Note: this rebinds best_pipeline to the pipeline with the lower AVERAGE MAE,
# which may differ from the pipeline of the single best model-pipeline pair.
if avg_p2_mae < avg_p1_mae:
    improvement = ((avg_p1_mae - avg_p2_mae) / avg_p1_mae * 100)
    print(f"✅ Pipeline2 is better by {improvement:.1f}% on average")
    best_pipeline = "Pipeline2"
else:
    improvement = ((avg_p2_mae - avg_p1_mae) / avg_p2_mae * 100)
    print(f"✅ Pipeline1 is better by {improvement:.1f}% on average")
    best_pipeline = "Pipeline1"

print(f"\n🏆 WINNER: {best_pipeline}")
print(f"Best pipeline will be used for the next exercises.")


## Question 7: Submit work to the autograder

**Answer:** We will create a submission file using the best performing model-pipeline combination, i.e. the one with the lowest cross-validated MAE in the Question 5 results table.

**Submission process:**
1. Load the autograder data
2. Apply the best preprocessing pipeline
3. Train the best model on the full dataset
4. Make predictions on autograder data
5. Create submission file with MAE estimate and predictions


In [None]:
# Question 7: Create autograder submission
print("Question 7: Creating Autograder Submission")
print("=" * 50)

# Load autograder data
print("Loading autograder data...")
data_autograder = pd.read_csv('health_insurance_autograde.csv')
print(f"Autograder data shape: {data_autograder.shape}")

# Determine best model and pipeline. sorted_results is ordered by ascending
# cross-validated MAE, so its first entry is the winning "<model>_<pipeline>"
# pair. Split on the LAST underscore only, so a model name containing "_"
# still unpacks into exactly two parts.
best_model_pipeline = sorted_results[0][0]
best_model, best_pipeline = best_model_pipeline.rsplit('_', 1)
best_mae = sorted_results[0][1]['MAE']

print(f"Using best model: {best_model} with {best_pipeline}")
print(f"Expected MAE: {best_mae:.4f}")

# Pick the preprocessor and the training frame that belong together:
# Pipeline2 must be fed the feature-engineered frame.
if best_pipeline == "Pipeline1":
    best_preprocessor = pipeline1
    X_for_training = X
else:
    best_preprocessor = pipeline2
    X_for_training = X_engineered

# Get the best model
best_model_instance = models[best_model]

# Create full pipeline
final_pipeline = Pipeline([
    ('preprocessor', best_preprocessor),
    ('regressor', best_model_instance)
])

# Train on full dataset
print("Training final model on full dataset...")
final_pipeline.fit(X_for_training, y)

# Prepare autograder data: apply the same feature engineering that the
# training frame received when Pipeline2 is used.
if best_pipeline == "Pipeline2":
    X_autograder_processed = create_engineered_features(data_autograder)
else:
    X_autograder_processed = data_autograder

# Make predictions
print("Making predictions on autograder data...")
predictions = final_pipeline.predict(X_autograder_processed)

print(f"Predictions shape: {predictions.shape}")
print(f"Prediction statistics:")
print(f"  Min: {predictions.min():.4f}")
print(f"  Max: {predictions.max():.4f}")
print(f"  Mean: {predictions.mean():.4f}")
print(f"  Std: {predictions.std():.4f}")

# Create submission file: the first value is the MAE estimate (the CV MAE of
# the chosen combination), followed by one prediction per autograder row.
estimate_MAE_on_new_data = np.array([best_mae])
predictions_autograder_data = predictions

result = np.append(estimate_MAE_on_new_data, predictions_autograder_data)
pd.DataFrame(result).to_csv("autograder_submission_partB.txt", index=False, header=False)

print(f"\n✅ Submission file created: 'autograder_submission_partB.txt'")
print(f"File contains {len(result)} values (1 MAE estimate + {len(predictions)} predictions)")
print(f"MAE estimate: {best_mae:.4f}")
print(f"Number of predictions: {len(predictions)}")

# Verify submission file by reading it back
submission_check = pd.read_csv("autograder_submission_partB.txt", header=None)
print(f"\nVerification:")
print(f"Submission file shape: {submission_check.shape}")
print(f"First value (MAE estimate): {submission_check.iloc[0, 0]:.4f}")
print(f"Last few predictions: {submission_check.tail().values.flatten()}")

print(f"\n🎯 Ready for autograder submission!")
print(f"Best model: {best_model}")
print(f"Best pipeline: {best_pipeline}")
print(f"Expected MAE: {best_mae:.4f}")
