In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
import joblib
from datasets import load_dataset
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# === PART 1: Initial Model Analysis ===

print("=" * 60)
print("PART 1: INITIAL REGRESSION ANALYSIS")
print("=" * 60)

# Starting with the synthetic data generated earlier.
print("\n1. Using the generated synthetic dataset for initial analysis...")

# Pick up the synthetic DataFrame created by the data-generation cell.
# Only the name lookup lives inside the `try`: an unrelated NameError raised
# while reporting (e.g. `display` missing outside IPython) must not be
# misreported as "df_synth is not defined".
try:
    df = df_synth
except NameError as exc:
    print("Error: `df_synth` not found. Make sure the synthetic data generation cell ran first.")
    # Can't continue without data.
    raise NameError("df_synth is not defined. Please run the synthetic data generation cell first.") from exc
else:
    print("Synthetic dataset loaded successfully!")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nFirst few rows:")
    display(df.head())


# Prepare features (X) and target (y). Using the two features for now.
X = df[['feature_1', 'feature_2']].values
y = df['target'].values


print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Set the final test rows aside BEFORE any model comparison.
# This call uses the same arrays, test_size and random_state as the Part 2
# hold-out split, so it removes exactly the rows that later become X_test.
# Splitting X directly here (as was done before) made the Part 1 "validation"
# set identical to the final test set, leaking it into model selection.
X_dev_init, _X_holdout_init, y_dev_init, _y_holdout_init = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split the remaining development rows into training and validation sets (80/20 split).
# Fixed random_state for reproducible splits.
X_train_init, X_val_init, y_train_init, y_val_init = train_test_split(
    X_dev_init, y_dev_init, test_size=0.2, random_state=42
)

print(f"\nInitial split sizes:")
print(f"Training set: {X_train_init.shape}")
print(f"Validation set: {X_val_init.shape}")

# Candidate regressors for the initial comparison.
def _make_poly_pipeline(degree):
    """Build a pipeline: polynomial expansion -> standardization -> least squares."""
    return Pipeline([
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
        ('scaler', StandardScaler()),  # scaling happens after the expansion
        ('linear', LinearRegression()),
    ])


def _regression_scores(y_true, y_pred):
    """Return (MSE, RMSE, R2) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return mse, np.sqrt(mse), r2_score(y_true, y_pred)


models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),  # default regularization strength
    'Polynomial (degree=2)': _make_poly_pipeline(2),
    'Polynomial (degree=4)': _make_poly_pipeline(4),
    'Polynomial (degree=10)': _make_poly_pipeline(10),  # very high degree - expecting overfitting
}

# Fit every candidate and record its train/validation metrics.
print("\n2. Training models and calculating metrics...")
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    if 'Polynomial' in name:
        # The pipeline carries its own scaler, so it is fed the raw features.
        scaler = None
        train_inputs, val_inputs = X_train_init, X_val_init
    else:
        # Stand-alone models get an external scaler fitted on the training rows only.
        scaler = StandardScaler()
        train_inputs = scaler.fit_transform(X_train_init)
        val_inputs = scaler.transform(X_val_init)

    model.fit(train_inputs, y_train_init)
    y_train_pred = model.predict(train_inputs)
    y_val_pred = model.predict(val_inputs)

    train_mse, train_rmse, train_r2 = _regression_scores(y_train_init, y_train_pred)
    val_mse, val_rmse, val_r2 = _regression_scores(y_val_init, y_val_pred)

    # 'scaler' is None for pipelines (their scaler lives inside 'model').
    results[name] = {
        'model': model,
        'scaler': scaler,
        'train_mse': train_mse,
        'val_mse': val_mse,
        'train_rmse': train_rmse,
        'val_rmse': val_rmse,
        'train_r2': train_r2,
        'val_r2': val_r2
    }

    print(f"  Train MSE: {train_mse:.4f}, Val MSE: {val_mse:.4f}")
    print(f"  Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")
    print(f"  Train R2: {train_r2:.4f}, Val R2: {val_r2:.4f}")

# Visualize the results: one panel per metric, training vs validation.
print("\n3. Creating visualizations...")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Model Performance Comparison (Initial Analysis)', fontsize=16, fontweight='bold')

model_names = list(results.keys())
x_pos = np.arange(len(model_names))  # one x position per model

# Pull each metric out of `results` in model order.
(train_mse_values, val_mse_values,
 train_rmse_values, val_rmse_values,
 train_r2_values, val_r2_values) = (
    [results[name][key] for name in model_names]
    for key in ('train_mse', 'val_mse', 'train_rmse', 'val_rmse', 'train_r2', 'val_r2')
)

# Shortened tick labels shared by all three panels.
tick_labels = [name.replace(' Regression', '').replace('Polynomial ', 'Poly ')
               for name in model_names]

# (axis, training series, validation series, y-label, title)
panels = (
    (axes[0], train_mse_values, val_mse_values, 'MSE', 'Mean Squared Error'),
    (axes[1], train_rmse_values, val_rmse_values, 'RMSE', 'Root Mean Squared Error'),
    (axes[2], train_r2_values, val_r2_values, 'R² Score', 'R² Score'),
)

for ax, train_series, val_series, y_label, panel_title in panels:
    ax.plot(x_pos, train_series, 'o-', linewidth=2, markersize=8,
            label='Training', color='#2E86AB')
    ax.plot(x_pos, val_series, 's-', linewidth=2, markersize=8,
            label='Validation', color='#A23B72')
    ax.set_xlabel('Model', fontweight='bold')
    ax.set_ylabel(y_label, fontweight='bold')
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Only the R² panel gets a fixed y-range.
axes[2].set_ylim(0, 1.05)

plt.tight_layout()
plt.show()

# Pick the initial winner: lowest validation MSE among models that are not flagged as overfitting.
print("\n4. Selecting best model based on initial analysis...")
# Heuristic thresholds for the overfitting flag (can be tweaked).
overfitting_threshold_ratio = 0.1
overfitting_threshold_abs = 0.5

best_model_name = None
best_val_mse = float('inf')
initial_best_val_mse = float('inf')

for name, metrics in results.items():
    train_mse, val_mse = metrics['train_mse'], metrics['val_mse']

    if train_mse > 1e-9:
        # Relative gap between validation and training error.
        relative_gap = (val_mse - train_mse) / train_mse
        is_overfitting = val_mse > train_mse and relative_gap > overfitting_threshold_ratio
    else:
        # Training error is effectively zero: fall back to an absolute validation threshold.
        is_overfitting = val_mse > overfitting_threshold_abs

    # Skip models that are flagged or that do not strictly improve on the current best.
    if is_overfitting or not val_mse < best_val_mse:
        continue

    best_model_name = name
    best_val_mse = initial_best_val_mse = val_mse

# Every model was flagged: fall back to the plain lowest validation MSE.
if best_model_name is None:
    print("Warning: Overfitting detected in all models based on current thresholds. Selecting model with lowest validation MSE.")
    best_model_name = min(results.keys(), key=lambda k: results[k]['val_mse'])
    initial_best_val_mse = results[best_model_name]['val_mse']

print(f"\nBest model from initial analysis: {best_model_name}")
print(f"Validation MSE: {results[best_model_name]['val_mse']:.4f}")
print(f"Validation RMSE: {results[best_model_name]['val_rmse']:.4f}")
print(f"Validation R2: {results[best_model_name]['val_r2']:.4f}")

# Perform cross-validation on the best model using the initial training data for a more robust estimate.
print(f"\n5. Performing 5-fold cross-validation on {best_model_name} using initial training data...")

best_model = results[best_model_name]['model']
best_scaler = results[best_model_name]['scaler']

# Scaling must be re-fitted inside every fold. Transforming the whole training
# set with a scaler fitted on all of it (as was done before) lets each fold's
# held-out rows influence the preprocessing. Wrapping scaler + model in a
# Pipeline makes cross_val_score fit the scaler on the fold's training part only.
# cross_val_score clones the estimator, so `best_model` itself is not refitted.
if best_scaler is not None:
    # Stand-alone model that was trained on externally scaled features.
    cv_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', best_model),
    ])
else:
    # Polynomial pipelines already contain their own scaler.
    cv_estimator = best_model

# Name kept from the earlier version of this step; it now always holds the raw
# training features because scaling happens inside `cv_estimator`.
X_train_init_scaled_for_cv = X_train_init


kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# cross_val_score returns negative MSE, so negate it.
cv_scores = -cross_val_score(cv_estimator, X_train_init_scaled_for_cv, y_train_init, cv=kfold,
                             scoring='neg_mean_squared_error')

print(f"Cross-validation MSE scores (on initial training data): {cv_scores}")
print(f"Mean CV MSE: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Baseline comparison: a small multi-layer perceptron on the same initial split.
print("\n6. Training Neural Network for comparison (using initial train/val split)...")

# Standardize inputs; the scaler is fitted on the training rows only.
scaler_nn = StandardScaler().fit(X_train_init)
X_train_nn = scaler_nn.transform(X_train_init)
X_val_nn = scaler_nn.transform(X_val_init)


nn_settings = dict(
    hidden_layer_sizes=(100, 50, 25),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
)
nn_model = MLPRegressor(**nn_settings)
nn_model.fit(X_train_nn, y_train_init)

y_train_pred_nn, y_val_pred_nn = nn_model.predict(X_train_nn), nn_model.predict(X_val_nn)

# Training metrics.
nn_train_mse = mean_squared_error(y_train_init, y_train_pred_nn)
nn_train_rmse = np.sqrt(nn_train_mse)
nn_train_r2 = r2_score(y_train_init, y_train_pred_nn)

# Validation metrics.
nn_val_mse = mean_squared_error(y_val_init, y_val_pred_nn)
nn_val_rmse = np.sqrt(nn_val_mse)
nn_val_r2 = r2_score(y_val_init, y_val_pred_nn)

print(f"\nNeural Network Results (Initial Train/Val):")
print(f"  Train MSE: {nn_train_mse:.4f}, Val MSE: {nn_val_mse:.4f}")
print(f"  Train RMSE: {nn_train_rmse:.4f}, Val RMSE: {nn_val_rmse:.4f}")
print(f"  Train R2: {nn_train_r2:.4f}, Val R2: {nn_val_r2:.4f}")

# Side-by-side with the best traditional model from the selection step.
print(f"\nNN Val MSE vs {best_model_name} Val MSE:")
print(f"  {best_model_name} Val MSE: {results[best_model_name]['val_mse']:.4f}")
print(f"  Neural Network Val MSE: {nn_val_mse:.4f}")
print(f"  Difference (NN - Best Model): {nn_val_mse - results[best_model_name]['val_mse']:.4f}")

# Part 2 switches to a three-way train / validation / test split so that
# tuning and the final evaluation use separate data.

print("\n" + "=" * 60)
print("PART 2: PROFESSIONAL TRAIN-VAL-TEST SPLIT AND FINAL MODEL")
print("=" * 60)

# Step 7: split the full dataset into 60% train, 20% validation, 20% test.
print("\n7. Creating professional train-val-test split...")

feature_columns = ['feature_1', 'feature_2']
X_full = df[feature_columns].to_numpy()
y_full = df['target'].to_numpy()


# First cut: 20% of all rows become the final test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_full, y_full,
    test_size=0.2,
    random_state=42,
)

# Second cut: a quarter of the remaining 80% (= 20% of the total) becomes validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval,
    test_size=0.25,
    random_state=42,
)

print(f"Final split sizes:")
print(f"  Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X_full)*100:.1f}%)")
print(f"  Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X_full)*100:.1f}%)")
print(f"  Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X_full)*100:.1f}%) - Only used ONCE at the very end!")
print(f"  Total: {len(X_full)} samples")

# Step 8: sweep Ridge's alpha on the professional train/validation split.
print("\n8. Ridge Regression Hyperparameter Tuning (using professional train/val split)...")

# Candidate regularization strengths.
alphas = [0.01, 0.1, 1, 2, 4, 10, 20, 100]
# One list per recorded quantity; index i corresponds to alphas[i].
ridge_results = {
    key: []
    for key in ('alpha', 'train_mse', 'val_mse', 'train_rmse', 'val_rmse', 'train_r2', 'val_r2')
}

# Standardize once: fit on the training rows, reuse for validation.
scaler_ridge_tuning = StandardScaler()
X_train_scaled_ridge = scaler_ridge_tuning.fit_transform(X_train)
X_val_scaled_ridge = scaler_ridge_tuning.transform(X_val)


for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train_scaled_ridge, y_train)

    y_train_pred = ridge_model.predict(X_train_scaled_ridge)
    y_val_pred = ridge_model.predict(X_val_scaled_ridge)

    train_mse = mean_squared_error(y_train, y_train_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)

    # Collect this alpha's metrics, then append them column by column.
    sweep_row = {
        'alpha': alpha,
        'train_mse': train_mse,
        'val_mse': val_mse,
        'train_rmse': np.sqrt(train_mse),
        'val_rmse': np.sqrt(val_mse),
        'train_r2': r2_score(y_train, y_train_pred),
        'val_r2': r2_score(y_val, y_val_pred),
    }
    for key, value in sweep_row.items():
        ridge_results[key].append(value)

    print(f"Alpha={alpha}: Val MSE={val_mse:.4f}, Val R²={ridge_results['val_r2'][-1]:.4f}")

# Plot each metric against alpha (log-scaled x axis).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Ridge Regression Hyperparameter Tuning (Professional Split)', fontsize=16, fontweight='bold')

# (training key, validation key, y-label, title, training colour, validation colour)
tuning_panels = (
    ('train_mse', 'val_mse', 'MSE', 'Mean Squared Error vs Alpha', '#3498db', '#e74c3c'),
    ('train_rmse', 'val_rmse', 'RMSE', 'Root Mean Squared Error vs Alpha', '#9b59b6', '#f39c12'),
    ('train_r2', 'val_r2', 'R² Score', 'R² Score vs Alpha', '#2ecc71', '#e67e22'),
)

for ax, (train_key, val_key, y_label, panel_title, train_color, val_color) in zip(axes, tuning_panels):
    ax.plot(ridge_results['alpha'], ridge_results[train_key],
            'o-', label='Training', linewidth=2.5, markersize=8, color=train_color)
    ax.plot(ridge_results['alpha'], ridge_results[val_key],
            's-', label='Validation', linewidth=2.5, markersize=8, color=val_color)
    ax.set_xlabel('Alpha', fontweight='bold')
    ax.set_ylabel(y_label, fontweight='bold')
    ax.set_title(panel_title, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)
    if val_key == 'val_r2':
        # Only the R² panel gets a fixed y-range.
        ax.set_ylim(0, 1.05)
    ax.set_xscale('log')

plt.tight_layout()
plt.show()

# The winning alpha is the one with the smallest validation MSE.
best_alpha_idx = np.argmin(ridge_results['val_mse'])
best_alpha = ridge_results['alpha'][best_alpha_idx]
ridge_tuned_best_val_mse = ridge_results['val_mse'][best_alpha_idx]
print(f"\nBest Alpha for Ridge (based on Professional Validation MSE): {best_alpha}")
print(f"Best Ridge Validation MSE: {ridge_tuned_best_val_mse:.4f}")


# Step 9: Train the final model.
# Decide between the Part 1 winner and the tuned Ridge, then refit the winner
# on the combined training + validation rows of the professional split.
print("\n9. Training final model on professional split...")

# Start with the best model from the initial analysis.
final_best_model_name = best_model_name
final_best_model_obj = results[best_model_name]['model']

# Re-score the Part 1 winner on the SAME train/validation split that Ridge was
# tuned on. Comparing the tuned Ridge's validation MSE with the MSE the initial
# model obtained on a different validation set (and after training on different
# rows) is not a like-for-like comparison.
# `clone` gives an unfitted copy, so the fitted Part 1 model is left untouched here.
initial_candidate = clone(final_best_model_obj)
if isinstance(initial_candidate, Pipeline):
    # Pipelines scale internally and take raw features.
    initial_candidate.fit(X_train, y_train)
    initial_candidate_val_pred = initial_candidate.predict(X_val)
else:
    candidate_scaler = StandardScaler()
    initial_candidate.fit(candidate_scaler.fit_transform(X_train), y_train)
    initial_candidate_val_pred = initial_candidate.predict(candidate_scaler.transform(X_val))
initial_best_prof_val_mse = mean_squared_error(y_val, initial_candidate_val_pred)
print(f"{best_model_name} validation MSE on the professional split: {initial_best_prof_val_mse:.4f}")

# Tuned Ridge becomes the final model only if it beats the Part 1 winner on
# that shared validation set.
if ridge_tuned_best_val_mse < initial_best_prof_val_mse:
    print(f"Ridge with alpha={best_alpha} from tuning performed better than the initially selected best model ({best_model_name}). Selecting Ridge as the final model.")
    final_best_model_name = f"Ridge (alpha={best_alpha})"
    final_best_model_obj = Ridge(alpha=best_alpha) # Create the final Ridge model
else:
    print(f"Initially selected best model ({best_model_name}) is still the champion after comparing against tuned Ridge.")


print(f"Training final model: {final_best_model_name}...")

# Retrain the chosen model on the combined training + validation data from the professional split.
print("Retraining final best model on combined training and validation data...")
X_train_val = np.vstack((X_train, X_val))
y_train_val = np.concatenate((y_train, y_val))

# Fresh scaler for the combined data; only used by non-pipeline models.
scaler_final = StandardScaler()

if isinstance(final_best_model_obj, Pipeline):
    # Polynomial pipelines are fitted end to end on unscaled data.
    print("Fitting Polynomial Pipeline on unscaled train+val data...")
    final_best_model_obj.fit(X_train_val, y_train_val)
    # The pipeline's scaler sits after the 'poly' step, so it was fitted on the
    # expanded polynomial features, not on the raw feature columns.
    final_scaler_to_save = final_best_model_obj.named_steps['scaler']
else:
    # Stand-alone models: scale the combined data first, then fit.
    print(f"Fitting {final_best_model_name} on scaled train+val data...")
    X_train_val_scaled = scaler_final.fit_transform(X_train_val) # Fit scaler on combined data
    final_best_model_obj.fit(X_train_val_scaled, y_train_val)
    final_scaler_to_save = scaler_final # This is the scaler to save

# Step 10: persist the fitted model and its scaler with joblib.
print("\n10. Saving the final model and scaler...")
for artifact, artifact_path in (
    (final_best_model_obj, 'final_regression_model.pkl'),
    (final_scaler_to_save, 'final_scaler.pkl'),
):
    joblib.dump(artifact, artifact_path)
print("Model saved as 'final_regression_model.pkl'")
print("Scaler saved as 'final_scaler.pkl'")


# Step 11: Load the saved model and evaluate on the held-out test set.
# This provides an unbiased performance estimate. Use the test set ONLY ONCE here.
print("\n11. Loading model and testing on the held-out test set...")
# NOTE: joblib.load unpickles arbitrary objects; only load files written by
# this notebook or another trusted source.
loaded_model = joblib.load('final_regression_model.pkl')
loaded_scaler = joblib.load('final_scaler.pkl')

# Make predictions on the test data. Handle pipelines vs standalone models.
if isinstance(loaded_model, Pipeline) and 'poly' in loaded_model.named_steps:
    # The pipeline expands and scales internally, so it takes the raw test
    # features. The separately saved scaler must NOT be applied here: it was
    # fitted on the polynomial-expanded features, and calling transform on the
    # raw X_test raises a feature-count ValueError.
    print("Predicting using the loaded Polynomial Pipeline on unscaled test data...")
    y_test_pred = loaded_model.predict(X_test) # Predict on unscaled X_test
else:
    # Stand-alone models were trained on scaled features: apply the scaler
    # fitted on the combined train+val data (transform only, never refit).
    X_test_final_scaled = loaded_scaler.transform(X_test)
    print(f"Predicting using the loaded {final_best_model_name} on scaled test data...")
    y_test_pred = loaded_model.predict(X_test_final_scaled) # Predict on scaled X_test


# Evaluate performance on the test set.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print("\n" + "=" * 60)
print("FINAL TEST SET RESULTS (Unbiased Evaluation)")
print("=" * 60)
print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R2: {test_r2:.4f}")

# Step 12: plot the initial best model and the NN (validation metrics) next to
# the final model (test metrics).
print("\n12. Creating final comparison visualization...")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Final Model Comparison: Initial Best vs Neural Network (Val) vs Final Test Result',
             fontsize=16, fontweight='bold')

# Tick labels, in the same order as the value lists below.
models_comparison = ['Initial Best\n(' + best_model_name + ')', 'Neural Network\n(Val)', 'Final Model\n(' + final_best_model_name + ')']
initial_best_metrics = results[best_model_name]
metrics_comparison = {
    'MSE': [initial_best_metrics['val_mse'], nn_val_mse, test_mse],
    'RMSE': [initial_best_metrics['val_rmse'], nn_val_rmse, test_rmse],
    'R2': [initial_best_metrics['val_r2'], nn_val_r2, test_r2]
}

x_pos = np.arange(len(models_comparison))

# (metric key, y-label, title, line colour)
comparison_panels = (
    ('MSE', 'MSE', 'Mean Squared Error', '#2E86AB'),
    ('RMSE', 'RMSE', 'Root Mean Squared Error', '#A23B72'),
    ('R2', 'R² Score', 'R² Score', '#55A868'),
)

for ax, (metric_key, y_label, panel_title, line_color) in zip(axes, comparison_panels):
    ax.plot(x_pos, metrics_comparison[metric_key], marker='o', linestyle='-',
            linewidth=2, markersize=8, color=line_color)
    ax.set_xlabel('Model', fontweight='bold')
    ax.set_ylabel(y_label, fontweight='bold')
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(models_comparison)
    if metric_key == 'R2':
        # Only the R² panel gets a fixed y-range.
        ax.set_ylim(0, 1.05)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


# === CONCLUSIONS ===
# Text summary of the run. Every statement below is derived from values
# computed earlier in this cell rather than asserted unconditionally.

print("\n" + "=" * 60)
print("CONCLUSIONS AND INSIGHTS")
print("=" * 60)

print("\n1. Model Performance Summary:")
print(f"   - Initial Best Traditional Model (Val Set): {best_model_name}, MSE: {results[best_model_name]['val_mse']:.4f}, R²: {results[best_model_name]['val_r2']:.4f}")
print(f"   - Neural Network (Val Set): MSE: {nn_val_mse:.4f}, R²: {nn_val_r2:.4f}")
print(f"   - Final Selected Model ({final_best_model_name}) (Held-out Test Set): MSE: {test_mse:.4f}, R²: {test_r2:.4f}")


print("\n2. Key Observations:")
# Compare initial best vs NN validation.
if results[best_model_name]['val_mse'] < nn_val_mse:
    print(f"   - Initial validation showed the traditional model ({best_model_name}) slightly outperformed the neural network.")
else:
    print(f"   - Initial validation showed the neural network slightly outperformed the traditional model ({best_model_name}).")

# The final model's name only differs from the initial best model's name when
# the tuned Ridge replaced it in Step 9.
tuned_ridge_selected = final_best_model_name != best_model_name

# Compare the test result with the validation estimate of the same model
# instead of claiming good generalization unconditionally.
if tuned_ridge_selected:
    final_reference_val_mse = ridge_tuned_best_val_mse
else:
    final_reference_val_mse = results[best_model_name]['val_mse']

if final_reference_val_mse > 0:
    test_gap_ratio = (test_mse - final_reference_val_mse) / final_reference_val_mse
else:
    # Degenerate zero validation error: any positive test error is an unbounded gap.
    test_gap_ratio = float('inf') if test_mse > 0 else 0.0

# Reuse the notebook's own overfitting tolerance for "similar" performance.
if abs(test_gap_ratio) <= overfitting_threshold_ratio:
    print(f"   - The final model ({final_best_model_name}) performed similarly on the unseen test set, indicating good generalization.")
else:
    print(f"   - The final model ({final_best_model_name}) test MSE ({test_mse:.4f}) differs from its validation MSE ({final_reference_val_mse:.4f}) by {test_gap_ratio:+.1%}; treat the validation estimate with caution.")


print("\n3. Overfitting Analysis (Initial models):")
# Same rule as the Part 1 selection step: relative gap when the training error
# is non-negligible, absolute validation threshold otherwise.
for name, metrics in results.items():
    train_mse = metrics['train_mse']
    val_mse = metrics['val_mse']
    if train_mse > 1e-9:
        if val_mse > train_mse and (val_mse - train_mse) / train_mse > overfitting_threshold_ratio:
            print(f"   - {name}: Showed signs of overfitting.")
        else:
            print(f"   - {name}: Appeared balanced.")
    elif val_mse > overfitting_threshold_abs:
        print(f"   - {name}: Potential overfitting detected.")
    else:
        print(f"   - {name}: Appeared balanced.")


print("\n4. Ridge Hyperparameter Tuning Insight:")
print(f"   - Tuning Ridge on the professional split identified an optimal alpha of {best_alpha}.")
# A substring test on 'Ridge' would also match the untuned 'Ridge Regression'
# model from Part 1 and wrongly credit the tuned alpha.
if tuned_ridge_selected:
    print("   - Using this tuned alpha led to Ridge being selected as the final model.")
else:
    print(f"   - Ridge tuning was done, but {final_best_model_name} was chosen as the final best model based on validation performance.")


print("\n5. Recommendations:")
print(f"   - This analysis used synthetic data. Real-world data will require more feature engineering and exploration.")
print(f"   - The chosen final model ({final_best_model_name}) worked well for this dataset.")
print("   - For new real-world problems: repeat this process - load data, engineer features, explore models, and tune hyperparameters.")
print(f"   - The 'final_regression_model.pkl' and 'final_scaler.pkl' files are ready for predicting on new data.")

print("\n" + "=" * 60)
print("ANALYSIS COMPLETE")
print("=" * 60)

In [None]:

# IMPORTANT: RUN THIS CELL FIRST TO CREATE THE SYNTHETIC DATASET (df_synth)!
# The regression analysis cell reads `df_synth` and raises if it is missing.


print("Generating the synthetic dataset with features and a target...")

# Sample count.
n_samples = 890

# Seed NumPy's global generator so the draws below are reproducible.
np.random.seed(42)

# Two features, each uniform on [0, 2).
X_synth = 2 * np.random.rand(n_samples, 2)

# Linear target with standard-normal noise: y = 4 + 3*x1 + 5*x2 + noise.
x1, x2 = X_synth.T
noise = np.random.randn(n_samples)
y_synth = 4 + 3 * x1 + 5 * x2 + noise

# Assemble features and target into one DataFrame.
df_synth = pd.DataFrame(X_synth, columns=['feature_1', 'feature_2']).assign(target=y_synth)

print(f"Synthetic dataset shape: {df_synth.shape}")
print(f"Synthetic dataset columns: {df_synth.columns.tolist()}")
print("\nFirst few rows of the synthetic dataset:")
display(df_synth.head())

# Once this cell has run successfully, run the main regression analysis cell.