In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# If you're working with real housing data, use this approach:
def create_robust_income_categories(df, income_column='median_income'):
    """
    Bucket rows into quantile-based income categories for stratified sampling.

    Rows whose ``income_column`` is missing are dropped. The remaining values
    are split at the 0/20/40/60/80/100th percentiles and the bin number
    (starting at 1) is stored in a new ``income_cat`` column. Duplicate
    percentile edges are collapsed, so heavily tied data yields fewer than
    five categories. If fewer than two distinct edges remain (all incomes are
    identical, or no rows are left), every remaining row is placed in a
    single category ``1`` instead of being handed to ``pd.cut``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; the function works on a copy.
    income_column : str, default 'median_income'
        Name of the numeric column to categorize.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the missing-income rows, with an added ordered
        categorical ``income_cat`` column.

    Notes
    -----
    Prints the income range, the bin edges and the number of categories as a
    diagnostic side effect.
    """
    # Remove rows with missing income data (copy so the caller's frame is untouched)
    df_clean = df.dropna(subset=[income_column]).copy()

    # Check the actual range of your data
    print(f"Income range: {df_clean[income_column].min():.2f} to {df_clean[income_column].max():.2f}")

    # Create bins based on percentiles for more robust categorization
    percentiles = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
    bin_edges = df_clean[income_column].quantile(percentiles).values

    # Discard NaN edges (an empty column yields all-NaN quantiles) and
    # collapse duplicate edges caused by tied values
    bin_edges = np.unique(bin_edges[~pd.isna(bin_edges)])

    # pd.cut needs at least two distinct edges to form an interval
    can_cut = len(bin_edges) >= 2

    # Create labels (one per interval; a single label for the degenerate case)
    labels = list(range(1, len(bin_edges))) if can_cut else [1]

    print(f"Bin edges: {bin_edges}")
    print(f"Number of categories: {len(labels)}")

    # Create categories
    if can_cut:
        df_clean['income_cat'] = pd.cut(df_clean[income_column],
                                       bins=bin_edges,
                                       labels=labels,
                                       include_lowest=True)
    else:
        # All remaining incomes are identical (or there are no rows):
        # one ordered category, matching the dtype pd.cut would produce
        df_clean['income_cat'] = pd.Categorical([1] * len(df_clean),
                                                categories=labels,
                                                ordered=True)

    # Final check for NaN values
    nan_count = df_clean['income_cat'].isna().sum()
    if nan_count > 0:
        print(f"Warning: {nan_count} NaN values found. Removing them.")
        df_clean = df_clean.dropna(subset=['income_cat'])

    return df_clean

# Demo on synthetic data -- swap in your real housing DataFrame here
np.random.seed(42)
n_samples = 1000

# Synthetic housing columns; they are drawn in this fixed order so the
# seeded random stream always produces the same dataset
housing_data = dict(
    median_house_value=np.random.normal(200000, 100000, n_samples),
    median_income=np.abs(np.random.normal(5, 2, n_samples)),  # abs() keeps incomes non-negative
    housing_median_age=np.random.randint(1, 50, n_samples),
    total_rooms=np.random.randint(1000, 8000, n_samples),
)

# Blank out a handful of incomes to mimic gaps in real data
missing_idx = np.random.choice(n_samples, 10)
housing_data['median_income'][missing_idx] = np.nan

housing = pd.DataFrame(housing_data)

missing_per_column = housing.isnull().sum()
print("Original dataset shape:", housing.shape)
print("Missing values per column:")
print(missing_per_column)

# Drop the missing incomes and attach the stratification column
housing_clean = create_robust_income_categories(housing)

income_cat_counts = housing_clean['income_cat'].value_counts().sort_index()
print(f"\nCleaned dataset shape: {housing_clean.shape}")
print("Income category distribution:")
print(income_cat_counts)

# Evaluate a linear model on several stratified train/test partitions
print("\n" + "="*60)
print("STRATIFIED SHUFFLE SPLIT WITH MULTIPLE SPLITS")
print("="*60)

splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
rmse_scores = []

# The predictor columns are the same for every split
feature_columns = ['median_income', 'housing_median_age', 'total_rooms']

split_pairs = splitter.split(housing_clean, housing_clean['income_cat'])
for i, (train_index, test_index) in enumerate(split_pairs):
    train_set, test_set = housing_clean.iloc[train_index], housing_clean.iloc[test_index]

    # Category shares should be nearly equal in both partitions if stratification worked
    train_dist = train_set['income_cat'].value_counts(normalize=True).sort_index().round(3).to_dict()
    test_dist = test_set['income_cat'].value_counts(normalize=True).sort_index().round(3).to_dict()

    print(f"\nSplit {i+1}:")
    print(f"  Training set size: {len(train_set)}")
    print(f"  Test set size: {len(test_set)}")
    print(f"  Training income distribution: {train_dist}")
    print(f"  Test income distribution: {test_dist}")

    # Fit on the training partition, score on the held-out partition
    X_train, y_train = train_set[feature_columns], train_set['median_house_value']
    X_test, y_test = test_set[feature_columns], test_set['median_house_value']

    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Root of the mean squared error, in the same units as the target
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    rmse_scores.append(rmse)

    print(f"  RMSE: {rmse:.2f}")

# Summarize the per-split RMSE values collected in rmse_scores
rmse_mean = np.mean(rmse_scores)
rmse_std = np.std(rmse_scores)  # NumPy default ddof=0, i.e. population standard deviation

print("\n" + "="*60)
print("SUMMARY STATISTICS:")
print("="*60)
print(f"Mean RMSE: {rmse_mean:.2f}")
print(f"Std RMSE:  {rmse_std:.2f}")
print(f"Min RMSE:  {np.min(rmse_scores):.2f}")
print(f"Max RMSE:  {np.max(rmse_scores):.2f}")
# mean ± 1.96*std describes where individual split scores are expected to
# fall (assuming they are roughly normal). It is not a confidence interval
# for the mean RMSE, which would be built from std / sqrt(number of splits),
# so the line is labelled as a range of split scores.
print(f"Approx. 95% range of split RMSEs (mean ± 1.96*std): [{rmse_mean - 1.96*rmse_std:.2f}, {rmse_mean + 1.96*rmse_std:.2f}]")

# Show why n_splits matters
print("\n" + "="*60)
print("WHY n_splits MATTERS:")
print("="*60)
print("Instead of saying 'My model has X error', you can say:")
print(f"'My model has {rmse_mean:.0f} ± {rmse_std:.0f} error'")
print("This gives you:")
print("  1. Confidence in your model's performance")
print("  2. Understanding of performance variability")
print("  3. Better comparison between different models")
print("  4. Detection of overfitting (high variance across splits)")

Original dataset shape: (1000, 4)
Missing values per column:
median_house_value     0
median_income         10
housing_median_age     0
total_rooms            0
dtype: int64
Income range: 0.00 to 11.39
Bin edges: [1.18857025e-03 3.51408600e+00 4.66126623e+00 5.64622930e+00
 6.85758054e+00 1.13862151e+01]
Number of categories: 5

Cleaned dataset shape: (990, 5)
Income category distribution:
income_cat
1    198
2    198
3    198
4    198
5    198
Name: count, dtype: int64

STRATIFIED SHUFFLE SPLIT WITH MULTIPLE SPLITS

Split 1:
  Training set size: 792
  Test set size: 198
  Training income distribution: {1: 0.199, 2: 0.201, 3: 0.199, 4: 0.199, 5: 0.201}
  Test income distribution: {1: 0.202, 2: 0.197, 3: 0.202, 4: 0.202, 5: 0.197}
  RMSE: 105834.34

Split 2:
  Training set size: 792
  Test set size: 198
  Training income distribution: {1: 0.199, 2: 0.201, 3: 0.201, 4: 0.199, 5: 0.199}
  Test income distribution: {1: 0.202, 2: 0.197, 3: 0.197, 4: 0.202, 5: 0.202}
  RMSE: 101115.86

Split