# Data Preprocessing for Housing Price Prediction

This notebook demonstrates advanced data preprocessing techniques using Pandas and NumPy for the California Housing dataset.

**Author:** Your Name  
**Date:** April 18, 2025

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
import sys
import os

# Add the parent directory to the path to import from src
sys.path.append('..')
from src.data_processor import DataProcessor
from src.feature_engineering import AdvancedFeatureEngineer
from src.visualization import AdvancedVisualizer

# Plot appearance: seaborn white-grid style with the 'notebook' context,
# plus a larger default figure size and base font size for matplotlib
sns.set_style('whitegrid')
sns.set_context('notebook')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

## 1. Load the Dataset

Let's load the California Housing dataset that we prepared in the previous notebook.

In [None]:
# Create the processor (seeded with 42); the same instance is reused by the
# later cells of this notebook
data_processor = DataProcessor(random_state=42)

# Read the housing CSV through the processor
housing_csv_path = '../data/housing.csv'
housing_df = data_processor.load_data(housing_csv_path)

# Preview the first rows (rendered as the cell output)
housing_df.head()

## 2. Detect Feature Types and Analyze Missing Values

Now we'll use our DataProcessor to identify numerical and categorical features, and check for any missing values.

In [None]:
# Detect feature types: the processor's result is unpacked into two names, one
# for the numerical and one for the categorical features. numerical_features is
# later passed to the boxplot helper as its `features` argument.
# NOTE(review): presumably both are collections of column names — confirm in src.data_processor.
numerical_features, categorical_features = data_processor.detect_feature_types(housing_df)

# Analyze missing values on the raw (not yet cleaned) frame. The bare name on
# the last line makes the notebook render the result as the cell output.
missing_info = data_processor.analyze_missing_values(housing_df)
missing_info

## 3. Handle Outliers

Next, we'll identify and handle outliers in the dataset using our DataProcessor.

In [None]:
# Detect outliers using boxplots
# The visualizer created here is reused by the feature-importance and
# dimensionality-reduction cells further down.
visualizer = AdvancedVisualizer()
# Plot the raw frame (before any outlier handling), limited to the numerical features
visualizer.plot_boxplots(housing_df, features=numerical_features)

In [None]:
# Winsorize outliers (threshold=3.0). The result goes into a new name so the
# raw frame stays available for the before/after comparison below.
housing_df_clean = data_processor.handle_outliers(housing_df, method='winsorize', threshold=3.0)

# Side-by-side boxplots per feature: raw values on the left, cleaned on the right
for feature in ['AveOccup', 'Population', 'AveRooms']:
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

    sns.boxplot(x=housing_df[feature], ax=ax_before)
    ax_before.set_title(f'Before Outlier Handling: {feature}')

    sns.boxplot(x=housing_df_clean[feature], ax=ax_after)
    ax_after.set_title(f'After Outlier Handling: {feature}')

    fig.tight_layout()
    plt.show()

## 4. Feature Engineering

Now we'll apply advanced feature engineering techniques to create new features.

In [None]:
# Demonstrate our custom feature creation using DataProcessor

# Create interaction features from the outlier-handled frame
housing_df_with_interactions = data_processor.create_interaction_features(housing_df_clean)

# Create degree-2 polynomial features for three selected columns
housing_df_with_poly = data_processor.create_polynomial_features(
    housing_df_with_interactions,
    features=['MedInc', 'AveRooms', 'HouseAge'],
    degree=2
)

# Build a geographic grouping key: split latitude and longitude into 10
# quantile bins each, then join the two bin ids into one label per grid cell.
# `assign` returns a new frame, so housing_df_with_poly (and any upstream frame
# it may share data with) is not modified in place and no
# SettingWithCopyWarning can be triggered. Later keyword arguments can read the
# columns created by earlier ones.
housing_df_binned = housing_df_with_poly.assign(
    lat_bin=lambda df: pd.qcut(df['Latitude'], q=10, labels=False),
    lon_bin=lambda df: pd.qcut(df['Longitude'], q=10, labels=False),
    geo_bin=lambda df: df['lat_bin'].astype(str) + '_' + df['lon_bin'].astype(str),
)

# Create group statistics (mean and median) based on the geographic bins
housing_df_engineered = data_processor.create_group_statistics(
    housing_df_binned,
    group_col='geo_bin',
    agg_cols=['MedInc', 'HouseAge', 'AveRooms'],
    statistics=['mean', 'median']
)

# Drop the temporary per-axis bin columns.
# NOTE(review): 'geo_bin' (a string label) is still left in the frame, as in the
# original cell — confirm whether it is meant to stay as a categorical feature.
housing_df_engineered = housing_df_engineered.drop(columns=['lat_bin', 'lon_bin'])

# Display the shape of the engineered dataset
print(f"Original dataset shape: {housing_df.shape}")
print(f"Engineered dataset shape: {housing_df_engineered.shape}")

# Display some of the new features (columns absent from the raw frame)
new_features = [col for col in housing_df_engineered.columns if col not in housing_df.columns]
print(f"Number of new features created: {len(new_features)}")
print("Sample of new features:")
print(new_features[:10])

In [None]:
# Now use our AdvancedFeatureEngineer class for more feature engineering
engineer_options = dict(
    polynomial_degree=2,
    interaction_features=True,
    binning_features=True,
    cyclical_features=True,
    outlier_features=True,
    transformation_features=True,
)
feature_engineer = AdvancedFeatureEngineer(**engineer_options)

# Separate the target column from the predictors
y = housing_df_clean['median_house_value']
X = housing_df_clean.drop(columns=['median_house_value'])

# Fit on the predictors, then transform the same rows
feature_engineer.fit(X)
X_transformed = feature_engineer.transform(X)

# Report how much wider the transformed frame is
print(f"Original X shape: {X.shape}")
print(f"Transformed X shape: {X_transformed.shape}")

# List the columns that exist only after the transformation
original_columns = set(X.columns)
new_features = [col for col in X_transformed.columns if col not in original_columns]
print(f"Number of new features created: {len(new_features)}")
print("Sample of new features:")
print(new_features[:10])

## 5. Feature Importance Ranking

With so many engineered features, we need to select the most important ones to avoid overfitting.

In [None]:
# Calculate feature importance using f_regression
# The engineered feature frame is scored against the target y.
# NOTE(review): X_transformed and y cover the whole dataset (the train/val/test
# split happens later in the notebook), so this ranking also sees rows that end
# up in the validation and test sets — confirm that is acceptable.
feature_importance = data_processor.efficient_feature_importance(
    X_transformed, 
    y,
    method='f_regression'
)

# Display top 20 features
# (assumes the returned frame is already sorted by importance — TODO confirm
# in efficient_feature_importance)
feature_importance.head(20)

In [None]:
# Visualize feature importance (F-Value scores)
f_value_names = feature_importance['Feature'].values
f_value_scores = feature_importance['F-Value'].values
visualizer.plot_feature_importance(
    f_value_names,
    f_value_scores,
    top_n=20,
    title='Top 20 Features by F-Value',
)

In [None]:
# Calculate feature importance using mutual information
# Same inputs as the f_regression ranking; only the scoring method differs.
# The result is kept under a separate name so both rankings stay available.
feature_importance_mi = data_processor.efficient_feature_importance(
    X_transformed, 
    y,
    method='mutual_info'
)

# Display top 20 features
# (assumes the returned frame is already sorted by importance — TODO confirm
# in efficient_feature_importance)
feature_importance_mi.head(20)

In [None]:
# Visualize mutual information feature importance
mi_names = feature_importance_mi['Feature'].values
mi_scores = feature_importance_mi['Mutual Information'].values
visualizer.plot_feature_importance(
    mi_names,
    mi_scores,
    top_n=20,
    title='Top 20 Features by Mutual Information',
)

## 6. Feature Selection and Dimensionality Reduction

Let's select important features based on our analysis and apply dimensionality reduction.

In [None]:
# Take the first 30 feature names from the F-Value ranking
ranked_feature_names = feature_importance['Feature']
top_features = ranked_feature_names.iloc[:30].tolist()

# Keep only those columns of the transformed frame
X_selected = X_transformed.loc[:, top_features]

print(f"Original shape: {X_transformed.shape}")
print(f"Selected features shape: {X_selected.shape}")

In [None]:
# Apply dimensionality reduction with PCA
# The helper takes one frame holding both the features and the target column
pca_target_frame = pd.DataFrame({'median_house_value': y})
pca_input = pd.concat([X_selected, pca_target_frame], axis=1)
visualizer.plot_dimensionality_reduction(
    pca_input,
    target='median_house_value',
    method='pca',
    n_components=3,
    figsize=(14, 10),
)

In [None]:
# Apply dimensionality reduction with t-SNE for visualization
# The helper takes one frame holding both the features and the target column
tsne_target_frame = pd.DataFrame({'median_house_value': y})
tsne_input = pd.concat([X_selected, tsne_target_frame], axis=1)
visualizer.plot_dimensionality_reduction(
    tsne_input,
    target='median_house_value',
    method='tsne',
    n_components=2,
    figsize=(14, 10),
)

## 7. Preprocessing Pipeline Creation

Now we'll create a preprocessing pipeline for the data.

In [None]:
# Create the preprocessor pipeline from a single options mapping
preprocessor_options = {
    'impute_strategy': 'median',
    'categorical_strategy': 'onehot',
    'handle_outliers': True,
    'knn_impute': False,
}
preprocessor = data_processor.create_preprocessor(**preprocessor_options)

# Display the preprocessing pipeline
print(preprocessor)

## 8. Train-Validation-Test Split

Finally, let's split the data into training, validation, and test sets.

In [None]:
# Split the data
# Six objects come back: feature sets and targets for train, validation and test.
# NOTE(review): the frame being split is housing_df_clean, i.e. the columns as
# they were before feature engineering, while top_features was chosen from the
# columns of X_transformed — the splits may not contain those columns.
# NOTE(review): whether val_size=0.25 is a fraction of the full data or of what
# remains after the test split is decided inside create_train_val_test_split —
# confirm there.
X_train, X_val, X_test, y_train, y_val, y_test = data_processor.create_train_val_test_split(
    housing_df_clean, 
    target_col='median_house_value',
    test_size=0.2,
    val_size=0.25
)

In [None]:
# Prepare data for model training
TARGET_COLUMN = 'median_house_value'
PROCESSED_DIR = '../data/processed'


def select_top_features(X_split):
    """Return the `top_features` columns of one split.

    `top_features` was picked from the columns of `X_transformed`, the output
    of `feature_engineer.transform`. The splits, however, were cut from
    `housing_df_clean`, which holds the columns as they were before feature
    engineering. When a split lacks any selected column it is first passed
    through the already fitted `feature_engineer`, so the column selection
    does not fail with a KeyError. A split that already has every selected
    column is used as it is.
    """
    if any(col not in X_split.columns for col in top_features):
        X_split = feature_engineer.transform(X_split)
    return X_split[top_features]


def attach_target(X_part, y_part):
    """Return a new frame: `X_part` plus the target as the last column.

    The target is attached by position rather than by index label. Features
    and target of one split are row-aligned, and positional attachment keeps
    them aligned even if the two objects do not carry identical indexes.
    NOTE(review): this relies on `feature_engineer.transform` keeping the row
    order of its input — confirm in src.feature_engineering.
    """
    return X_part.assign(**{TARGET_COLUMN: np.asarray(y_part)})


# Select top features for all sets
X_train_selected = select_top_features(X_train)
X_val_selected = select_top_features(X_val)
X_test_selected = select_top_features(X_test)

# Combine features and target per split
train_data = attach_target(X_train_selected, y_train)
val_data = attach_target(X_val_selected, y_val)
test_data = attach_target(X_test_selected, y_test)

# Save preprocessed datasets for the next notebook
os.makedirs(PROCESSED_DIR, exist_ok=True)
split_frames = {'train': train_data, 'val': val_data, 'test': test_data}
for split_name, split_frame in split_frames.items():
    # Writes train_data.csv, val_data.csv and test_data.csv without the index column
    split_frame.to_csv(f'{PROCESSED_DIR}/{split_name}_data.csv', index=False)

# Save the list of selected features, one name per line
with open(f'{PROCESSED_DIR}/selected_features.txt', 'w') as f:
    for feature in top_features:
        f.write(f"{feature}\n")

print("Preprocessed datasets saved successfully!")

## 9. Summary

In this notebook, we've demonstrated advanced data preprocessing techniques using Pandas and NumPy:

1. Loaded and analyzed the dataset
2. Detected outliers with boxplots and handled them by winsorization (threshold of 3)
3. Created engineered features using various techniques:
   - Interaction features
   - Polynomial features
   - Group statistics
   - Transformations (log, sqrt, etc.)
   - Categorical encoding
4. Ranked features by F-value and mutual information, selected the top 30, and visualized them with PCA and t-SNE
5. Created preprocessing pipeline
6. Split the data into training, validation, and test sets

The preprocessed data is now ready for model training in the next notebook.