# Exploratory Data Analysis for California Housing Dataset

This notebook performs a comprehensive EDA on the California Housing dataset to understand its patterns and relationships and to prepare the data for linear regression modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import sys
import os

# Add src directory to path.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import custom modules (must come after the sys.path change above)
from data_cleaning import load_data, handle_missing_values, remove_outliers, feature_engineering

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Columns passed to remove_outliers. Deliberately NOT called `numerical_cols`:
# the analysis cells below use that name for the full list of numeric columns,
# and sharing the name would let a re-run of this cell silently shrink that
# list to these four entries.
outlier_cols = ['median_income', 'median_house_value', 'total_rooms', 'total_bedrooms']

# Load and preprocess data. The pipeline always restarts from the CSV, so
# re-running this cell rebuilds `df` from scratch.
df = load_data('../data/housing.csv')
df = handle_missing_values(df)
df = remove_outliers(df, outlier_cols)
df = feature_engineering(df)

print(f"Dataset shape after preprocessing: {df.shape}")
df.head()

## 1. Univariate Analysis

Let's examine the distribution of individual variables.

In [None]:
# Numerical variables analysis
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
target_col = 'median_house_value'

# Per-feature summary statistics (the target is skipped here; it is still
# included in the histograms below).
print("Numerical Variables Summary:")
for col in numerical_cols:
    if col != target_col:
        print(f"\n{col}:")
        print(f"  Mean: {df[col].mean():.2f}")
        print(f"  Median: {df[col].median():.2f}")
        print(f"  Std: {df[col].std():.2f}")
        print(f"  Skewness: {df[col].skew():.2f}")

# Histograms
# The grid is sized from the number of numeric columns so that every column
# gets a panel; a fixed 2x4 grid would silently drop columns past the eighth.
HIST_GRID_COLS = 4
hist_grid_rows = max(1, (len(numerical_cols) + HIST_GRID_COLS - 1) // HIST_GRID_COLS)

fig, axes = plt.subplots(
    hist_grid_rows,
    HIST_GRID_COLS,
    figsize=(20, 5 * hist_grid_rows),
    squeeze=False,  # always a 2-D array, even when there is a single row
)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.tick_params(axis='x', rotation=45)

# Hide panels that have no column assigned to them.
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 2. Bivariate Analysis

Let's examine relationships between features and the target variable.

In [None]:
# Correlation matrix (computed once and reused for the target ranking below)
correlation_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Correlation with target (signed, sorted descending; includes the target's
# own self-correlation, which later cells expect to find in this Series)
correlations = correlation_matrix[target_col].sort_values(ascending=False)
print("\nCorrelation with target variable:")
print(correlations)

# Scatter plots with target
# Pick the features with the strongest relationship to the target in either
# direction: the target is removed by label rather than by position, and the
# ranking uses absolute correlation so strong negative correlations are kept.
N_SCATTER_ROWS, N_SCATTER_COLS = 2, 3
top_features = (
    correlations
    .drop(labels=target_col)
    .dropna()
    .abs()
    .sort_values(ascending=False)
    .index[:N_SCATTER_ROWS * N_SCATTER_COLS]
)

fig, axes = plt.subplots(N_SCATTER_ROWS, N_SCATTER_COLS, figsize=(18, 12))
axes = axes.ravel()

for ax, col in zip(axes, top_features):
    sns.scatterplot(x=df[col], y=df[target_col], ax=ax, alpha=0.6)
    ax.set_title(f'{col} vs {target_col}')

# Hide panels left over when there are fewer features than grid slots.
for ax in axes[len(top_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 3. Multicollinearity Check

Check for multicollinearity using the Variance Inflation Factor (VIF). Features with a VIF above 5 are flagged as highly collinear.

In [None]:
# Calculate VIF
def calculate_vif(df, features):
    """Compute the Variance Inflation Factor (VIF) of each feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in ``features``.
    features : sequence of str
        Column names to include as explanatory variables.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``features`` (same order), with columns
        ``"Feature"`` and ``"VIF"``.

    Notes
    -----
    ``variance_inflation_factor`` uses the matrix exactly as given and does
    not add an intercept. An explicit column of ones is therefore prepended
    so each auxiliary regression is fitted with a constant; without it the
    reported VIFs are based on uncentered regressions and are distorted. The
    constant column itself is not reported.
    """
    features = list(features)

    # Build the design matrix once instead of on every loop iteration.
    predictors = df[features].to_numpy(dtype=float)
    design = np.column_stack([np.ones(len(predictors)), predictors])

    vif_data = pd.DataFrame()
    vif_data["Feature"] = features
    # Column 0 is the intercept, so feature k lives at design column k + 1.
    vif_data["VIF"] = [variance_inflation_factor(design, col_idx)
                       for col_idx in range(1, design.shape[1])]
    return vif_data

# VIF above this value is treated as a sign of high multicollinearity.
VIF_THRESHOLD = 5

# Features for VIF calculation (excluding target)
features_for_vif = [col for col in numerical_cols if col != target_col]
vif_data = calculate_vif(df, features_for_vif)

# Sort once and reuse the sorted view for printing, filtering and plotting.
vif_sorted = vif_data.sort_values('VIF', ascending=False)

print("Variance Inflation Factor (VIF) Analysis:")
print(vif_sorted)

# Highlight high VIF features.
# Taken from the sorted frame so the first row of `high_vif` is the feature
# with the largest VIF.
high_vif = vif_sorted[vif_sorted['VIF'] > VIF_THRESHOLD]
if not high_vif.empty:
    print(f"\nFeatures with high multicollinearity (VIF > {VIF_THRESHOLD}):")
    print(high_vif)
else:
    print("\nNo features with high multicollinearity detected.")

# VIF visualization
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='VIF', y='Feature', data=vif_sorted, ax=ax)
ax.axvline(x=VIF_THRESHOLD, color='red', linestyle='--', label=f'VIF = {VIF_THRESHOLD} threshold')
ax.set_title('Variance Inflation Factor (VIF) by Feature')
ax.legend()
plt.show()

## 4. Feature Importance

Rank the features by the absolute value of their correlation with the target and visualize the strongest ones.

In [None]:
# Feature importance based on correlation
# Rank features by absolute correlation with the target. The target is
# removed by label rather than by assuming it sits at position 0.
N_TOP_PLOTTED = 10
N_TOP_PRINTED = 5

feature_importance = (
    correlations
    .drop(labels=target_col)
    .dropna()
    .abs()
    .sort_values(ascending=False)
    .head(N_TOP_PLOTTED)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importance.values, y=feature_importance.index, ax=ax)
ax.set_title('Top 10 Features by Correlation with Median House Value')
ax.set_xlabel('Absolute Correlation')
plt.show()

# Print the same ranking as the chart, but with the signed correlation so
# the direction of each relationship stays visible.
print("\nTop 5 most correlated features with target:")
for rank, feature in enumerate(feature_importance.index[:N_TOP_PRINTED], start=1):
    print(f"{rank}. {feature}: {correlations[feature]:.3f}")

## 5. Summary and Insights

Key findings from the EDA:

In [None]:
# Final recap of the dataset, the strongest correlations, and the VIF check.
print("=== EDA Summary ===")
print("\nDataset Overview:")
print(f"- Total samples: {len(df)}")
print(f"- Number of features: {len(df.columns) - 1}")  # Excluding target
print(f"- Target variable: {target_col}")

print("\nKey Correlations:")
# Exclude the target's trivial self-correlation and rank the remaining
# features by strength (absolute value); the signed value is what is printed.
feature_corr = correlations.drop(labels=target_col).dropna()
top_corr = feature_corr.reindex(
    feature_corr.abs().sort_values(ascending=False).index
).head()
for feature, corr in top_corr.items():
    print(f"- {feature}: {corr:.3f}")

print("\nMulticollinearity Issues:")
if not high_vif.empty:
    # Look the maximum up explicitly rather than relying on row order.
    worst_vif = high_vif.loc[high_vif['VIF'].idxmax()]
    print(f"- {len(high_vif)} features have VIF > 5")
    print(f"- Highest VIF: {worst_vif['Feature']} ({worst_vif['VIF']:.2f})")
else:
    print("- No significant multicollinearity detected")

print("\nRecommendations for Modeling:")
print("- Focus on features with strong correlation to target")
print("- Consider feature selection to reduce multicollinearity")
print("- Apply feature scaling before linear regression")
print("- Consider polynomial features or interaction terms if needed")