# California Housing Dataset Exploratory Data Analysis

This notebook performs exploratory data analysis on the California Housing dataset. We'll examine:
1. Data loading and basic information
2. Missing values check
3. Feature distributions
4. Correlation analysis
5. Geographic distribution of house prices

In [None]:
# Data manipulation and numerical operations
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.datasets import fetch_california_housing

# Suppress warnings
# Only UserWarning and FutureWarning are silenced; other categories still surface.
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Set visualization style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old 'seaborn' alias, so choose whichever name this
# installation actually provides instead of hard-coding the legacy one.
SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(SEABORN_STYLE)
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = [10, 6]  # Set default figure size

## 1. Load and Examine Data
Let's load the California Housing dataset and examine its basic properties.

In [None]:
# Load the dataset; with as_frame=True the returned object exposes a .frame DataFrame
data = fetch_california_housing(as_frame=True)

# Rename columns for better readability.
# Renaming by name (instead of assigning a positional list to df.columns) means a
# change in upstream column order cannot silently mislabel a column.
COLUMN_RENAMES = {
    'MedInc': 'Median_Income',
    'HouseAge': 'House_Age',
    'AveRooms': 'Average_Rooms',
    'AveBedrms': 'Average_Bedrooms',
    'Population': 'Population',
    'AveOccup': 'Average_Occupancy',
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
    'MedHouseVal': 'Median_House_Value',
}

# errors='raise' fails loudly if an expected source column is missing.
# rename() returns a new frame, so data.frame keeps its original labels and
# re-running this cell always starts from the same input.
df = data.frame.rename(columns=COLUMN_RENAMES, errors='raise')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFeature Information:")
# info() writes its report itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
df.info()
print("\nSummary Statistics:")
print(df.describe())

## 2. Check for Missing Values
Let's verify whether any column in our dataset contains missing values.

In [None]:
# Count the null entries in each column of df and report them under a heading
missing_values = df.isna().sum()
print("Missing Values per Column:", missing_values, sep="\n")

## 3. Feature Distributions
Let's examine the distribution of each column — the eight features plus the target, `Median_House_Value` — using histograms with a KDE overlay.

In [None]:
# Create one histogram (with a KDE overlay) for every column of df.
# The grid is sized from the number of columns rather than a fixed 3x3 layout,
# so the cell keeps working if columns are added or removed.
N_HIST_COLS = 3
n_hist_rows = max(1, (len(df.columns) + N_HIST_COLS - 1) // N_HIST_COLS)  # ceiling division

# Height scales with the row count; three rows give the original 15x10 figure.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(
    n_hist_rows,
    N_HIST_COLS,
    figsize=(15, 10 * n_hist_rows / 3),
    squeeze=False,
)
hist_axes = axes.ravel()

for ax, column in zip(hist_axes, df.columns):
    sns.histplot(data=df, x=column, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.tick_params(axis='x', labelrotation=45)

# Hide grid cells left over when the column count is not a multiple of N_HIST_COLS
for ax in hist_axes[len(df.columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

## 4. Correlation Analysis
Let's analyze the correlations between features using a heatmap.

In [None]:
# Pairwise correlations between all columns of df (pandas' default method)
correlation_matrix = df.corr()

# Render the matrix as an annotated heatmap on an explicitly created axes;
# center=0 anchors the diverging colormap at zero correlation.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title('Correlation Matrix of Features')
ax.tick_params(axis='x', labelrotation=45)
ax.tick_params(axis='y', labelrotation=45)
fig.tight_layout()
plt.show()

# Rank every column by its correlation with the target, highest first.
# The target's own row is part of the selected column, so it appears in the listing.
target_correlations = correlation_matrix['Median_House_Value']
correlations_with_price = target_correlations.sort_values(ascending=False)
print("\nCorrelations with Median House Value:")
print(correlations_with_price)

## 5. Geographic Distribution
Finally, let's visualize the geographic distribution of house prices across California.

In [None]:
# Plot every row of df at its longitude/latitude, coloured by median house value.
# alpha=0.4 keeps overlapping points distinguishable.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    df['Longitude'],
    df['Latitude'],
    c=df['Median_House_Value'],
    cmap='viridis',
    alpha=0.4,
)
fig.colorbar(scatter, ax=ax, label='Median House Value')
ax.set_title('Geographic Distribution of House Prices')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
fig.tight_layout()
plt.show()