In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os
import sys

# Add src to path for imports
sys.path.append('../src')
from data_fetcher import SatelliteImageFetcher

# Set plotting style: matplotlib style sheet, seaborn 'husl' colour palette,
# and inline rendering of figures inside the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
%matplotlib inline

# Display settings: show up to 50 columns and print floats with two decimals.
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import warnings
# Suppress every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')
# Apply the 'ggplot' style sheet; this runs after the style chosen in the first cell.
plt.style.use('ggplot')

In [None]:
# Configuration: column names and modelling knobs read by the cells below
CONFIG = dict(
    target_col='price',
    lat_col='lat',
    lon_col='long',
    image_size=224,
    seed=42,
    test_size=0.2,
    use_log_target=True,
)

# Paths - UPDATE THESE
DATA_DIR = '../data'
IMAGE_DIR = DATA_DIR + '/mapbox_images'
OUTPUT_DIR = '../outputs'
TRAIN_PATH, TEST_PATH = (
    DATA_DIR + '/' + workbook for workbook in ('train.xlsx', 'test.xlsx')
)

# Seed NumPy's global random generator with the configured seed
np.random.seed(CONFIG['seed'])

## 1. Load Data

In [None]:
# Read the train and test workbooks (train first, then test)
train_df, test_df = (
    pd.read_excel(workbook_path, engine='openpyxl')
    for workbook_path in (TRAIN_PATH, TEST_PATH)
)

# Clean column names: strip surrounding whitespace from every header
train_df.columns = [header.strip() for header in train_df.columns]
test_df.columns = [header.strip() for header in test_df.columns]

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# Print a heading followed by the training column names as a plain list
print("\nTrain columns:", train_df.columns.tolist(), sep="\n")

In [None]:
# Preview the first rows of the training frame (rendered as the cell output)
train_df.head()

In [None]:
# Print pandas' per-column summary (dtypes and non-null counts) for the training frame
train_df.info()

## 2. Data Exploration

In [None]:
# Descriptive statistics of the training frame (rendered as the cell output)
train_df.describe()

In [None]:
# Target distribution: raw values on the left, log1p-transformed values on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel - target on its original scale
axes[0].hist(train_df[CONFIG['target_col']], bins=50, edgecolor='black', alpha=0.7)
axes[0].set(xlabel='Price', ylabel='Frequency', title='Price Distribution')

# Right panel - log1p of the target
axes[1].hist(
    np.log1p(train_df[CONFIG['target_col']]),
    bins=50,
    edgecolor='black',
    alpha=0.7,
    color='coral',
)
axes[1].set(xlabel='Log Price', ylabel='Frequency', title='Log Price Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Missing values: null count and percentage per column, largest percentage first
missing = train_df.isna().sum()
missing_pct = (missing / len(train_df)) * 100
missing_df = pd.DataFrame({'Missing': missing, 'Percent': missing_pct})
# Only columns that actually contain nulls are shown
missing_df.loc[missing_df['Missing'] > 0].sort_values(by='Percent', ascending=False)

In [None]:
# Feature correlation with target, ordered by absolute strength
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
if CONFIG['target_col'] in numeric_cols:
    correlations = (
        train_df[numeric_cols]
        .corr()[CONFIG['target_col']]
        .drop(CONFIG['target_col'])  # the target's correlation with itself is not informative
        .sort_values(key=abs, ascending=False)
    )
    print("\nCorrelations with target:")
    print(correlations.head(10))

In [None]:
# Correlation heatmap over all numeric columns
corr_matrix = train_df[numeric_cols].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.gca().set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 3. Geographic Visualization

In [None]:
# Scatter the listings by coordinate, coloured by log1p of the target.
# Skipped entirely unless both coordinate columns are present.
if {CONFIG['lat_col'], CONFIG['lon_col']}.issubset(train_df.columns):
    plt.figure(figsize=(12, 10))
    scatter = plt.scatter(
        train_df[CONFIG['lon_col']],
        train_df[CONFIG['lat_col']],
        c=np.log1p(train_df[CONFIG['target_col']]),
        cmap='viridis',
        alpha=0.6,
        s=10,
    )
    plt.colorbar(scatter, label='Log Price')
    plt.gca().set(
        xlabel='Longitude',
        ylabel='Latitude',
        title='Property Locations Colored by Price',
    )
    plt.tight_layout()
    plt.show()

## 4. Image Data Analysis

In [None]:
# Count available images and compare against the number of training rows
if not os.path.exists(IMAGE_DIR):
    print(f"Image directory not found: {IMAGE_DIR}")
else:
    # Only files ending in '.png' are counted
    image_files = list(filter(lambda fname: fname.endswith('.png'), os.listdir(IMAGE_DIR)))
    print(f"Total images: {len(image_files)}")
    print(f"Training samples: {len(train_df)}")
    print(f"Coverage: {len(image_files)/len(train_df)*100:.1f}%")

In [None]:
# Display sample images: up to eight randomly chosen files from IMAGE_DIR in a 2x4 grid
if os.path.exists(IMAGE_DIR) and len(image_files) > 0:
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    sample_images = np.random.choice(image_files, min(8, len(image_files)), replace=False)

    for ax, img_file in zip(axes.flatten(), sample_images):
        img_path = os.path.join(IMAGE_DIR, img_file)
        # The context manager releases the file handle once the image has been drawn
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(img_file[:15])  # titles are truncated to 15 characters
        ax.axis('off')

    # With fewer than eight images some panels receive nothing; blank them as well
    for ax in axes.flatten()[len(sample_images):]:
        ax.axis('off')

    plt.suptitle('Sample Satellite Images', fontsize=14)
    plt.tight_layout()
    plt.show()

## 5. Feature Engineering

In [None]:
# Identify feature columns: int64/float64 columns that are not the target,
# the coordinates, 'date' or 'id'
exclude_cols = {'date', 'id'} | {CONFIG[key] for key in ('target_col', 'lat_col', 'lon_col')}
feature_cols = [
    col
    for col in train_df.columns
    if col not in exclude_cols and train_df[col].dtype in ['int64', 'float64']
]

print(f"Feature columns ({len(feature_cols)}):", feature_cols, sep="\n")

In [None]:
# Feature statistics: descriptive summary restricted to the selected feature columns
train_df[feature_cols].describe()

## 6. Train/Validation Split

In [None]:
# Hold out a validation subset; size and seed come from CONFIG
train_data, val_data = train_test_split(
    train_df,
    test_size=CONFIG['test_size'],
    random_state=CONFIG['seed'],
)

# Row index labels of each subset are kept as the image identifiers
# NOTE(review): confirm that image files are keyed by row index rather than by an id column
train_image_ids, val_image_ids = train_data.index.tolist(), val_data.index.tolist()

print(f"Training samples: {len(train_data)}", f"Validation samples: {len(val_data)}", sep="\n")

## 7. Preprocessing Pipeline

In [None]:
# Median imputation followed by standard scaling
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Fit on the training subset only, then reuse the fitted pipeline for validation and test
X_train = preprocessor.fit_transform(train_data[feature_cols])
X_val, X_test = (preprocessor.transform(frame[feature_cols]) for frame in (val_data, test_df))

print(
    *(
        f"{label} shape: {matrix.shape}"
        for label, matrix in (('X_train', X_train), ('X_val', X_val), ('X_test', X_test))
    ),
    sep="\n",
)

In [None]:
# Prepare target variable: keep the original-scale values and, when configured,
# model log1p of the target instead
y_train_original = train_data[CONFIG['target_col']].values
y_val_original = val_data[CONFIG['target_col']].values

if not CONFIG['use_log_target']:
    y_train, y_val = y_train_original, y_val_original
    print("Using original target")
else:
    y_train, y_val = np.log1p(y_train_original), np.log1p(y_val_original)
    print("Using log-transformed target")

print(f"y_train shape: {y_train.shape}", f"y_val shape: {y_val.shape}", sep="\n")

## 8. Save Preprocessed Data

In [None]:
import joblib

# Make sure the output folder exists before anything is written into it
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Persist the fitted preprocessor and the feature column list, reporting each file
for artifact, filename, description in (
    (preprocessor, 'preprocessor.joblib', 'preprocessor'),
    (feature_cols, 'feature_cols.joblib', 'feature columns'),
):
    joblib.dump(artifact, f'{OUTPUT_DIR}/{filename}')
    print(f"Saved {description} to {OUTPUT_DIR}/{filename}")

In [None]:
# Save processed arrays: one .npy file per array, named after the variable
arrays_to_save = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_train_original': y_train_original,
    'y_val_original': y_val_original,
    'train_image_ids': np.array(train_image_ids),
    'val_image_ids': np.array(val_image_ids),
}
for stem, values in arrays_to_save.items():
    np.save(f'{OUTPUT_DIR}/{stem}.npy', values)

print(f"\nSaved all preprocessed data to {OUTPUT_DIR}/")

In [None]:
# Recap of what the preprocessing cells produced
print("\n" + "="*50)
print("PREPROCESSING SUMMARY")
print("="*50)
print(f"Features: {len(feature_cols)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_df)}")
print(f"Log target: {CONFIG['use_log_target']}")
# The check mark was stored as mis-decoded bytes; an escape keeps the source ASCII-safe
print("\n\u2705 Preprocessing complete!")

## 1. Data Loading & Initial Exploration

In [None]:
# Load datasets (train first, then test)
train_df, test_df = (
    pd.read_excel(workbook_path, engine='openpyxl')
    for workbook_path in ('../data/train.xlsx', '../data/test.xlsx')
)

# Clean column names: strip surrounding whitespace from every header
train_df.columns = [header.strip() for header in train_df.columns]
test_df.columns = [header.strip() for header in test_df.columns]

print(
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    f"\nTraining columns: {list(train_df.columns)}",
    sep="\n",
)

In [None]:
# Basic info: banner, then pandas' per-column summary of the training frame
print("=" * 50, "TRAINING DATA INFO", "=" * 50, sep="\n")
train_df.info()

In [None]:
# First few rows of the training frame (rendered as the cell output)
train_df.head()

In [None]:
# Statistical summary of the training frame (rendered as the cell output)
train_df.describe()

## 2. Missing Values Analysis

In [None]:
# Check missing values: null counts and percentages for train and test side by side
missing_train, missing_test = train_df.isna().sum(), test_df.isna().sum()

missing_df = pd.DataFrame(
    {
        'Train Missing': missing_train,
        'Train %': (missing_train / len(train_df) * 100).round(2),
        'Test Missing': missing_test,
        'Test %': (missing_test / len(test_df) * 100).round(2),
    }
)

# Show only columns with nulls in the training data, most affected first
missing_df.loc[missing_df['Train Missing'] > 0].sort_values(by='Train Missing', ascending=False)

In [None]:
# Visualize missing values: horizontal bars of the per-column missing percentage
missing_pct = (train_df.isnull().sum() / len(train_df) * 100).sort_values(ascending=True)
columns_with_gaps = missing_pct[missing_pct > 0]

if columns_with_gaps.empty:
    # Nothing to draw when no column has gaps; skip the chart instead of
    # handing an empty Series to the bar plotter
    print("No missing values in training data - nothing to plot.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    columns_with_gaps.plot(kind='barh', ax=ax, color='coral')
    ax.set_xlabel('Missing Percentage (%)')
    ax.set_title('Missing Values in Training Data')
    plt.tight_layout()
    plt.show()

## 3. Target Variable Analysis (Price)

In [None]:
# Price distribution: histogram, log1p histogram and box plot, saved to ../outputs
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Computed once; used for both the marker line and its legend entry
median_price = train_df['price'].median()

# Histogram
axes[0].hist(train_df['price'], bins=50, color='steelblue', edgecolor='white')
axes[0].set_xlabel('Price ($)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Price Distribution')
axes[0].axvline(median_price, color='red', linestyle='--', label=f'Median: ${median_price:,.0f}')
axes[0].legend()

# Log-transformed histogram
axes[1].hist(np.log1p(train_df['price']), bins=50, color='seagreen', edgecolor='white')
axes[1].set_xlabel('Log(Price)')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Log-Transformed Price Distribution')

# Box plot
axes[2].boxplot(train_df['price'], vert=True)
axes[2].set_ylabel('Price ($)')
axes[2].set_title('Price Box Plot')

plt.tight_layout()
plt.savefig('../outputs/price_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Price statistics (the header emoji was stored as mis-decoded bytes; restored via escape)
print("\n\U0001F4CA Price Statistics:")
print(f"   Mean:   ${train_df['price'].mean():,.2f}")
print(f"   Median: ${median_price:,.2f}")
print(f"   Std:    ${train_df['price'].std():,.2f}")
print(f"   Min:    ${train_df['price'].min():,.2f}")
print(f"   Max:    ${train_df['price'].max():,.2f}")

## 4. Feature Correlations

In [None]:
# Select numeric columns for correlation
numeric_cols = list(train_df.select_dtypes(include=[np.number]).columns)
print("Numeric columns: " + str(numeric_cols))

In [None]:
# Correlation with price: signed bar chart plus the strongest positive/negative features
if 'price' in train_df.columns:
    # Sorted from most positive to most negative; price's self-correlation is dropped
    correlations = train_df[numeric_cols].corr()['price'].drop('price').sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    colors = ['green' if x > 0 else 'red' for x in correlations.values]
    correlations.plot(kind='barh', ax=ax, color=colors)
    ax.set_xlabel('Correlation with Price')
    ax.set_title('Feature Correlations with Property Price')
    ax.axvline(x=0, color='black', linewidth=0.5)
    plt.tight_layout()
    plt.savefig('../outputs/price_correlations.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Header emoji were stored as mis-decoded bytes; restored via escapes
    print("\n\U0001F51D Top 5 Positive Correlations:")
    print(correlations.head())
    print("\n\U0001F53B Top 5 Negative Correlations:")
    print(correlations.tail())

In [None]:
# Correlation heatmap showing the lower triangle only
corr_matrix = train_df[numeric_cols].corr()
# True marks cells to hide: the upper triangle including the diagonal
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    ax=ax,
    square=True,
    linewidths=0.5,
    annot_kws={'size': 8},
)
ax.set(title='Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig('../outputs/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Geospatial Analysis

In [None]:
# Check for lat/long columns
lat_col, lon_col = 'lat', 'long'

if not (lat_col in train_df.columns and lon_col in train_df.columns):
    print("Latitude/Longitude columns not found!")
else:
    print(f"Latitude range: {train_df[lat_col].min():.4f} to {train_df[lat_col].max():.4f}")
    print(f"Longitude range: {train_df[lon_col].min():.4f} to {train_df[lon_col].max():.4f}")

    # Two panels: price-coloured scatter (left) and listing density (right)
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left: one point per listing, coloured by log1p(price)
    scatter = axes[0].scatter(
        train_df[lon_col],
        train_df[lat_col],
        c=np.log1p(train_df['price']),
        cmap='viridis',
        alpha=0.5,
        s=10,
    )
    axes[0].set(
        xlabel='Longitude',
        ylabel='Latitude',
        title='Property Locations (colored by log price)',
    )
    plt.colorbar(scatter, ax=axes[0], label='Log(Price)')

    # Right: 50x50 two-dimensional histogram of listing counts
    h = axes[1].hist2d(train_df[lon_col], train_df[lat_col], bins=50, cmap='YlOrRd')
    axes[1].set(
        xlabel='Longitude',
        ylabel='Latitude',
        title='Property Density Heatmap',
    )
    plt.colorbar(h[3], ax=axes[1], label='Count')

    plt.tight_layout()
    plt.savefig('../outputs/geospatial_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Price by geographic region (using grid)
if lat_col in train_df.columns and lon_col in train_df.columns:
    # Bin both coordinates into 10 equal-width intervals. The bins live in
    # standalone Series so train_df is never modified (the earlier version
    # added and then dropped helper columns, leaving them behind if the cell
    # failed midway).
    lat_bin = pd.cut(train_df[lat_col], bins=10, labels=False).rename('lat_bin')
    lon_bin = pd.cut(train_df[lon_col], bins=10, labels=False).rename('lon_bin')

    # Average price by grid cell: rows are latitude bins, columns longitude bins
    price_grid = train_df['price'].groupby([lat_bin, lon_bin]).mean().unstack()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(price_grid, cmap='YlGnBu', ax=ax, annot=True, fmt='.0f')
    # Heatmaps draw row 0 at the top; flip so higher latitude bins appear at the
    # top, matching the orientation of the scatter map
    ax.invert_yaxis()
    ax.set_xlabel('Longitude Bin')
    ax.set_ylabel('Latitude Bin')
    ax.set_title('Average Property Price by Geographic Grid')
    plt.tight_layout()
    plt.savefig('../outputs/price_by_location.png', dpi=150, bbox_inches='tight')
    plt.show()

## 6. Satellite Image Exploration

In [None]:
# Image directory path (pre-downloaded images).
# The MAPBOX_IMAGE_DIR environment variable overrides the location so the
# notebook can run on another machine; the previous hard-coded path remains
# the default. This assignment replaces any IMAGE_DIR set earlier in the notebook.
IMAGE_DIR = os.environ.get('MAPBOX_IMAGE_DIR', '/Users/ruthwik/Downloads/mapbox_images')

# Check for downloaded images
if os.path.exists(IMAGE_DIR):
    cached_images = [f for f in os.listdir(IMAGE_DIR) if f.endswith('.png')]
    print(f"Found {len(cached_images)} satellite images in {IMAGE_DIR}")
    print(f"Sample images: {sorted(cached_images)[:5]}")
else:
    # Keep the name defined so later cells can rely on it
    cached_images = []
    print(f"Image directory not found: {IMAGE_DIR}")

# Auto-detect image naming convention.
# This span previously held two code fragments interleaved line by line (the
# detection script and the get_image_path helper), which was not valid Python.
# Both are restored here as separate pieces.
if 'id' in train_df.columns and len(train_df) > 0:
    # Probe for a file named after the first row's id
    test_id = train_df.iloc[0]['id']
    test_by_id = os.path.exists(os.path.join(IMAGE_DIR, f'img_{int(test_id)}.png'))
else:
    test_by_id = False
# Probe for a file named after row position 0
test_by_idx = os.path.exists(os.path.join(IMAGE_DIR, 'img_0.png'))

# ID-based lookup is used whenever the id-named probe file exists
USE_ID_FOR_IMAGES = test_by_id


def get_image_path(row_or_idx):
    """Get image path - use ID column if available, otherwise row index.

    Args:
        row_or_idx: either a DataFrame row (``pd.Series``) or a scalar key.

    Returns:
        Path of the form ``<IMAGE_DIR>/img_<key>.png``. With ID-based lookup
        enabled, the key is the row's ``id`` (for a Series containing one) or
        the integer value of a numeric scalar; otherwise ``row_or_idx`` is
        formatted into the file name as given.
    """
    if USE_ID_FOR_IMAGES and isinstance(row_or_idx, pd.Series) and 'id' in row_or_idx.index:
        return os.path.join(IMAGE_DIR, f'img_{int(row_or_idx["id"])}.png')
    # NumPy scalars are accepted too: np.integer is not a subclass of int
    elif USE_ID_FOR_IMAGES and isinstance(row_or_idx, (int, float, np.integer, np.floating)):
        return os.path.join(IMAGE_DIR, f'img_{int(row_or_idx)}.png')
    return os.path.join(IMAGE_DIR, f'img_{row_or_idx}.png')


print(f"\n\u2705 Images by ID: {test_by_id}, by index: {test_by_idx}")
print(f"\u27a1\ufe0f Using {'ID column' if USE_ID_FOR_IMAGES else 'row index'} for image lookup")

In [None]:
# Display sample satellite images with their prices
def show_sample_images(df, n_samples=9, figsize=(15, 15)):
    """Display satellite images for random samples with price labels.

    Rows are drawn without replacement using NumPy's global random generator
    and laid out three per row. The figure is saved to
    ``../outputs/sample_satellite_images.png`` and then shown.

    Args:
        df: frame to sample from. The coordinate columns named by the globals
            ``lat_col``/``lon_col``, plus ``price`` and ``id``, are read when present.
        n_samples: maximum number of rows to show.
        figsize: matplotlib figure size.

    Returns:
        None.
    """
    if len(df) == 0:
        # A grid with zero rows cannot be created; report instead of failing
        print("No rows available - nothing to display.")
        return

    # Positional indices of the sampled rows
    sample_indices = np.random.choice(len(df), size=min(n_samples, len(df)), replace=False)

    n_cols = 3
    n_rows = (len(sample_indices) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array, so flatten() works for any grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    use_id = bool(USE_ID_FOR_IMAGES) and 'id' in df.columns

    for ax, data_idx in zip(axes, sample_indices):
        row = df.iloc[data_idx]
        lat, lon = row.get(lat_col, 0), row.get(lon_col, 0)
        price = row.get('price', 'N/A')

        # Get image by ID (not row index!) when ID-based naming was detected
        if use_id:
            img_path = os.path.join(IMAGE_DIR, f'img_{int(row["id"])}.png')
        else:
            img_path = os.path.join(IMAGE_DIR, f'img_{data_idx}.png')

        if os.path.exists(img_path):
            # The context manager releases the file handle once the image is drawn
            with Image.open(img_path) as img:
                ax.imshow(img)
        else:
            # Show placeholder
            ax.text(0.5, 0.5, f'Image not found\n{os.path.basename(img_path)}',
                    ha='center', va='center', fontsize=10)
            ax.set_facecolor('lightgray')

        # NumPy integers are not instances of int, so they are listed explicitly;
        # missing prices fall back to the coordinates-only title
        has_price = isinstance(price, (int, float, np.integer, np.floating)) and not pd.isna(price)
        if has_price:
            ax.set_title(f'Price: ${price:,.0f}\n({lat:.4f}, {lon:.4f})', fontsize=10)
        else:
            ax.set_title(f'({lat:.4f}, {lon:.4f})', fontsize=10)
        ax.axis('off')

    # Hide empty subplots
    for ax in axes[len(sample_indices):]:
        ax.axis('off')

    plt.suptitle('Sample Satellite Images with Property Prices', fontsize=14, y=1.02)
    plt.tight_layout()
    plt.savefig('../outputs/sample_satellite_images.png', dpi=150, bbox_inches='tight')
    plt.show()


# Seed before sampling so the same rows are shown on every run
np.random.seed(42)
show_sample_images(train_df, n_samples=9)

In [None]:
# Compare high vs low price properties - FIXED for ID-based image lookup
def _draw_price_row(axes_row, rows, title_color, use_id):
    """Draw one image per row of ``rows`` on ``axes_row``, titled with its price."""
    # Blank every panel first so panels without a matching row stay empty
    for ax in axes_row:
        ax.axis('off')

    for ax, (index_label, row) in zip(axes_row, rows.iterrows()):
        if use_id:
            # Get image by ID (not row index!)
            img_path = os.path.join(IMAGE_DIR, f'img_{int(row["id"])}.png')
        else:
            # Use the row's own index label; the plot position would fetch
            # img_0..img_{n-1} regardless of which rows were selected
            img_path = os.path.join(IMAGE_DIR, f'img_{index_label}.png')

        if os.path.exists(img_path):
            # The context manager releases the file handle once the image is drawn
            with Image.open(img_path) as img:
                ax.imshow(img)
        else:
            ax.text(0.5, 0.5, 'N/A', ha='center', va='center')
            ax.set_facecolor('lightgray')

        ax.set_title(f'${row["price"]:,.0f}', fontsize=11, color=title_color)
        ax.axis('off')


def compare_price_extremes(df, n_each=4):
    """Compare satellite images of highest vs lowest priced properties.

    Rows missing a coordinate or a price are ignored. The top row shows the
    ``n_each`` most expensive listings and the bottom row the ``n_each``
    cheapest. The figure is saved to
    ``../outputs/high_vs_low_price_images.png`` and then shown.

    Args:
        df: frame with ``price`` and the coordinate columns named by the
            globals ``lat_col``/``lon_col``; ``id`` is read when ID-based
            image lookup is enabled.
        n_each: number of listings per row.

    Returns:
        None.
    """
    valid_df = df.dropna(subset=[lat_col, lon_col, 'price'])

    # Get rows of highest and lowest priced properties
    high_rows = valid_df.nlargest(n_each, 'price')
    low_rows = valid_df.nsmallest(n_each, 'price')

    use_id = bool(USE_ID_FOR_IMAGES) and 'id' in df.columns

    # squeeze=False keeps axes 2-D even when n_each == 1
    fig, axes = plt.subplots(2, n_each, figsize=(4*n_each, 8), squeeze=False)

    _draw_price_row(axes[0], high_rows, 'green', use_id)
    _draw_price_row(axes[1], low_rows, 'red', use_id)

    # Row labels are drawn as text because axis labels are hidden by axis('off')
    for ax, label in ((axes[0, 0], 'HIGH\nPRICE'), (axes[1, 0], 'LOW\nPRICE')):
        ax.text(-0.05, 0.5, label, transform=ax.transAxes,
                fontsize=12, ha='right', va='center')

    plt.suptitle('Satellite Images: Highest vs Lowest Priced Properties', fontsize=14)
    plt.tight_layout()
    plt.savefig('../outputs/high_vs_low_price_images.png', dpi=150, bbox_inches='tight')
    plt.show()

compare_price_extremes(train_df)

## 7. Feature Engineering

In [None]:
def engineer_features(df, reference_year=2026):
    """Create engineered features from existing columns.

    Each feature is added only when the columns it needs are present, so the
    function works on frames with a partial set of columns. The input frame
    is not modified.

    Args:
        df: input frame.
        reference_year: year against which ``age`` and
            ``years_since_renovation`` are measured. Defaults to 2026, the
            value that was previously hard-coded.

    Returns:
        A copy of ``df`` with any of these columns added: ``living_lot_ratio``,
        ``sqft_per_bedroom``, ``bath_bed_ratio``, ``age``, ``is_renovated``,
        ``years_since_renovation``, ``quality_score``, ``has_basement``,
        ``basement_ratio``.
    """
    df = df.copy()

    # Ratios add 1 to the denominator, presumably to avoid division by zero
    if 'sqft_living' in df.columns and 'sqft_lot' in df.columns:
        df['living_lot_ratio'] = df['sqft_living'] / (df['sqft_lot'] + 1)

    if 'sqft_living' in df.columns and 'bedrooms' in df.columns:
        df['sqft_per_bedroom'] = df['sqft_living'] / (df['bedrooms'] + 1)

    if 'bathrooms' in df.columns and 'bedrooms' in df.columns:
        df['bath_bed_ratio'] = df['bathrooms'] / (df['bedrooms'] + 1)

    if 'yr_built' in df.columns:
        df['age'] = reference_year - df['yr_built']

    if 'yr_renovated' in df.columns and 'yr_built' in df.columns:
        # yr_renovated > 0 is treated as "has been renovated"
        df['is_renovated'] = (df['yr_renovated'] > 0).astype(int)
        # Rows without a renovation fall back to the years since construction
        df['years_since_renovation'] = np.where(
            df['yr_renovated'] > 0,
            reference_year - df['yr_renovated'],
            reference_year - df['yr_built']
        )

    if 'grade' in df.columns and 'condition' in df.columns:
        df['quality_score'] = df['grade'] * df['condition']

    if 'sqft_above' in df.columns and 'sqft_basement' in df.columns:
        df['has_basement'] = (df['sqft_basement'] > 0).astype(int)
        df['basement_ratio'] = df['sqft_basement'] / (df['sqft_above'] + 1)

    return df

# Apply feature engineering to both frames (train first, then test)
train_featured, test_featured = engineer_features(train_df), engineer_features(test_df)

# Show new columns: anything present after engineering that the raw frame lacks
new_cols = set(train_featured.columns) - set(train_df.columns)
print("New engineered features: " + str(new_cols))

if len(new_cols) > 0:
    print("\nEngineered features statistics:")
    engineered_summary = train_featured[list(new_cols)].describe()
    display(engineered_summary)

## 8. Data Quality Summary

In [None]:
# Final data quality summary.
# Emoji in the headers were stored as mis-decoded bytes; they are restored
# here as escapes so the source stays ASCII-safe.
print("=" * 60)
print("\U0001F4CB DATA QUALITY SUMMARY")
print("=" * 60)

print(f"\n\U0001F4CA Dataset Sizes:")
print(f"   Training samples: {len(train_df):,}")
print(f"   Test samples: {len(test_df):,}")
print(f"   Features: {len(train_df.columns) - 1}")

print(f"\n\U0001F4B0 Target Variable (Price):")
print(f"   Range: ${train_df['price'].min():,.0f} - ${train_df['price'].max():,.0f}")
print(f"   Mean: ${train_df['price'].mean():,.0f}")
print(f"   Median: ${train_df['price'].median():,.0f}")

print(f"\n\U0001F5FA\ufe0f Geographic Coverage:")
print(f"   Latitude: {train_df[lat_col].min():.4f} to {train_df[lat_col].max():.4f}")
print(f"   Longitude: {train_df[lon_col].min():.4f} to {train_df[lon_col].max():.4f}")

print(f"\n\U0001F5BC\ufe0f Satellite Images:")
# The image folder is IMAGE_DIR; the name `cache_dir` used here before is
# never defined in this notebook and raised NameError
if os.path.exists(IMAGE_DIR):
    n_cached = len([f for f in os.listdir(IMAGE_DIR) if f.endswith('.png')])
    print(f"   Cached images: {n_cached}")
else:
    print(f"   Cached images: 0 (run data_fetcher.py)")

print(f"\n\u26a0\ufe0f Missing Values:")
missing_total = train_df.isnull().sum().sum()
print(f"   Total missing values: {missing_total}")
print(f"   Columns with missing data: {(train_df.isnull().sum() > 0).sum()}")

In [None]:
# Save processed data for training: pickle the engineered train and test frames
train_featured.to_pickle('../data/train_processed.pkl')
test_featured.to_pickle('../data/test_processed.pkl')
# The check mark was stored as mis-decoded bytes; restored via escape
print("\u2705 Processed data saved to data/train_processed.pkl and data/test_processed.pkl")

---
## 📝 Key Insights

Document your key findings here after running the analysis:

1. **Price Distribution**: [Your observation]
2. **Top Correlated Features**: [Your observation]
3. **Geographic Patterns**: [Your observation]
4. **Visual Features**: [Your observation about satellite images]