# Data Integration - POI and VIIRS Spatial Analysis

This notebook demonstrates the spatial integration of preprocessed Foursquare POI data with VIIRS nighttime lights data for comprehensive urban analysis.

## Objectives
1. Integrate POI and VIIRS datasets through spatial joins
2. Extract luminosity values at POI locations
3. Perform clustering analysis
4. Identify anomalous patterns
5. Generate integrated dataset for analysis


In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import rasterio
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Import custom modules
from analysis.data_integration import DataIntegrator

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Load Preprocessed Data

Let's load the preprocessed POI and VIIRS data from the previous step.

In [None]:
# Build the integrator object that the following cells use for loading and analysis
integrator = DataIntegrator(data_dir="../data")

print("Data integrator initialized")
# Echo the two directories the integrator exposes
for label, location in (("Data directory", integrator.data_dir),
                        ("Processed data directory", integrator.processed_dir)):
    print(f"{label}: {location}")

In [None]:
# Read the POI layer through the integrator
print("Loading POI data...")
poi_gdf = integrator.load_poi_data()

# Report size, schema and coordinate reference system of the loaded layer
poi_count = len(poi_gdf)
poi_columns = list(poi_gdf.columns)
print(f"Loaded {poi_count} POIs")
print(f"Columns: {poi_columns}")
print(f"CRS: {poi_gdf.crs}")

# Preview the first three records
display(poi_gdf.head(3))

In [None]:
# Load VIIRS data (raster values, affine transform and CRS)
print("Loading VIIRS data...")
viirs_data, viirs_transform, viirs_crs = integrator.load_viirs_data()

# np.ma.getmaskarray always returns a full boolean array of the data's shape.
# Reading `.mask` directly is fragile: when nothing is masked it can be the
# scalar `nomask`, and `np.sum(~nomask)` then reports 1 valid pixel; a plain
# ndarray has no `.mask` attribute at all.
valid_pixel_count = int(np.count_nonzero(~np.ma.getmaskarray(viirs_data)))

print(f"VIIRS data shape: {viirs_data.shape}")
print(f"VIIRS CRS: {viirs_crs}")
print(f"VIIRS transform: {viirs_transform}")
print(f"Valid pixels: {valid_pixel_count:,}")
print(f"Luminosity range: {viirs_data.min():.2f} to {viirs_data.max():.2f}")

## 2. Spatial Integration - Extract Luminosity to POIs

Now we'll extract VIIRS luminosity values at each POI location.

In [None]:
# Extract luminosity values to POI locations
print("Extracting VIIRS luminosity values to POI locations...")
integrated_gdf = integrator.extract_luminosity_to_pois(poi_gdf)

print(f"Integration complete! Added luminosity data to {len(integrated_gdf)} POIs")

# Summarise the new column when it is present
if 'viirs_luminosity' in integrated_gdf.columns:
    print(f"\nLuminosity statistics:")
    lum_stats = integrated_gdf['viirs_luminosity'].describe()
    print(lum_stats)

    # Check for POIs with zero luminosity
    zero_lum = (integrated_gdf['viirs_luminosity'] == 0).sum()
    print(f"\nPOIs with zero luminosity: {zero_lum} ({zero_lum/len(integrated_gdf)*100:.1f}%)")
else:
    # Make the missing column visible instead of failing later with a KeyError
    print("\nWarning: 'viirs_luminosity' column not found in the integrated data")

# Display a sample. Only request columns that exist, so the preview agrees
# with the guard above rather than raising when a column is absent.
preview_columns = [col for col in ('name', 'category_group', 'viirs_luminosity')
                   if col in integrated_gdf.columns]
display(integrated_gdf[preview_columns].head())

In [None]:
# Visualize the integration results on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Spatial distribution colored by luminosity
ax1 = axes[0, 0]
scatter = ax1.scatter(integrated_gdf.geometry.x, integrated_gdf.geometry.y,
                      c=integrated_gdf['viirs_luminosity'], cmap='viridis',
                      s=30, alpha=0.7, edgecolors='white', linewidth=0.5)
plt.colorbar(scatter, ax=ax1, label='VIIRS Luminosity')
ax1.set_title('POI Locations colored by Luminosity')
ax1.set_xlabel('Longitude')
ax1.set_ylabel('Latitude')
ax1.grid(True, alpha=0.3)

# 2. Luminosity distribution by category
ax2 = axes[0, 1]
integrated_gdf.boxplot(column='viirs_luminosity', by='category_group', ax=ax2)
# pandas adds a figure-level "Boxplot grouped by ..." title when `by` is used.
# On a multi-panel figure it sits above all four panels and is misleading,
# so clear it and rely on the per-axes title set below.
fig.suptitle('')
ax2.set_title('Luminosity Distribution by Category')
ax2.set_xlabel('Category')
ax2.set_ylabel('VIIRS Luminosity')
plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)

# 3. Luminosity histogram
ax3 = axes[1, 0]
ax3.hist(integrated_gdf['viirs_luminosity'], bins=30, alpha=0.7,
         color='skyblue', edgecolor='black')
ax3.set_xlabel('VIIRS Luminosity')
ax3.set_ylabel('Frequency')
ax3.set_title('Distribution of Luminosity Values')
ax3.grid(True, alpha=0.3)

# 4. Category vs mean luminosity (brightest category first)
ax4 = axes[1, 1]
mean_lum_by_category = integrated_gdf.groupby('category_group')['viirs_luminosity'].mean().sort_values(ascending=False)
bars = ax4.bar(range(len(mean_lum_by_category)), mean_lum_by_category.values)
ax4.set_xticks(range(len(mean_lum_by_category)))
ax4.set_xticklabels(mean_lum_by_category.index, rotation=45, ha='right')
ax4.set_ylabel('Mean VIIRS Luminosity')
ax4.set_title('Mean Luminosity by Category')
ax4.grid(True, alpha=0.3)

# Add values on bars
for bar in bars:
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{height:.1f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 3. Clustering Analysis

Let's perform clustering analysis based on POI locations and luminosity values.

In [None]:
# Assign every POI to one of five clusters via the integrator
print("Performing clustering analysis...")
clustered_gdf = integrator.identify_poi_luminosity_clusters(integrated_gdf, n_clusters=5)

print(f"Clustering complete! Added cluster assignments to {len(clustered_gdf)} POIs")

# Per-cluster summary: luminosity count/mean/std plus the most frequent
# category ('Mixed' when no mode can be determined)
cluster_aggregations = {
    'viirs_luminosity': ['count', 'mean', 'std'],
    'category_group': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Mixed',
}
per_cluster = clustered_gdf.groupby('cluster')
cluster_stats = per_cluster.agg(cluster_aggregations).round(2)

print("\nCluster Statistics:")
print(cluster_stats)

# Preview the first eight POIs together with their cluster labels
preview_columns = ['name', 'category_group', 'viirs_luminosity', 'cluster']
display(clustered_gdf[preview_columns].head(8))

In [None]:
# Visualize clusters on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Define colors for clusters
cluster_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57']

# One cluster-id -> color mapping shared by every panel. Cycling through the
# palette avoids an IndexError when there are more clusters than colors, and
# keying by id (not by position) keeps the panels consistent with each other.
cluster_ids = sorted(clustered_gdf['cluster'].unique())
cluster_color_map = {
    cluster_id: cluster_colors[position % len(cluster_colors)]
    for position, cluster_id in enumerate(cluster_ids)
}

# 1. Spatial distribution of clusters
ax1 = axes[0, 0]
for cluster_id in cluster_ids:
    cluster_data = clustered_gdf[clustered_gdf['cluster'] == cluster_id]
    ax1.scatter(cluster_data.geometry.x, cluster_data.geometry.y,
                c=cluster_color_map[cluster_id], label=f'Cluster {cluster_id}',
                s=40, alpha=0.7, edgecolors='white', linewidth=0.5)

ax1.set_title('Spatial Distribution of Clusters')
ax1.set_xlabel('Longitude')
ax1.set_ylabel('Latitude')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Cluster sizes
ax2 = axes[0, 1]
cluster_counts = clustered_gdf['cluster'].value_counts().sort_index()
bars = ax2.bar(cluster_counts.index, cluster_counts.values,
               color=[cluster_color_map[cluster_id] for cluster_id in cluster_counts.index])
ax2.set_xlabel('Cluster ID')
ax2.set_ylabel('Number of POIs')
ax2.set_title('Cluster Sizes')
ax2.set_xticks(cluster_counts.index)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 0.5,
             f'{int(height)}', ha='center', va='bottom')

# 3. Luminosity by cluster
ax3 = axes[1, 0]
clustered_gdf.boxplot(column='viirs_luminosity', by='cluster', ax=ax3)
# pandas adds a figure-level "Boxplot grouped by ..." title when `by` is used;
# clear it so it does not appear above the whole 2x2 grid.
fig.suptitle('')
ax3.set_title('Luminosity Distribution by Cluster')
ax3.set_xlabel('Cluster ID')
ax3.set_ylabel('VIIRS Luminosity')

# 4. Cluster characteristics (2D scatter)
# Cluster ids are discrete labels, so draw one series per cluster with the
# shared palette and a legend instead of a continuous colorbar.
ax4 = axes[1, 1]
for cluster_id in cluster_ids:
    cluster_data = clustered_gdf[clustered_gdf['cluster'] == cluster_id]
    ax4.scatter(cluster_data.geometry.x, cluster_data['viirs_luminosity'],
                c=cluster_color_map[cluster_id], label=f'Cluster {cluster_id}',
                s=40, alpha=0.7)
ax4.set_xlabel('Longitude')
ax4.set_ylabel('VIIRS Luminosity')
ax4.set_title('Clusters in Feature Space (Longitude vs Luminosity)')
ax4.legend(title='Cluster ID')

plt.tight_layout()
plt.show()

## 4. Anomaly Detection

Let's identify areas with unusual POI-luminosity relationships.

In [None]:
# Label every POI with an anomaly type via the integrator
print("Identifying anomalous POI-luminosity patterns...")
final_gdf = integrator.find_anomalous_areas(clustered_gdf)

print(f"Anomaly detection complete!")

# Count and share of each anomaly type
anomaly_counts = final_gdf['anomaly_type'].value_counts()
n_labelled = len(final_gdf)
print("\nAnomaly Type Distribution:")
for type_label, n_pois in anomaly_counts.items():
    share = (n_pois / n_labelled) * 100
    print(f"{type_label}: {n_pois} ({share:.1f}%)")

# Show a few POIs whose label is anything other than 'Normal'
print("\nSample Anomalous POIs:")
is_anomalous = final_gdf['anomaly_type'] != 'Normal'
anomalous_pois = final_gdf[is_anomalous]
if len(anomalous_pois) == 0:
    print("No anomalies detected with current thresholds.")
else:
    sample_columns = ['name', 'category_group', 'viirs_luminosity',
                      'cluster', 'anomaly_type']
    display(anomalous_pois[sample_columns].head())

In [None]:
# Visualize anomalies on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Define colors for anomaly types; unknown types fall back to grey
anomaly_colors = {
    'Normal': '#28a745',
    'High Light, Non-Commercial': '#ffc107',
    'Low Light, Commercial': '#dc3545'
}
fallback_color = '#888888'

# Recomputed here so the figure always reflects the current final_gdf
anomaly_counts = final_gdf['anomaly_type'].value_counts()
anomaly_types = final_gdf['anomaly_type'].unique()

# 1. Spatial distribution of anomalies
ax1 = axes[0, 0]
for anomaly_type in anomaly_types:
    anomaly_data = final_gdf[final_gdf['anomaly_type'] == anomaly_type]
    color = anomaly_colors.get(anomaly_type, fallback_color)
    ax1.scatter(anomaly_data.geometry.x, anomaly_data.geometry.y,
                c=color, label=f'{anomaly_type} ({len(anomaly_data)})',
                s=40, alpha=0.7, edgecolors='white', linewidth=0.5)

ax1.set_title('Spatial Distribution of Anomalies')
ax1.set_xlabel('Longitude')
ax1.set_ylabel('Latitude')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Anomaly type distribution pie chart
ax2 = axes[0, 1]
colors_list = [anomaly_colors.get(atype, fallback_color) for atype in anomaly_counts.index]
wedges, texts, autotexts = ax2.pie(anomaly_counts.values, labels=anomaly_counts.index,
                                   autopct='%1.1f%%', colors=colors_list)
ax2.set_title('Anomaly Type Distribution')

# 3. Luminosity vs category for anomalies
# Encode categories ONCE over the full frame. Encoding inside the loop (per
# anomaly subset) assigns codes relative to whichever categories occur in that
# subset, so the same x position would mean different categories per series.
ax3 = axes[1, 0]
category_series = final_gdf['category_group'].astype('category')
category_names = list(category_series.cat.categories)
category_codes = category_series.cat.codes.to_numpy()
luminosity_values = final_gdf['viirs_luminosity'].to_numpy()
for anomaly_type in anomaly_types:
    in_group = (final_gdf['anomaly_type'] == anomaly_type).to_numpy()
    color = anomaly_colors.get(anomaly_type, fallback_color)
    ax3.scatter(category_codes[in_group], luminosity_values[in_group],
                c=color, label=anomaly_type, s=30, alpha=0.7)

# Label the ticks with the real category names so the encoding is readable
ax3.set_xticks(range(len(category_names)))
ax3.set_xticklabels(category_names, rotation=45, ha='right')
ax3.set_xlabel('Category')
ax3.set_ylabel('VIIRS Luminosity')
ax3.set_title('Anomalies: Category vs Luminosity')
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Anomaly characteristics by cluster (stacked counts)
ax4 = axes[1, 1]
anomaly_cluster_crosstab = pd.crosstab(final_gdf['cluster'], final_gdf['anomaly_type'])
anomaly_cluster_crosstab.plot(kind='bar', ax=ax4, stacked=True,
                              color=[anomaly_colors.get(col, fallback_color)
                                     for col in anomaly_cluster_crosstab.columns])
ax4.set_xlabel('Cluster ID')
ax4.set_ylabel('Number of POIs')
ax4.set_title('Anomaly Types by Cluster')
ax4.legend(title='Anomaly Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax4.xaxis.get_majorticklabels(), rotation=0)

plt.tight_layout()
plt.show()

## 5. Comprehensive Integration Analysis

Let's calculate comprehensive spatial statistics and summarize the integrated dataset in a single dashboard. The dataset itself is saved in the next section.

In [None]:
# Ask the integrator for the statistics dictionary, then print it section by section
print("Calculating comprehensive spatial statistics...")
spatial_stats = integrator.calculate_spatial_statistics(final_gdf)

print("\nSpatial Statistics Summary:")
print("=" * 40)

# Headline figures
print(f"Total POIs: {spatial_stats['total_pois']:,}")
print(f"Categories: {spatial_stats['categories']}")

# Overall luminosity summary (optional section)
if 'luminosity' in spatial_stats:
    luminosity_summary = spatial_stats['luminosity']
    print(f"\nLuminosity Statistics:")
    print(f"  Range: {luminosity_summary['min']:.2f} to {luminosity_summary['max']:.2f}")
    for label, key in (("Mean", 'mean'), ("Median", 'median'), ("Std Dev", 'std')):
        print(f"  {label}: {luminosity_summary[key]:.2f}")

# Per-category luminosity summary (optional section)
if 'category_luminosity' in spatial_stats:
    print(f"\nCategory-wise Luminosity:")
    for category_name, category_summary in spatial_stats['category_luminosity'].items():
        print(f"  {category_name}:")
        print(f"    Count: {category_summary['count']}")
        print(f"    Mean luminosity: {category_summary['mean_luminosity']:.2f}")
        print(f"    Median luminosity: {category_summary['median_luminosity']:.2f}")

# Bounding box of the data (optional section)
if 'spatial_extent' in spatial_stats:
    bounding_box = spatial_stats['spatial_extent']
    print(f"\nSpatial Extent:")
    for axis_label, low_key, high_key in (("Longitude", 'minx', 'maxx'),
                                          ("Latitude", 'miny', 'maxy')):
        print(f"  {axis_label}: {bounding_box[low_key]:.6f} to {bounding_box[high_key]:.6f}")

In [None]:
# Create comprehensive summary visualization
# Eight panels on a 3x4 grid summarising final_gdf: a large map, two pies, a
# bar chart, a histogram, a boxplot, a horizontal bar chart and a correlation
# heatmap.
# NOTE: this cell reuses `cluster_colors` and `anomaly_colors`, which are
# defined in the earlier cluster and anomaly visualization cells; those cells
# must have been run first.
fig = plt.figure(figsize=(20, 15))

# Create a complex subplot layout
gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

# 1. Main spatial distribution (large plot)
# Spans the first two rows and first two columns of the grid.
ax_main = fig.add_subplot(gs[0:2, 0:2])
scatter = ax_main.scatter(final_gdf.geometry.x, final_gdf.geometry.y,
                         c=final_gdf['viirs_luminosity'], s=60, 
                         cmap='viridis', alpha=0.8, edgecolors='white', linewidth=0.5)
plt.colorbar(scatter, ax=ax_main, label='VIIRS Luminosity')
ax_main.set_title('Integrated POI-VIIRS Dataset\nHadapsar, Pune', fontsize=16, fontweight='bold')
ax_main.set_xlabel('Longitude')
ax_main.set_ylabel('Latitude')
ax_main.grid(True, alpha=0.3)

# 2. Category distribution
ax2 = fig.add_subplot(gs[0, 2])
category_counts = final_gdf['category_group'].value_counts()
ax2.pie(category_counts.values, labels=category_counts.index, autopct='%1.0f%%')
ax2.set_title('POI Categories')

# 3. Cluster distribution
# NOTE(review): colors are picked by position from the five-entry
# `cluster_colors` list, so more than five clusters would raise IndexError.
ax3 = fig.add_subplot(gs[0, 3])
cluster_counts = final_gdf['cluster'].value_counts().sort_index()
bars = ax3.bar(cluster_counts.index, cluster_counts.values, 
               color=[cluster_colors[i] for i in range(len(cluster_counts))])
ax3.set_title('Cluster Distribution')
ax3.set_xlabel('Cluster ID')
ax3.set_ylabel('Count')

# 4. Anomaly distribution
# Anomaly types missing from `anomaly_colors` are drawn in grey ('#888888').
ax4 = fig.add_subplot(gs[1, 2])
anomaly_counts = final_gdf['anomaly_type'].value_counts()
colors_list = [anomaly_colors.get(atype, '#888888') for atype in anomaly_counts.index]
ax4.pie(anomaly_counts.values, labels=anomaly_counts.index, autopct='%1.0f%%', colors=colors_list)
ax4.set_title('Anomaly Types')

# 5. Luminosity histogram
ax5 = fig.add_subplot(gs[1, 3])
ax5.hist(final_gdf['viirs_luminosity'], bins=25, alpha=0.7, color='skyblue', edgecolor='black')
ax5.set_title('Luminosity Distribution')
ax5.set_xlabel('VIIRS Luminosity')
ax5.set_ylabel('Frequency')
ax5.grid(True, alpha=0.3)

# 6. Category vs luminosity boxplot
# NOTE(review): pandas' grouped boxplot is known to set its own figure-level
# suptitle; the plt.suptitle call at the end of this cell presumably replaces
# it, so keep that call after this one — confirm.
ax6 = fig.add_subplot(gs[2, 0:2])
final_gdf.boxplot(column='viirs_luminosity', by='category_group', ax=ax6)
ax6.set_title('Luminosity by Category')
ax6.set_xlabel('Category')
ax6.set_ylabel('VIIRS Luminosity')
plt.setp(ax6.xaxis.get_majorticklabels(), rotation=45, ha='right')

# 7. Mean luminosity by category
# Sorted ascending so the category with the highest mean is the last bar.
ax7 = fig.add_subplot(gs[2, 2])
mean_lum = final_gdf.groupby('category_group')['viirs_luminosity'].mean().sort_values(ascending=True)
bars = ax7.barh(range(len(mean_lum)), mean_lum.values)
ax7.set_yticks(range(len(mean_lum)))
ax7.set_yticklabels(mean_lum.index)
ax7.set_title('Mean Luminosity by Category')
ax7.set_xlabel('Mean VIIRS Luminosity')

# 8. Correlation matrix (simplified)
ax8 = fig.add_subplot(gs[2, 3])
# Create a simple correlation matrix
# Luminosity is correlated against two 0/1 indicator columns: 1 where
# category_group equals 'Commercial' / 'Essential Services', otherwise 0.
corr_data = final_gdf[['viirs_luminosity']].copy()
corr_data['commercial'] = (final_gdf['category_group'] == 'Commercial').astype(int)
corr_data['essential'] = (final_gdf['category_group'] == 'Essential Services').astype(int)
corr_matrix = corr_data.corr()

# NOTE(review): `im` is not used again in this cell, so the heatmap has no
# colorbar; the numeric annotations below are the only scale shown.
im = ax8.imshow(corr_matrix, cmap='RdBu', vmin=-1, vmax=1)
ax8.set_xticks(range(len(corr_matrix.columns)))
ax8.set_yticks(range(len(corr_matrix.columns)))
ax8.set_xticklabels(corr_matrix.columns, rotation=45, ha='right')
ax8.set_yticklabels(corr_matrix.columns)
ax8.set_title('Correlation Matrix')

# Add correlation values
# Text is placed at (column j, row i) to match imshow's x/y orientation.
for i in range(len(corr_matrix)):
    for j in range(len(corr_matrix.columns)):
        ax8.text(j, i, f'{corr_matrix.iloc[i, j]:.2f}', 
                ha='center', va='center', fontsize=10)

# Figure-level title for the whole dashboard
plt.suptitle('Comprehensive Integration Analysis Dashboard\nHadapsar POI-VIIRS Study', 
             fontsize=20, fontweight='bold', y=0.98)

plt.show()

## 6. Save Integrated Dataset

Finally, let's save our integrated dataset for use in subsequent analysis steps.

In [None]:
import json

# Resolve every output location up front; all artefacts go to the
# integrator's processed-data directory
processed_dir = integrator.processed_dir
output_file = processed_dir / "integrated_poi_viirs.geojson"
parquet_file = processed_dir / "integrated_poi_viirs.parquet"
stats_file = processed_dir / "integration_statistics.json"

# Full dataset as GeoJSON
final_gdf.to_file(output_file, driver='GeoJSON')

# Parquet copy for faster loading; the geometry column is dropped first,
# so this file holds attributes only
attributes_only = final_gdf.drop(columns=['geometry'])
attributes_only.to_parquet(parquet_file)

print(f"Integrated dataset saved:")
print(f"  GeoJSON: {output_file}")
print(f"  Parquet: {parquet_file}")

# Spatial statistics as JSON; default=str stringifies any value that json
# cannot encode natively
with open(stats_file, 'w') as stats_handle:
    json.dump(spatial_stats, stats_handle, indent=2, default=str)

print(f"  Statistics: {stats_file}")

In [None]:
# Create final summary: collect headline numbers, print them, and persist them as JSON
def _counts_as_builtin(column):
    """Return {label: count} for a final_gdf column, with counts as built-in ints.

    dict(series.value_counts()) keeps numpy integer values. json.dump cannot
    encode those natively, so with default=str they are written as strings
    such as "12". Converting here keeps the saved counts numeric.
    """
    return {label: int(n) for label, n in final_gdf[column].value_counts().items()}


integration_summary = {
    'dataset_info': {
        'total_pois': len(final_gdf),
        'categories': int(final_gdf['category_group'].nunique()),
        'clusters': int(final_gdf['cluster'].nunique()),
        'anomaly_types': int(final_gdf['anomaly_type'].nunique())
    },
    'luminosity_analysis': {
        'mean_luminosity': float(final_gdf['viirs_luminosity'].mean()),
        'median_luminosity': float(final_gdf['viirs_luminosity'].median()),
        'max_luminosity': float(final_gdf['viirs_luminosity'].max()),
        'zero_luminosity_count': int((final_gdf['viirs_luminosity'] == 0).sum())
    },
    'category_analysis': _counts_as_builtin('category_group'),
    'cluster_analysis': _counts_as_builtin('cluster'),
    'anomaly_analysis': _counts_as_builtin('anomaly_type'),
    'spatial_extent': {
        'bounds': final_gdf.total_bounds.tolist(),
        'crs': str(final_gdf.crs)
    }
}

print("\n" + "=" * 60)
print("INTEGRATION ANALYSIS SUMMARY")
print("=" * 60)

print(f"\nDataset Information:")
for key, value in integration_summary['dataset_info'].items():
    print(f"  {key.replace('_', ' ').title()}: {value:,}")

print(f"\nLuminosity Analysis:")
for key, value in integration_summary['luminosity_analysis'].items():
    # Counts are integers (thousands separator); everything else is a float
    if 'count' in key:
        print(f"  {key.replace('_', ' ').title()}: {value:,}")
    else:
        print(f"  {key.replace('_', ' ').title()}: {value:.2f}")

print(f"\nTop 3 Categories by Count:")
top_categories = sorted(integration_summary['category_analysis'].items(),
                        key=lambda x: x[1], reverse=True)[:3]
for category, count in top_categories:
    percentage = (count / integration_summary['dataset_info']['total_pois']) * 100
    print(f"  {category}: {count:,} ({percentage:.1f}%)")

print(f"\nAnomaly Distribution:")
for anomaly_type, count in integration_summary['anomaly_analysis'].items():
    percentage = (count / integration_summary['dataset_info']['total_pois']) * 100
    print(f"  {anomaly_type}: {count:,} ({percentage:.1f}%)")

print("\n" + "=" * 60)
print("INTEGRATION COMPLETED SUCCESSFULLY!")
print("=" * 60)

print(f"\nNext Steps:")
print(f"1. Run notebook 03_exploratory_analysis.ipynb for detailed analysis")
print(f"2. Run notebook 04_visualization.ipynb for creating visualizations")
print(f"3. Generate final reports and presentations")

# Save integration summary (default=str remains as a safety net for any
# value json still cannot encode)
summary_file = integrator.processed_dir / "integration_summary.json"
with open(summary_file, 'w') as f:
    json.dump(integration_summary, f, indent=2, default=str)

print(f"\nIntegration summary saved to: {summary_file}")