# Data Preprocessing - Foursquare POI and VIIRS Nighttime Lights

This notebook runs the preprocessing pipeline that prepares Foursquare point-of-interest (POI) data and VIIRS nighttime lights data for an analysis of the Hadapsar area of Pune.

## Objectives
1. Load and preprocess Foursquare POI data
2. Load and preprocess VIIRS nighttime lights data
3. Filter data to Hadapsar region
4. Prepare data for spatial integration


In [None]:
# Import required libraries
import json
import os
import sys
import warnings
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns
from rasterio.plot import show

# Make the project's src/ directory importable before loading custom modules
sys.path.append('../src')

# Import custom modules
from preprocessing.foursquare_processor import FoursquareProcessor
from preprocessing.viirs_processor import VIIRSProcessor

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 1. Foursquare POI Data Preprocessing

Let's start by processing the Foursquare POI data for the Hadapsar region.

In [None]:
# Build the Foursquare processor, pointing it at the ../data directory
foursquare_processor = FoursquareProcessor(data_dir="../data")

# Report the configuration the processor exposes, one item per line
print(
    "Foursquare processor initialized",
    f"Data directory: {foursquare_processor.data_dir}",
    f"Hadapsar bounds: {foursquare_processor.HADAPSAR_BOUNDS}",
    sep="\n",
)

In [None]:
# Process Foursquare POI data
# Runs the processor's pipeline and keeps whatever it returns in
# `poi_output_file`. The following cell hands that value to gpd.read_file,
# so it is expected to be the path of the written POI file.
print("Starting Foursquare data processing...")
poi_output_file = foursquare_processor.process_foursquare_data()
print(f"\nProcessing complete! Output saved to: {poi_output_file}")

In [None]:
# Read the processed POI file back into a GeoDataFrame
poi_gdf = gpd.read_file(poi_output_file)

# Overview: record count, column names, then the per-column dtypes
print(
    f"Loaded POI data: {len(poi_gdf)} records",
    f"\nColumns: {list(poi_gdf.columns)}",
    "\nData types:",
    sep="\n",
)
print(poi_gdf.dtypes)

# Preview the first rows with the notebook's rich table rendering
print("\nFirst 5 records:")
display(poi_gdf.head())

In [None]:
# Visualize POI categories: category shares (left) and POI locations (right)
fig, (pie_ax, map_ax) = plt.subplots(1, 2, figsize=(15, 6))

# Number of POIs in each category group
category_counts = poi_gdf['category_group'].value_counts()

# Left panel: category distribution as a pie chart
pie_ax.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
pie_ax.set_title('POI Category Distribution')

# Right panel: POI locations coloured by category group
poi_gdf.plot(ax=map_ax, column='category_group', legend=True, markersize=20, alpha=0.7)
map_ax.set(
    title='Spatial Distribution of POIs in Hadapsar',
    xlabel='Longitude',
    ylabel='Latitude',
)

plt.tight_layout()
plt.show()

# Print each category's count and its share of all POI records
print("\nCategory Statistics:")
total_pois = len(poi_gdf)
for category, count in category_counts.items():
    percentage = (count / total_pois) * 100
    print(f"{category}: {count} ({percentage:.1f}%)")

## 2. VIIRS Nighttime Lights Data Preprocessing

Now let's process the VIIRS nighttime lights data for the same region.

In [None]:
# Build the VIIRS processor, pointing it at the ../data directory
viirs_processor = VIIRSProcessor(data_dir="../data")

# Report the configuration the processor exposes, one item per line
print(
    "VIIRS processor initialized",
    f"Data directory: {viirs_processor.data_dir}",
    f"Study area bounds: {viirs_processor.HADAPSAR_BOUNDS}",
    sep="\n",
)

In [None]:
# Year passed to the VIIRS processor
VIIRS_YEAR = 2023

# Run the VIIRS pipeline; it returns a pair that is unpacked into the output
# file reference and a statistics mapping (iterated with .items() further down)
print("Starting VIIRS data processing...")
viirs_output_file, viirs_stats = viirs_processor.process_viirs_data(year=VIIRS_YEAR)
print(f"\nProcessing complete! Output saved to: {viirs_output_file}")

In [None]:
# Examine VIIRS statistics
# Prints every entry of `viirs_stats`, rounding floating-point values to four
# decimals and printing everything else as-is.
print("VIIRS Data Statistics:")
print("=" * 30)
for key, value in viirs_stats.items():
    # NumPy float scalars such as np.float32 are not subclasses of Python's
    # float, so np.floating is checked as well; otherwise such values would
    # skip the 4-decimal formatting.
    if isinstance(value, (float, np.floating)):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

In [None]:
# Load and visualize VIIRS data
# Produces `viirs_data` (band 1 as a masked array) plus the raster's transform,
# CRS and bounds for later cells, then draws the raster next to a histogram of
# its valid values.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Load the clipped VIIRS raster and draw it while the dataset is still open,
# so the file only has to be opened once.
with rasterio.open(viirs_output_file) as src:
    viirs_data = src.read(1, masked=True)
    viirs_transform = src.transform
    viirs_crs = src.crs
    viirs_bounds = src.bounds

    # VIIRS raster
    show(src, ax=ax1, cmap='viridis', title='VIIRS Nighttime Lights - Hadapsar')

print(f"VIIRS data shape: {viirs_data.shape}")
print(f"VIIRS CRS: {viirs_crs}")
print(f"VIIRS bounds: {viirs_bounds}")

# Luminosity histogram
# compressed() returns the unmasked values as a flat array and also works when
# the mask is the scalar np.ma.nomask, where indexing with `~mask` would not
# select per-pixel values.
valid_data = viirs_data.compressed()
ax2.hist(valid_data, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax2.set_xlabel('Luminosity Value')
ax2.set_ylabel('Frequency')
ax2.set_title('VIIRS Luminosity Distribution')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Data Quality Assessment

Let's assess the quality and coverage of both datasets.

In [None]:
# POI data quality assessment
print("POI Data Quality Assessment")
print("=" * 40)

# Null counts per column; only columns with at least one null are shown
missing_values = poi_gdf.isnull().sum()
print("Missing values:")
print(missing_values[missing_values > 0])

# Coordinate extent, read from the geometry column
poi_lats = poi_gdf.geometry.y
poi_lons = poi_gdf.geometry.x
lat_range = (poi_lats.min(), poi_lats.max())
lon_range = (poi_lons.min(), poi_lons.max())
print(f"\nLatitude range: {lat_range[0]:.6f} to {lat_range[1]:.6f}")
print(f"Longitude range: {lon_range[0]:.6f} to {lon_range[1]:.6f}")

# Rows that repeat an earlier (latitude, longitude) pair
# NOTE(review): assumes 'latitude' and 'longitude' attribute columns exist in
# the processed file — confirm against the processor output.
repeated_location = poi_gdf.duplicated(subset=['latitude', 'longitude'])
duplicates = repeated_location.sum()
print(f"\nDuplicate locations: {duplicates}")

# Category distribution
print(f"\nUnique categories: {poi_gdf['category_group'].nunique()}")
print("Category distribution:")
print(poi_gdf['category_group'].value_counts())

In [None]:
# VIIRS data quality assessment
# Reports pixel coverage and luminosity statistics for `viirs_data`, and leaves
# `valid_pixels`, `total_pixels`, `coverage_percent` and `valid_data` defined
# for the summary cell.
print("VIIRS Data Quality Assessment")
print("=" * 40)

# Values above this are counted as "high" (arbitrary threshold)
HIGH_LUMINOSITY_THRESHOLD = 100

# Basic statistics
# count() gives the number of unmasked pixels and stays correct when the mask
# is the scalar np.ma.nomask, where np.sum(~mask) would not count pixels.
valid_pixels = viirs_data.count()
total_pixels = viirs_data.size
# Guard the ratio so an empty raster reports 0% instead of dividing by zero
coverage_percent = (valid_pixels / total_pixels) * 100 if total_pixels else 0.0

print(f"Total pixels: {total_pixels:,}")
print(f"Valid pixels: {valid_pixels:,}")
print(f"Data coverage: {coverage_percent:.1f}%")

# Luminosity statistics
# compressed() yields the unmasked values as a flat, plain array
valid_data = viirs_data.compressed()
if len(valid_data) > 0:
    print("\nLuminosity statistics:")
    print(f"  Min: {valid_data.min():.4f}")
    print(f"  Max: {valid_data.max():.4f}")
    print(f"  Mean: {valid_data.mean():.4f}")
    print(f"  Median: {np.median(valid_data):.4f}")
    print(f"  Std: {valid_data.std():.4f}")

    # Check for anomalous values
    zero_values = np.sum(valid_data == 0)
    negative_values = np.sum(valid_data < 0)
    high_values = np.sum(valid_data > HIGH_LUMINOSITY_THRESHOLD)

    print("\nData distribution:")
    print(f"  Zero values: {zero_values} ({zero_values/len(valid_data)*100:.1f}%)")
    print(f"  Negative values: {negative_values}")
    print(f"  High values (>{HIGH_LUMINOSITY_THRESHOLD}): {high_values}")
else:
    # Make the empty case visible instead of silently printing nothing
    print("\nNo valid pixels found; luminosity statistics skipped.")

## 4. Spatial Coverage Analysis

Let's analyze the spatial coverage and alignment of both datasets.

In [None]:
# Create overlay visualization
# Draws the POIs on top of the VIIRS raster and then checks whether the POI
# bounding box lies inside the raster's bounding box.

# Express the POIs in the raster's CRS first, so the overlay and the bounds
# comparison use the same coordinates. Without this, a CRS mismatch would give
# a misplaced overlay and a meaningless containment result with no error.
if poi_gdf.crs is not None and viirs_crs:
    poi_overlay = poi_gdf.to_crs(viirs_crs.to_wkt())
else:
    poi_overlay = poi_gdf
    print("Warning: CRS missing on the POI or VIIRS data; assuming both share one CRS.")

fig, ax = plt.subplots(1, 1, figsize=(12, 10))

# Plot VIIRS data as background
with rasterio.open(viirs_output_file) as src:
    show(src, ax=ax, cmap='viridis', alpha=0.7)

# Overlay POI locations
poi_overlay.plot(ax=ax, column='category_group', legend=True,
                 markersize=30, alpha=0.8, edgecolors='white', linewidth=0.5)

ax.set_title('Spatial Overlay: POIs on VIIRS Nighttime Lights\nHadapsar, Pune',
             fontsize=14, fontweight='bold')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

# Add grid
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Calculate spatial extent comparison
# total_bounds is (minx, miny, maxx, maxy); the raster bounds are listed in the same order
poi_bounds = poi_overlay.total_bounds
viirs_bounds_list = [viirs_bounds.left, viirs_bounds.bottom, viirs_bounds.right, viirs_bounds.top]

print("Spatial Extent Comparison:")
print("=" * 30)
print(f"POI bounds: {poi_bounds}")
print(f"VIIRS bounds: {viirs_bounds_list}")

# Check if POI data falls within VIIRS coverage
poi_within_viirs = (
    poi_bounds[0] >= viirs_bounds.left and
    poi_bounds[1] >= viirs_bounds.bottom and
    poi_bounds[2] <= viirs_bounds.right and
    poi_bounds[3] <= viirs_bounds.top
)

print(f"\nPOI data within VIIRS coverage: {poi_within_viirs}")

## 5. Summary and Next Steps

Let's summarize the preprocessing results and prepare for data integration.

In [None]:
# Create summary statistics
# Collects headline figures from the earlier cells into `summary` and prints
# them together with the output file locations.

# Luminosity figures only exist when there is at least one valid pixel.
# The quality-assessment cell guards its statistics the same way; without the
# guard, min()/max()/mean() on empty data would break the formatting below.
if len(valid_data) > 0:
    luminosity_range = f"{valid_data.min():.2f} to {valid_data.max():.2f}"
    mean_luminosity = f"{valid_data.mean():.2f}"
else:
    luminosity_range = "n/a"
    mean_luminosity = "n/a"

summary = {
    'POI Data': {
        'total_records': len(poi_gdf),
        'categories': poi_gdf['category_group'].nunique(),
        # NOTE(review): the key name suggests an area in km², but the value is
        # a text label. Kept as-is so the saved summary keeps its shape;
        # consider storing a computed area or renaming the key.
        'spatial_extent_km2': 'Hadapsar region',
        'coordinate_system': str(poi_gdf.crs),
        'data_quality': 'Good' if missing_values.sum() == 0 else 'Needs attention'
    },
    'VIIRS Data': {
        'total_pixels': int(total_pixels),
        'valid_pixels': int(valid_pixels),
        'coverage_percent': f"{coverage_percent:.1f}%",
        'coordinate_system': str(viirs_crs),
        'luminosity_range': luminosity_range,
        'mean_luminosity': mean_luminosity
    }
}

print("PREPROCESSING SUMMARY")
print("=" * 50)

for dataset, metrics in summary.items():
    print(f"\n{dataset}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")

print("\n" + "=" * 50)
print("PREPROCESSING COMPLETED SUCCESSFULLY!")
print("=" * 50)

print("\nOutput Files:")
print(f"- POI data: {poi_output_file}")
print(f"- VIIRS data: {viirs_output_file}")

print("\nNext Steps:")
print("1. Run notebook 02_data_integration.ipynb for spatial integration")
print("2. Perform exploratory data analysis")
print("3. Create visualizations and dashboards")

In [None]:
# Save summary to file
# Writes `summary` as JSON and then lists every file under ../data with its size.

# Save preprocessing summary
summary_file = "../data/processed/preprocessing_summary.json"
# Create the target folder if needed so open() cannot fail on a missing directory
Path(summary_file).parent.mkdir(parents=True, exist_ok=True)
with open(summary_file, 'w', encoding='utf-8') as f:
    # default=str stringifies any value json cannot serialise natively
    json.dump(summary, f, indent=2, default=str)

print(f"Preprocessing summary saved to: {summary_file}")

# Display file structure
data_dir = Path("../data")
print("\nData directory structure:")
for item in sorted(data_dir.rglob("*")):
    if item.is_file():
        # Show paths relative to the parent of the data directory
        relative_path = item.relative_to(data_dir.parent)
        file_size = item.stat().st_size / 1024  # KB
        print(f"  {relative_path} ({file_size:.1f} KB)")