# Wildfire Data Exploration

This notebook explores the data collected for wildfire prediction, analyzing patterns, distributions, and relationships in the data.

## Table of Contents
1. Data Collection and Loading
2. Exploratory Data Analysis
3. Spatial Analysis
4. Temporal Analysis
5. Feature Correlations
6. Data Quality Assessment

In [None]:
import sys
import os
from pathlib import Path

# Add project root to path
project_root = str(Path().absolute().parent)
sys.path.append(project_root)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from datetime import datetime, timedelta

from src.data.collector import WildfireDataCollector
from src.data.processor import WildfireDataProcessor
from src.config import config

%matplotlib inline
plt.style.use('seaborn')
sns.set_style('whitegrid')

## 1. Data Collection and Loading

In [None]:
# Build the collector from the configured region bounds and the processor
# from the full project config.
collector = WildfireDataCollector(config['data'].region_bounds)
processor = WildfireDataProcessor(config)

# Collect the raw data for 2020-01-01 through 2023-12-31.
# The analysis cells below index the result as raw_data['viirs'] and
# raw_data['weather'][<variable name>].
# NOTE(review): whether end_date is inclusive depends on collect_all_data - confirm.
raw_data = collector.collect_all_data(
    start_date='2020-01-01',
    end_date='2023-12-31'
)

# Run the project's preparation step on the raw collection.
# NOTE(review): the analysis cells visible in this notebook all take raw_data;
# processed_data does not appear to be used by them.
processed_data = processor.prepare_data(raw_data)

## 2. Exploratory Data Analysis

In [None]:
def plot_data_summary(data):
    """Draw a 2x2 overview figure of fire counts and weather variables.

    Panels, left to right and top to bottom: a histogram of the fire totals
    obtained by summing 'viirs' over axes 1 and 2, then box plots of
    temperature, relative humidity and wind speed over all values.

    Args:
        data: Mapping with a 'viirs' array and a 'weather' mapping holding
            'temperature_2m', 'relative_humidity_2m',
            'u_component_of_wind_10m' and 'v_component_of_wind_10m' arrays.
    """
    _, panels = plt.subplots(2, 2, figsize=(15, 12))
    (fire_ax, temperature_ax), (humidity_ax, wind_ax) = panels

    # Fire totals: one value per entry along the first axis.
    fire_totals = data['viirs'].sum(axis=(1, 2))
    sns.histplot(fire_totals, ax=fire_ax)
    fire_ax.set_title('Distribution of Fire Occurrences')
    fire_ax.set_xlabel('Number of Fire Pixels')

    weather = data['weather']

    # Temperature, pooled over every value in the array.
    temperature_values = weather['temperature_2m'].flatten()
    sns.boxplot(data=temperature_values, ax=temperature_ax)
    temperature_ax.set_title('Temperature Distribution')

    # Relative humidity, pooled the same way.
    humidity_values = weather['relative_humidity_2m'].flatten()
    sns.boxplot(data=humidity_values, ax=humidity_ax)
    humidity_ax.set_title('Humidity Distribution')

    # Wind speed is the magnitude of the (u, v) component vector.
    u_wind = weather['u_component_of_wind_10m']
    v_wind = weather['v_component_of_wind_10m']
    wind_speed = np.sqrt(u_wind**2 + v_wind**2)
    sns.boxplot(data=wind_speed.flatten(), ax=wind_ax)
    wind_ax.set_title('Wind Speed Distribution')

    plt.tight_layout()
    plt.show()


plot_data_summary(raw_data)

## 3. Spatial Analysis

In [None]:
def create_fire_heatmap(data):
    """Create a folium map with a heat layer of accumulated fire detections.

    'viirs' is summed over its first axis; every grid cell whose total is
    positive contributes one weighted point. Axis 1 is mapped linearly onto
    the configured latitude range and axis 2 onto the longitude range.

    Args:
        data: Mapping with a 'viirs' array indexed as [step, lat_idx, lon_idx].

    Returns:
        folium.Map centred on the configured region. The heat layer is added
        only when at least one cell has a positive total.
    """
    # HeatMap lives in the folium.plugins subpackage; the top-level folium
    # namespace does not export it.
    from folium.plugins import HeatMap

    bounds = config['data'].region_bounds
    lat_min, lat_max = bounds['lat_min'], bounds['lat_max']
    lon_min, lon_max = bounds['lon_min'], bounds['lon_max']

    m = folium.Map(
        location=[(lat_min + lat_max) / 2, (lon_min + lon_max) / 2],
        zoom_start=10,
    )

    # Reduce over the first axis once instead of once (twice, for fire cells)
    # per grid cell inside a Python loop.
    fire_totals = np.asarray(data['viirs']).sum(axis=0)
    n_lat, n_lon = fire_totals.shape

    # np.nonzero walks the grid in row-major order, i.e. the same order as a
    # nested lat/lon loop. NaN totals compare False and are skipped.
    lat_indices, lon_indices = np.nonzero(fire_totals > 0)

    # Each point is placed at the lower-bound corner of its cell.
    # NOTE(review): this assumes index 0 corresponds to lat_min / lon_min -
    # confirm against the collector's grid orientation.
    fire_points = [
        [
            float(lat_min + i * (lat_max - lat_min) / n_lat),
            float(lon_min + j * (lon_max - lon_min) / n_lon),
            float(fire_totals[i, j]),  # plain float keeps the layer data JSON-friendly
        ]
        for i, j in zip(lat_indices, lon_indices)
    ]

    # Nothing to draw when no cell recorded a fire.
    if fire_points:
        HeatMap(fire_points).add_to(m)
    return m


heatmap = create_fire_heatmap(raw_data)
heatmap

## 4. Temporal Analysis

In [None]:
def plot_temporal_patterns(data):
    """Plot the fire-count series and its 30-step block averages.

    The top panel shows 'viirs' summed over axes 1 and 2 for each entry of
    the first axis. The bottom panel averages that series over consecutive
    blocks of 30 entries; these are fixed-size blocks rather than calendar
    months, and the final block may be shorter than 30.

    Args:
        data: Mapping with a 'viirs' array.
    """
    _, (daily_ax, monthly_ax) = plt.subplots(2, 1, figsize=(15, 10))

    # One total per entry along the first axis.
    daily_fires = data['viirs'].sum(axis=(1, 2))
    daily_ax.plot(daily_fires)
    daily_ax.set_title('Daily Fire Occurrences')
    daily_ax.set_xlabel('Day')
    daily_ax.set_ylabel('Number of Fire Pixels')

    # Mean over each consecutive 30-entry block.
    window = 30
    block_means = []
    for start in range(0, len(daily_fires), window):
        block_means.append(daily_fires[start:start + window].mean())
    monthly_fires = np.array(block_means)

    monthly_ax.plot(monthly_fires)
    monthly_ax.set_title('Monthly Fire Occurrences')
    monthly_ax.set_xlabel('Month')
    monthly_ax.set_ylabel('Average Number of Fire Pixels')

    plt.tight_layout()
    plt.show()


plot_temporal_patterns(raw_data)

## 5. Feature Correlations

In [None]:
def analyze_correlations(data):
    """Show a correlation heatmap between fire totals and weather averages.

    Each input is reduced over axes 1 and 2 (a sum for 'viirs', a mean for
    the weather variables), giving one value per entry of the first axis.
    The pairwise correlations of those series are drawn as an annotated
    heatmap.

    Args:
        data: Mapping with a 'viirs' array and a 'weather' mapping holding
            'temperature_2m', 'relative_humidity_2m',
            'u_component_of_wind_10m', 'v_component_of_wind_10m' and
            'total_precipitation' arrays.
    """
    grid_axes = (1, 2)

    fires = data['viirs'].sum(axis=grid_axes)
    weather = data['weather']
    temperature = weather['temperature_2m'].mean(axis=grid_axes)
    humidity = weather['relative_humidity_2m'].mean(axis=grid_axes)

    # Wind speed is the magnitude of the (u, v) vector, averaged afterwards.
    u_wind = weather['u_component_of_wind_10m']
    v_wind = weather['v_component_of_wind_10m']
    wind_speed = np.sqrt(u_wind**2 + v_wind**2).mean(axis=grid_axes)

    precipitation = weather['total_precipitation'].mean(axis=grid_axes)

    # Column order determines the row/column order of the heatmap.
    df = pd.DataFrame({
        'fires': fires,
        'temperature': temperature,
        'humidity': humidity,
        'wind_speed': wind_speed,
        'precipitation': precipitation,
    })

    plt.figure(figsize=(10, 8))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Correlations')
    plt.show()


analyze_correlations(raw_data)

## 6. Data Quality Assessment

In [None]:
def assess_data_quality(data):
    """Summarise completeness and value ranges of the dataset.

    Args:
        data: Mapping with a 'viirs' array and a 'weather' mapping of numeric
            arrays keyed by 'temperature_2m', 'relative_humidity_2m',
            'u_component_of_wind_10m', 'v_component_of_wind_10m' and
            'total_precipitation'.

    Returns:
        pandas.DataFrame built with ``DataFrame.from_dict(..., orient='index')``
        and indexed by 'Total Days', 'Missing Values (%)' and 'Data Range'.
        'Missing Values (%)' holds the share of NaN entries per variable and
        'Data Range' the min/max of temperature and humidity over the
        non-missing values.
    """
    weather = data['weather']
    temperature = weather['temperature_2m']
    humidity = weather['relative_humidity_2m']

    def _missing_pct(values):
        # Share of NaN entries, expressed as a percentage.
        return np.isnan(values).mean() * 100

    # A wind sample is unusable when either component is missing, so a value
    # counts as missing if u or v is NaN.
    wind_missing = (
        np.isnan(weather['u_component_of_wind_10m'])
        | np.isnan(weather['v_component_of_wind_10m'])
    )

    quality_metrics = {
        'Total Days': len(data['viirs']),
        'Missing Values (%)': {
            'VIIRS': _missing_pct(data['viirs']),
            'Temperature': _missing_pct(temperature),
            'Humidity': _missing_pct(humidity),
            'Wind': wind_missing.mean() * 100,
            'Precipitation': _missing_pct(weather['total_precipitation'])
        },
        # nanmin/nanmax ignore missing entries; plain min()/max() would return
        # NaN as soon as a single value is missing.
        'Data Range': {
            'Temperature': {
                'min': np.nanmin(temperature),
                'max': np.nanmax(temperature)
            },
            'Humidity': {
                'min': np.nanmin(humidity),
                'max': np.nanmax(humidity)
            }
        }
    }

    return pd.DataFrame.from_dict(quality_metrics, orient='index')

# Build the quality report for the raw collection; the bare name on the last
# line makes the notebook render the resulting DataFrame.
quality_report = assess_data_quality(raw_data)
quality_report