# Brisbane Flood Data Exploratory Analysis

This notebook explores flood-related data for Brisbane, including rainfall, water levels, and topographic information.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Set plotting style
plt.style.use('ggplot')
sns.set_context('notebook')

# Display settings
%matplotlib inline
pd.set_option('display.max_columns', None)

## Data Loading

First, let's set up paths to our data directories. No real data is loaded yet; sample data is generated in the next section.

In [None]:
# Resolve the project layout relative to the parent of the current working directory
project_root = Path().absolute().parent
raw_data_dir = project_root.joinpath('data', 'raw')
processed_data_dir = project_root.joinpath('data', 'processed')
geo_data_dir = project_root.joinpath('data', 'geo')

# Echo the resolved locations, one per line
print(
    f"Project root: {project_root}",
    f"Raw data directory: {raw_data_dir}",
    f"Processed data directory: {processed_data_dir}",
    f"Geo data directory: {geo_data_dir}",
    sep="\n",
)

## Sample Data Creation

Since we don't have actual data yet, let's create some sample data for exploration.

In [None]:
# Create sample rainfall data
def create_sample_rainfall_data(start_date='2022-01-01', end_date='2022-03-31', seed=None):
    """Create sample daily rainfall data for Brisbane.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Inclusive bounds of the daily date range.
    seed : int, optional
        Seed for the random number generator. ``None`` (the default) gives a
        different series on every call.

    Returns
    -------
    pandas.DataFrame
        One row per day with columns ``date`` and ``rainfall_mm``.
    """
    rng = np.random.default_rng(seed)
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    n_days = len(date_range)

    # Background rainfall drawn from a gamma distribution (no seasonal component)
    rainfall = rng.gamma(shape=0.5, scale=2.0, size=n_days)

    # Add heavy rainfall events on distinct days; capped at the number of
    # days so that ranges shorter than five days do not raise
    n_events = min(5, n_days)
    heavy_rain_indices = rng.choice(n_days, size=n_events, replace=False)
    rainfall[heavy_rain_indices] += rng.gamma(shape=5.0, scale=10.0, size=n_events)

    return pd.DataFrame({
        'date': date_range,
        'rainfall_mm': rainfall
    })


# Create sample river level data
def create_sample_river_data(start_date='2022-01-01', end_date='2022-03-31',
                             heavy_rain_indices=None, seed=None):
    """Create sample daily river level data for Brisbane River.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Inclusive bounds of the daily date range.
    heavy_rain_indices : array-like of int, optional
        Positions (0-based day offsets into the date range) of heavy-rain
        days. A flood peak is placed on the day after each of them. When
        omitted, up to five random days are chosen so the function can be
        used on its own.
    seed : int, optional
        Seed for the random number generator. ``None`` (the default) gives a
        different series on every call.

    Returns
    -------
    pandas.DataFrame
        One row per day with columns ``date`` and ``river_level_m``.
    """
    rng = np.random.default_rng(seed)
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    n_days = len(date_range)

    # Base river level: slow sinusoidal variation plus small Gaussian noise
    base_level = 1.0 + 0.2 * np.sin(np.linspace(0, 4 * np.pi, n_days))
    river_level = base_level + rng.normal(0, 0.05, n_days)

    if heavy_rain_indices is None:
        heavy_rain_indices = rng.choice(n_days, size=min(5, n_days), replace=False)
    heavy_rain_indices = np.asarray(heavy_rain_indices, dtype=int).ravel()

    # Flood peaks arrive the day after heavy rain. Peaks that would fall
    # outside the date range are dropped rather than wrapped around to the
    # start of the series, which would put the flood before its cause.
    flood_indices = heavy_rain_indices + 1
    flood_indices = flood_indices[(flood_indices >= 0) & (flood_indices < n_days)]
    surges = rng.uniform(1.0, 3.0, size=len(flood_indices))

    # Each event adds its surge on the peak day and then 80% of the previous
    # day's surge for the next five days. Only the surge decays, not the
    # whole river level, so the recession never exceeds the peak contribution.
    decay = 0.8 ** np.arange(6)
    for peak_idx, surge in zip(flood_indices, surges):
        stop = min(peak_idx + decay.size, n_days)
        river_level[peak_idx:stop] += surge * decay[:stop - peak_idx]

    return pd.DataFrame({
        'date': date_range,
        'river_level_m': river_level
    })


# Generate sample data (fixed seeds so Restart & Run All reproduces the same figures)
rainfall_data = create_sample_rainfall_data(seed=42)

# The five wettest days are treated as the heavy-rain events that trigger floods
heavy_rain_days = rainfall_data['rainfall_mm'].nlargest(5).index.to_numpy()
river_data = create_sample_river_data(heavy_rain_indices=heavy_rain_days, seed=43)

# Merge datasets
flood_data = pd.merge(rainfall_data, river_data, on='date')

# Display the first few rows
flood_data.head()

## Data Visualization

Let's visualize the relationship between rainfall and river levels.

In [None]:
# Two stacked panels sharing the date axis: rainfall on top, river level below
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(14, 8))

# Top panel: daily rainfall totals as bars
ax1.bar(flood_data['date'], flood_data['rainfall_mm'], color='skyblue', alpha=0.7)
ax1.set(ylabel='Rainfall (mm)', title='Daily Rainfall in Brisbane')

# Bottom panel: river level with the three flood thresholds as dashed guides
ax2.plot(flood_data['date'], flood_data['river_level_m'], color='navy', linewidth=2)
for level, colour, label in [
    (3.5, 'red', 'Minor Flood Level'),
    (4.5, 'orange', 'Moderate Flood Level'),
    (5.5, 'darkred', 'Major Flood Level'),
]:
    ax2.axhline(y=level, color=colour, linestyle='--', alpha=0.7, label=label)
ax2.set(ylabel='River Level (m)', title='Brisbane River Level')
ax2.legend()

fig.tight_layout()
plt.show()

In [None]:
# Same-day rainfall against river level, one point per day
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(data=flood_data, x='rainfall_mm', y='river_level_m', alpha=0.7)
scatter_ax.set(
    title='Relationship Between Rainfall and River Level',
    xlabel='Rainfall (mm)',
    ylabel='River Level (m)',
)
scatter_ax.grid(True, alpha=0.3)
plt.show()

## Lag Analysis

Let's examine the lag effect between rainfall and river level changes.

In [None]:
# Add one column per lag holding the rainfall recorded 1 to 5 days earlier
for lag in (1, 2, 3, 4, 5):
    column_name = f'rainfall_lag_{lag}d'
    flood_data[column_name] = flood_data['rainfall_mm'].shift(periods=lag)

# Shifting leaves NaN in the leading rows; keep only fully populated rows
flood_data_lag = flood_data.dropna(axis=0, how='any')

# Preview the lagged table
flood_data_lag.head()

In [None]:
# Calculate the correlation matrix over the numeric columns only.
# flood_data_lag still carries the datetime 'date' column, and since
# pandas 2.0 DataFrame.corr() no longer drops non-numeric columns by
# default, so the restriction is made explicit here.
correlation = flood_data_lag.corr(numeric_only=True)

# Plot correlation heatmap (colour scale fixed to the full [-1, 1] range)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.2f')
plt.title('Correlation Matrix: Rainfall and River Level with Lags')
plt.tight_layout()
plt.show()

## Flood Event Analysis

Let's identify and analyze flood events in our sample data.

In [None]:
# Define flood thresholds: river level (m) at which each category begins
minor_flood = 3.5
moderate_flood = 4.5
major_flood = 5.5

# Classify flood levels.
# right=False makes the bins left-closed, so a level exactly at a threshold
# (e.g. 3.5 m) is placed in that flood category instead of the one below it.
flood_data['flood_category'] = pd.cut(
    flood_data['river_level_m'],
    bins=[-float('inf'), minor_flood, moderate_flood, major_flood, float('inf')],
    labels=['No Flood', 'Minor', 'Moderate', 'Major'],
    right=False,
)

# Count days in each flood category, ordered from 'No Flood' to 'Major'
flood_counts = flood_data['flood_category'].value_counts().sort_index()

# Plot flood category distribution
plt.figure(figsize=(10, 6))
flood_counts.plot(kind='bar', color=['green', 'yellow', 'orange', 'red'])
plt.title('Distribution of Flood Categories')
plt.xlabel('Flood Category')
plt.ylabel('Number of Days')
plt.grid(axis='y', alpha=0.3)
plt.show()

## Cumulative Rainfall Analysis

Let's analyze the effect of cumulative rainfall on flood events.

In [None]:
# Calculate rolling sum of rainfall (3-day and 7-day windows).
# The first window-1 rows of each sum are NaN because the window is incomplete.
flood_data['rainfall_3d_sum'] = flood_data['rainfall_mm'].rolling(window=3).sum()
flood_data['rainfall_7d_sum'] = flood_data['rainfall_mm'].rolling(window=7).sum()

# Plot cumulative rainfall vs river level, one panel per window
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

for ax, window in ((ax1, 3), (ax2, 7)):
    sum_column = f'rainfall_{window}d_sum'
    # Drop only the rows where this panel's own sum is undefined. A bare
    # dropna() would also discard rows that are NaN in unrelated columns
    # (the other window's sum, any lag features), hiding valid points.
    panel_data = flood_data.dropna(subset=[sum_column])
    sns.scatterplot(x=sum_column, y='river_level_m',
                    hue='flood_category', data=panel_data,
                    palette=['green', 'yellow', 'orange', 'red'], ax=ax)
    ax.set_title(f'{window}-Day Cumulative Rainfall vs River Level')
    ax.set_xlabel(f'{window}-Day Cumulative Rainfall (mm)')
    ax.set_ylabel('River Level (m)')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Next Steps

Based on this exploratory analysis, here are the next steps for our project:

1. **Data Collection**:
   - Obtain real rainfall data from BOM for Brisbane area stations
   - Collect river level data from multiple gauging stations
   - Acquire dam release information from SEQ Water
   - Download high-resolution DEM data for Brisbane

2. **Data Processing**:
   - Clean and standardize all data sources
   - Implement proper time-series alignment
   - Calculate derived features (cumulative rainfall, rate of change, etc.)

3. **Modeling**:
   - Develop regression models to predict river levels from rainfall data
   - Implement time-series forecasting for flood prediction
   - Create flood extent models using DEM data

4. **Visualization**:
   - Implement interactive maps showing flood extents
   - Create time-series visualizations of historical events
   - Develop dashboards for monitoring current conditions