These visualization methods provide different insights into your data:

- **Time series visualizations** show patterns over time.
- **Correlation analysis** reveals relationships between stations and variables.
- **Boxplots** compare the distribution of values across stations.
- **Lag analysis** helps understand how rainfall affects water levels over time.
- **Interactive dashboards** allow exploration of the data.
- **Event analysis** focuses on specific high-water events and their characteristics.

### Visualizing Kelani Ganga Water Level and Rainfall Data

Load the Data First

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import numpy as np
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

# Set visualization style for every figure created after this point:
# matplotlib's 'seaborn-v0_8-whitegrid' style sheet plus seaborn's "deep" color palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("deep")

def load_kelani_data():
    """Read every Kelani Ganga water level and rainfall CSV into memory.

    Returns:
        A ``(water_data, rainfall_data)`` tuple. Each element is a dict that maps
        a station name (the file name with its ``_water_level.csv`` or
        ``_rainfall.csv`` suffix removed) to a DataFrame indexed by the file's
        timestamp column. A dict is empty when its folder holds no CSV files;
        in that case a warning is printed instead of raising.
    """

    def read_station_files(csv_paths, suffix, time_column):
        # Shared loader: one DataFrame per file, indexed by its parsed timestamp column.
        frames = {}
        for csv_path in csv_paths:
            name = os.path.basename(csv_path).replace(suffix, "")
            frame = pd.read_csv(csv_path, parse_dates=[time_column], index_col=time_column)
            frames[name] = frame
            print(f"Loaded {name} with {len(frame)} records")
        return frames

    # Water levels are timestamped by 'recorded_at'
    water_level_path = "../data/rivers/kelani_ganga/water_levels"
    water_files = glob.glob(os.path.join(water_level_path, "*.csv"))
    print(f"Found {len(water_files)} water level files")
    water_data = read_station_files(water_files, "_water_level.csv", "recorded_at")

    # Rainfall readings are timestamped by 'end_time'
    rainfall_path = "../data/rivers/kelani_ganga/rainfall"
    rainfall_files = glob.glob(os.path.join(rainfall_path, "*.csv"))
    print(f"Found {len(rainfall_files)} rainfall files")
    rainfall_data = read_station_files(rainfall_files, "_rainfall.csv", "end_time")

    # Warn (but still return) when either folder produced nothing
    if len(water_data) == 0:
        print("No water level data found! Please check the path.")
    if len(rainfall_data) == 0:
        print("No rainfall data found! Please check the path.")

    return water_data, rainfall_data

# Load the data once for the cells below: two dicts of DataFrames keyed by
# station name (water levels first, rainfall second).
water_data, rainfall_data = load_kelani_data()

Found 4 water level files
Loaded glencourse with 2467 records
Loaded hanwella with 2467 records
Loaded kithulgala with 2467 records
Loaded nagalagam_street with 2467 records
Found 3 rainfall files
Loaded glencourse with 1237 records
Loaded hanwella with 1237 records
Loaded kithulgala with 1237 records


1. Time-Series Visualization per Station in 6-Month Periods

In [6]:
def visualize_river_data(river_name, period_months=6):
    """
    Plot water level and rainfall for every station of a river, one figure per period.

    Each ``*_water_level.csv`` file under ``../data/rivers/<river_name>/water_levels``
    is treated as one station. A rainfall file with the same station name is used
    when it exists. The figures are produced by ``visualize_multi_month_periods``
    and saved under ``./visualizations/<river_name>``.

    Args:
        river_name: Name of the river (folder name in data/rivers/)
        period_months: Number of months per visualization (default: 6)

    Returns:
        None. Progress is reported with ``print``.
    """
    # Define paths for the river data
    river_path = f"../data/rivers/{river_name}"
    water_level_path = f"{river_path}/water_levels"
    rainfall_path = f"{river_path}/rainfall"

    # Create output directory
    output_dir = f"./visualizations/{river_name}"
    os.makedirs(output_dir, exist_ok=True)

    print(f"Processing {river_name.replace('_', ' ').title()} data...")

    # glob returns files in arbitrary order; sort so stations are always
    # processed (and logged) in the same order.
    water_files = sorted(glob.glob(os.path.join(water_level_path, "*.csv")))
    rainfall_files = glob.glob(os.path.join(rainfall_path, "*.csv"))

    if not water_files and not rainfall_files:
        print(f"No data files found for {river_name}")
        return

    if not water_files:
        # Stations are discovered through their water level file, so rainfall
        # alone produces no plots; say so instead of finishing silently.
        print(f"  No water level files found for {river_name}; nothing to plot")
        return

    # Process each station
    for water_file in water_files:
        station_name = os.path.basename(water_file).replace("_water_level.csv", "")
        print(f"  Processing {station_name} station...")

        # Load water level data
        water_data = pd.read_csv(water_file, parse_dates=['recorded_at'], index_col='recorded_at')
        if len(water_data) == 0:
            # A header-only file has no min/max timestamp to report or plot
            print(f"    No water level records for {station_name}; skipping")
            continue

        # Load the matching rainfall file if available; an empty one counts as missing
        rainfall_data = None
        rainfall_file = os.path.join(rainfall_path, f"{station_name}_rainfall.csv")
        if os.path.exists(rainfall_file):
            rainfall_data = pd.read_csv(rainfall_file, parse_dates=['end_time'], index_col='end_time')
            if len(rainfall_data) == 0:
                rainfall_data = None

        # Print summary statistics
        print(f"    Water level data: {len(water_data)} records from {water_data.index.min().date()} to {water_data.index.max().date()}")
        if rainfall_data is not None:
            print(f"    Rainfall data: {len(rainfall_data)} records from {rainfall_data.index.min().date()} to {rainfall_data.index.max().date()}")

        # Create multi-month periods
        visualize_multi_month_periods(water_data, rainfall_data, station_name, river_name, output_dir, period_months)

def _shift_months(month_start, months):
    """Return the first day of the month that lies `months` months after `month_start`."""
    # Counting months from year 0 turns the year carry into a single divmod,
    # which stays correct when the sum lands exactly on a December.
    year, month_index = divmod(month_start.year * 12 + (month_start.month - 1) + months, 12)
    return pd.Timestamp(year, month_index + 1, 1)


def _build_periods(first_month, last_day, period_months):
    """Split [first_month, last_day] into consecutive (start, last day) month-aligned pairs."""
    one_day = pd.Timedelta(days=1)
    periods = []
    current = first_month
    while current <= last_day:
        next_start = _shift_months(current, period_months)
        # The final period is cut short at the end of the data's last month
        periods.append((current, min(next_start - one_day, last_day)))
        current = next_start
    return periods


def visualize_multi_month_periods(water_data, rainfall_data, station_name, river_name, output_dir, period_months=6):
    """Create one water level / rainfall figure per multi-month period and save it as PNG.

    The combined date range of both datasets is widened to whole months and cut
    into consecutive windows of `period_months` months. Windows that contain no
    data at all are skipped. Each remaining window becomes a two-panel figure
    (water level on top, daily rainfall totals below) written to
    ``<output_dir>/<station_name>_<YYYYMM>_to_<YYYYMM>.png``.

    Args:
        water_data: DataFrame indexed by timestamp with a 'water_level' column.
        rainfall_data: DataFrame indexed by timestamp with a 'rainfall_in_mm'
            column, or None when the station has no rainfall file.
        station_name: Station identifier, used in titles and the file name.
        river_name: River identifier, used to look up flood thresholds.
        output_dir: Existing directory the PNG files are written to.
        period_months: Number of months per figure (default: 6).

    Raises:
        ValueError: If `period_months` is smaller than 1 (the period loop could
            never advance).

    Note:
        Window bounds are timezone-naive timestamps, so the indexes are assumed
        to be timezone-naive as well -- TODO confirm against the CSV exports.
    """
    if period_months < 1:
        raise ValueError("period_months must be a positive number of months")

    # Get date range covering both datasets; frames without rows have no bounds
    datasets = [water_data] if rainfall_data is None else [water_data, rainfall_data]
    populated = [frame for frame in datasets if len(frame) > 0]
    if not populated:
        return

    start_date = min(frame.index.min() for frame in populated)
    end_date = max(frame.index.max() for frame in populated)

    # Round to month boundaries: first day of the first month, last day of the last month
    one_day = pd.Timedelta(days=1)
    first_month = pd.Timestamp(start_date.year, start_date.month, 1)
    last_day = _shift_months(pd.Timestamp(end_date.year, end_date.month, 1), 1) - one_day

    # Identical for every period, so compute once per station
    station_label = station_name.replace("_", " ").title()
    station_metadata = get_station_metadata(station_name, river_name)
    threshold_styles = (
        ('alert_level', 'gold', 'Alert Level'),
        ('minor_flood_level', 'orange', 'Minor Flood'),
        ('major_flood_level', 'red', 'Major Flood'),
    )

    # Create visualizations for each period
    for period_start, period_end in _build_periods(first_month, last_day, period_months):
        # period_end is midnight of the period's last day. Filter against the
        # following midnight (exclusive) so readings taken later that day are kept.
        period_stop = period_end + one_day
        water_period = water_data[(water_data.index >= period_start) & (water_data.index < period_stop)]

        if rainfall_data is not None:
            rain_period = rainfall_data[(rainfall_data.index >= period_start) & (rainfall_data.index < period_stop)]
        else:
            rain_period = None

        if len(water_period) == 0 and (rain_period is None or len(rain_period) == 0):
            # Skip periods with no data
            continue

        # Create figure with two subplots
        fig, axes = plt.subplots(2, 1, figsize=(15, 10), sharex=True,
                                 gridspec_kw={'height_ratios': [2, 1]})

        period_name = f"{period_start.strftime('%b %Y')} - {period_end.strftime('%b %Y')}"

        # Plot water level data
        if len(water_period) > 0:
            axes[0].plot(water_period.index, water_period['water_level'], '-',
                         color='blue', linewidth=1.2, label='Water Level')

            # Add points for better visibility at sparse data points
            if len(water_period) < 100:
                axes[0].plot(water_period.index, water_period['water_level'], 'o',
                             color='blue', markersize=4, alpha=0.7)

            # Format y-axis
            axes[0].set_ylabel('Water Level (m)', fontsize=12)
            axes[0].set_title(f'Water Level at {station_label} - {period_name}', fontsize=14)

            # Add reference lines if we have metadata
            if station_metadata:
                for key, color, caption in threshold_styles:
                    level = station_metadata.get(key)
                    if level is not None:
                        axes[0].axhline(y=level, color=color, linestyle='--',
                                        label=f'{caption} ({level}m)')

            axes[0].grid(True)
            axes[0].legend(loc='best')
        else:
            axes[0].text(0.5, 0.5, 'No water level data for this period',
                         ha='center', va='center', transform=axes[0].transAxes, fontsize=12)

        # Plot rainfall data if available
        if rain_period is not None and len(rain_period) > 0:
            # Aggregate by day for clearer visualization over long periods
            daily_rain = rain_period['rainfall_in_mm'].resample('D').sum()

            # Create bar chart for rainfall
            axes[1].bar(daily_rain.index, daily_rain.values, width=1.0,
                        color='skyblue', alpha=0.7, label='Daily Rainfall')

            # Format y-axis
            axes[1].set_ylabel('Rainfall (mm)', fontsize=12)
            axes[1].set_title(f'Daily Rainfall at {station_label} - {period_name}', fontsize=14)

            # Calculate some statistics to display
            total_rain = daily_rain.sum()
            max_daily = daily_rain.max()
            rainy_days = (daily_rain > 0).sum()

            # Add statistics as text
            axes[1].text(0.02, 0.92,
                         f'Total Rainfall: {total_rain:.1f}mm\n'
                         f'Max Daily: {max_daily:.1f}mm\n'
                         f'Rainy Days: {rainy_days}',
                         transform=axes[1].transAxes,
                         bbox=dict(facecolor='white', alpha=0.8))

            axes[1].grid(True, axis='y')
            axes[1].legend(loc='upper right')
        else:
            axes[1].text(0.5, 0.5, 'No rainfall data for this period',
                         ha='center', va='center', transform=axes[1].transAxes, fontsize=12)

        # Format x-axis based on period length
        days_in_period = (period_end - period_start).days + 1

        if days_in_period <= 31:
            # For shorter periods, show each day
            date_format = '%d %b'
            axes[1].xaxis.set_major_locator(mdates.DayLocator(interval=1))
        elif days_in_period <= 90:
            # For medium periods, show every week
            date_format = '%d %b'
            axes[1].xaxis.set_major_locator(mdates.WeekdayLocator(interval=1, byweekday=0))  # Monday
        else:
            # For longer periods, show every month
            date_format = '%b %Y'
            axes[1].xaxis.set_major_locator(mdates.MonthLocator())

        axes[1].xaxis.set_major_formatter(mdates.DateFormatter(date_format))
        # Acts on the current axes, which is the bottom (rainfall) panel
        plt.xticks(rotation=45)

        # Adjust layout
        fig.tight_layout()

        # Save figure
        filename = f"{output_dir}/{station_name}_{period_start.strftime('%Y%m')}_to_{period_end.strftime('%Y%m')}.png"
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"    Saved {period_name} plot to {filename}")

        # Release the figure so long runs do not accumulate open figures
        plt.close(fig)

def get_station_metadata(station_name, river_name):
    """Return flood threshold metadata for a station, or None when none is known.

    Args:
        station_name: Station identifier (e.g. 'hanwella').
        river_name: River identifier (e.g. 'kelani_ganga').

    Returns:
        A new dict with 'alert_level', 'minor_flood_level' and
        'major_flood_level' for known stations, otherwise None.
    """
    # This would ideally come from your database; until then the values for
    # known stations are hardcoded. The lookup is plain comparisons that cannot
    # fail, so there is no exception handling to hide real errors.
    if river_name == 'kelani_ganga' and station_name == 'hanwella':
        return {
            'alert_level': 5.0,       # Example value
            'minor_flood_level': 7.0,  # Example value
            'major_flood_level': 8.0   # Example value
        }
    # Add more stations as needed

    return None

# Example usage for Kelani Ganga: runs as soon as the cell executes, plotting
# each station in 6-month periods and saving PNGs under ./visualizations/kelani_ganga
visualize_river_data('kelani_ganga', period_months=6)

# To process all rivers in your data directory:
def process_all_rivers():
    """Run the 6-month visualization for every river folder under ../data/rivers/."""
    rivers_dir = "../data/rivers/"

    # Only sub-directories count as rivers; loose files are ignored
    rivers = []
    for entry in os.listdir(rivers_dir):
        if os.path.isdir(os.path.join(rivers_dir, entry)):
            rivers.append(entry)

    print(f"Found {len(rivers)} rivers to process")
    for river in rivers:
        visualize_river_data(river, period_months=6)

# Uncomment to process all rivers
# process_all_rivers()

Processing Kelani Ganga data...
  Processing glencourse station...
    Water level data: 2467 records from 2021-03-27 to 2025-03-25
    Rainfall data: 1237 records from 2021-03-27 to 2025-03-25
    Saved Mar 2021 - Aug 2021 plot to ./visualizations/kelani_ganga/glencourse_202103_to_202108.png
    Saved Sep 2021 - Feb 2022 plot to ./visualizations/kelani_ganga/glencourse_202109_to_202202.png
    Saved Mar 2022 - Aug 2022 plot to ./visualizations/kelani_ganga/glencourse_202203_to_202208.png
    Saved Sep 2022 - Feb 2023 plot to ./visualizations/kelani_ganga/glencourse_202209_to_202302.png
    Saved Mar 2023 - Aug 2023 plot to ./visualizations/kelani_ganga/glencourse_202303_to_202308.png
    Saved Sep 2023 - Feb 2024 plot to ./visualizations/kelani_ganga/glencourse_202309_to_202402.png
    Saved Mar 2024 - Aug 2024 plot to ./visualizations/kelani_ganga/glencourse_202403_to_202408.png
    Saved Sep 2024 - Feb 2025 plot to ./visualizations/kelani_ganga/glencourse_202409_to_202502.png
    Sa

In [10]:
def visualize_river_data(river_name, period_months=12):
    """
    Plot water level and rainfall for every station of a river, one figure per period.

    Each ``*_water_level.csv`` file under ``../data/rivers/<river_name>/water_levels``
    is treated as one station. A rainfall file with the same station name is used
    when it exists. The figures are produced by ``visualize_multi_month_periods``
    and saved under ``./visualizations/<river_name>``.

    Args:
        river_name: Name of the river (folder name in data/rivers/)
        period_months: Number of months per visualization (default: 12)

    Returns:
        None. Progress is reported with ``print``.
    """
    # Define paths for the river data
    river_path = f"../data/rivers/{river_name}"
    water_level_path = f"{river_path}/water_levels"
    rainfall_path = f"{river_path}/rainfall"

    # Create output directory
    output_dir = f"./visualizations/{river_name}"
    os.makedirs(output_dir, exist_ok=True)

    print(f"Processing {river_name.replace('_', ' ').title()} data...")

    # glob returns files in arbitrary order; sort so stations are always
    # processed (and logged) in the same order.
    water_files = sorted(glob.glob(os.path.join(water_level_path, "*.csv")))
    rainfall_files = glob.glob(os.path.join(rainfall_path, "*.csv"))

    if not water_files and not rainfall_files:
        print(f"No data files found for {river_name}")
        return

    if not water_files:
        # Stations are discovered through their water level file, so rainfall
        # alone produces no plots; say so instead of finishing silently.
        print(f"  No water level files found for {river_name}; nothing to plot")
        return

    # Process each station
    for water_file in water_files:
        station_name = os.path.basename(water_file).replace("_water_level.csv", "")
        print(f"  Processing {station_name} station...")

        # Load water level data
        water_data = pd.read_csv(water_file, parse_dates=['recorded_at'], index_col='recorded_at')
        if len(water_data) == 0:
            # A header-only file has no min/max timestamp to report or plot
            print(f"    No water level records for {station_name}; skipping")
            continue

        # Load the matching rainfall file if available; an empty one counts as missing
        rainfall_data = None
        rainfall_file = os.path.join(rainfall_path, f"{station_name}_rainfall.csv")
        if os.path.exists(rainfall_file):
            rainfall_data = pd.read_csv(rainfall_file, parse_dates=['end_time'], index_col='end_time')
            if len(rainfall_data) == 0:
                rainfall_data = None

        # Print summary statistics
        print(f"    Water level data: {len(water_data)} records from {water_data.index.min().date()} to {water_data.index.max().date()}")
        if rainfall_data is not None:
            print(f"    Rainfall data: {len(rainfall_data)} records from {rainfall_data.index.min().date()} to {rainfall_data.index.max().date()}")

        # Create multi-month periods
        visualize_multi_month_periods(water_data, rainfall_data, station_name, river_name, output_dir, period_months)

def _shift_months(month_start, months):
    """Return the first day of the month that lies `months` months after `month_start`."""
    # Counting months from year 0 turns the year carry into a single divmod,
    # which stays correct when the sum lands exactly on a December.
    year, month_index = divmod(month_start.year * 12 + (month_start.month - 1) + months, 12)
    return pd.Timestamp(year, month_index + 1, 1)


def _build_periods(first_month, last_day, period_months):
    """Split [first_month, last_day] into consecutive (start, last day) month-aligned pairs."""
    one_day = pd.Timedelta(days=1)
    periods = []
    current = first_month
    while current <= last_day:
        next_start = _shift_months(current, period_months)
        # The final period is cut short at the end of the data's last month
        periods.append((current, min(next_start - one_day, last_day)))
        current = next_start
    return periods


def visualize_multi_month_periods(water_data, rainfall_data, station_name, river_name, output_dir, period_months=6):
    """Create one water level / rainfall figure per multi-month period and save it as PNG.

    The combined date range of both datasets is widened to whole months and cut
    into consecutive windows of `period_months` months. Windows that contain no
    data at all are skipped. Each remaining window becomes a two-panel figure
    (water level on top, daily rainfall totals below) written to
    ``<output_dir>/<station_name>_<YYYYMM>_to_<YYYYMM>.png``.

    Args:
        water_data: DataFrame indexed by timestamp with a 'water_level' column.
        rainfall_data: DataFrame indexed by timestamp with a 'rainfall_in_mm'
            column, or None when the station has no rainfall file.
        station_name: Station identifier, used in titles and the file name.
        river_name: River identifier, used to look up flood thresholds.
        output_dir: Existing directory the PNG files are written to.
        period_months: Number of months per figure (default: 6).

    Raises:
        ValueError: If `period_months` is smaller than 1 (the period loop could
            never advance).

    Note:
        Window bounds are timezone-naive timestamps, so the indexes are assumed
        to be timezone-naive as well -- TODO confirm against the CSV exports.
    """
    if period_months < 1:
        raise ValueError("period_months must be a positive number of months")

    # Get date range covering both datasets; frames without rows have no bounds
    datasets = [water_data] if rainfall_data is None else [water_data, rainfall_data]
    populated = [frame for frame in datasets if len(frame) > 0]
    if not populated:
        return

    start_date = min(frame.index.min() for frame in populated)
    end_date = max(frame.index.max() for frame in populated)

    # Round to month boundaries: first day of the first month, last day of the last month
    one_day = pd.Timedelta(days=1)
    first_month = pd.Timestamp(start_date.year, start_date.month, 1)
    last_day = _shift_months(pd.Timestamp(end_date.year, end_date.month, 1), 1) - one_day

    # Identical for every period, so compute once per station
    station_label = station_name.replace("_", " ").title()
    station_metadata = get_station_metadata(station_name, river_name)
    threshold_styles = (
        ('alert_level', 'gold', 'Alert Level'),
        ('minor_flood_level', 'orange', 'Minor Flood'),
        ('major_flood_level', 'red', 'Major Flood'),
    )

    # Create visualizations for each period
    for period_start, period_end in _build_periods(first_month, last_day, period_months):
        # period_end is midnight of the period's last day. Filter against the
        # following midnight (exclusive) so readings taken later that day are kept.
        period_stop = period_end + one_day
        water_period = water_data[(water_data.index >= period_start) & (water_data.index < period_stop)]

        if rainfall_data is not None:
            rain_period = rainfall_data[(rainfall_data.index >= period_start) & (rainfall_data.index < period_stop)]
        else:
            rain_period = None

        if len(water_period) == 0 and (rain_period is None or len(rain_period) == 0):
            # Skip periods with no data
            continue

        # Create figure with two subplots
        fig, axes = plt.subplots(2, 1, figsize=(15, 10), sharex=True,
                                 gridspec_kw={'height_ratios': [2, 1]})

        period_name = f"{period_start.strftime('%b %Y')} - {period_end.strftime('%b %Y')}"

        # Plot water level data
        if len(water_period) > 0:
            axes[0].plot(water_period.index, water_period['water_level'], '-',
                         color='blue', linewidth=1.2, label='Water Level')

            # Add points for better visibility at sparse data points
            if len(water_period) < 100:
                axes[0].plot(water_period.index, water_period['water_level'], 'o',
                             color='blue', markersize=4, alpha=0.7)

            # Format y-axis
            axes[0].set_ylabel('Water Level (m)', fontsize=12)
            axes[0].set_title(f'Water Level at {station_label} - {period_name}', fontsize=14)

            # Add reference lines if we have metadata
            if station_metadata:
                for key, color, caption in threshold_styles:
                    level = station_metadata.get(key)
                    if level is not None:
                        axes[0].axhline(y=level, color=color, linestyle='--',
                                        label=f'{caption} ({level}m)')

            axes[0].grid(True)
            axes[0].legend(loc='best')
        else:
            axes[0].text(0.5, 0.5, 'No water level data for this period',
                         ha='center', va='center', transform=axes[0].transAxes, fontsize=12)

        # Plot rainfall data if available
        if rain_period is not None and len(rain_period) > 0:
            # Aggregate by day for clearer visualization over long periods
            daily_rain = rain_period['rainfall_in_mm'].resample('D').sum()

            # Create bar chart for rainfall
            axes[1].bar(daily_rain.index, daily_rain.values, width=1.0,
                        color='skyblue', alpha=0.7, label='Daily Rainfall')

            # Format y-axis
            axes[1].set_ylabel('Rainfall (mm)', fontsize=12)
            axes[1].set_title(f'Daily Rainfall at {station_label} - {period_name}', fontsize=14)

            # Calculate some statistics to display
            total_rain = daily_rain.sum()
            max_daily = daily_rain.max()
            rainy_days = (daily_rain > 0).sum()

            # Add statistics as text
            axes[1].text(0.02, 0.92,
                         f'Total Rainfall: {total_rain:.1f}mm\n'
                         f'Max Daily: {max_daily:.1f}mm\n'
                         f'Rainy Days: {rainy_days}',
                         transform=axes[1].transAxes,
                         bbox=dict(facecolor='white', alpha=0.8))

            axes[1].grid(True, axis='y')
            axes[1].legend(loc='upper right')
        else:
            axes[1].text(0.5, 0.5, 'No rainfall data for this period',
                         ha='center', va='center', transform=axes[1].transAxes, fontsize=12)

        # Format x-axis based on period length
        days_in_period = (period_end - period_start).days + 1

        if days_in_period <= 31:
            # For shorter periods, show each day
            date_format = '%d %b'
            axes[1].xaxis.set_major_locator(mdates.DayLocator(interval=1))
        elif days_in_period <= 90:
            # For medium periods, show every week
            date_format = '%d %b'
            axes[1].xaxis.set_major_locator(mdates.WeekdayLocator(interval=1, byweekday=0))  # Monday
        else:
            # For longer periods, show every month
            date_format = '%b %Y'
            axes[1].xaxis.set_major_locator(mdates.MonthLocator())

        axes[1].xaxis.set_major_formatter(mdates.DateFormatter(date_format))
        # Acts on the current axes, which is the bottom (rainfall) panel
        plt.xticks(rotation=45)

        # Adjust layout
        fig.tight_layout()

        # Save figure
        filename = f"{output_dir}/{station_name}_{period_start.strftime('%Y%m')}_to_{period_end.strftime('%Y%m')}.png"
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"    Saved {period_name} plot to {filename}")

        # Release the figure so long runs do not accumulate open figures
        plt.close(fig)

def get_station_metadata(station_name, river_name):
    """Return flood threshold metadata for a station, or None when none is known.

    Args:
        station_name: Station identifier (e.g. 'hanwella').
        river_name: River identifier (e.g. 'kelani_ganga').

    Returns:
        A new dict with 'alert_level', 'minor_flood_level' and
        'major_flood_level' for known stations, otherwise None.
    """
    # This would ideally come from your database; until then the values for
    # known stations are hardcoded. The lookup is plain comparisons that cannot
    # fail, so there is no exception handling to hide real errors.
    if river_name == 'kelani_ganga' and station_name == 'hanwella':
        return {
            'alert_level': 5.0,       # Example value
            'minor_flood_level': 7.0,  # Example value
            'major_flood_level': 8.0   # Example value
        }
    # Add more stations as needed

    return None

# Example usage for Kelani Ganga: runs as soon as the cell executes, plotting
# each station in 12-month periods and saving PNGs under ./visualizations/kelani_ganga
visualize_river_data('kelani_ganga', period_months=12)

# To process all rivers in your data directory:
def process_all_rivers():
    """Run the 12-month visualization for every river folder under ../data/rivers/."""
    rivers_dir = "../data/rivers/"

    # Only sub-directories count as rivers; loose files are ignored
    rivers = []
    for entry in os.listdir(rivers_dir):
        if os.path.isdir(os.path.join(rivers_dir, entry)):
            rivers.append(entry)

    print(f"Found {len(rivers)} rivers to process")
    for river in rivers:
        visualize_river_data(river, period_months=12)

# Uncomment to process all rivers
# process_all_rivers()

Processing Kelani Ganga data...
  Processing glencourse station...
    Water level data: 2467 records from 2021-03-27 to 2025-03-25
    Rainfall data: 1237 records from 2021-03-27 to 2025-03-25
    Saved Mar 2021 - Feb 2022 plot to ./visualizations/kelani_ganga/glencourse_202103_to_202202.png
    Saved Mar 2022 - Feb 2023 plot to ./visualizations/kelani_ganga/glencourse_202203_to_202302.png
    Saved Mar 2023 - Feb 2024 plot to ./visualizations/kelani_ganga/glencourse_202303_to_202402.png
    Saved Mar 2024 - Feb 2025 plot to ./visualizations/kelani_ganga/glencourse_202403_to_202502.png
    Saved Mar 2025 - Mar 2025 plot to ./visualizations/kelani_ganga/glencourse_202503_to_202503.png
  Processing hanwella station...
    Water level data: 2467 records from 2021-03-27 to 2025-03-25
    Rainfall data: 1237 records from 2021-03-27 to 2025-03-25
    Saved Mar 2021 - Feb 2022 plot to ./visualizations/kelani_ganga/hanwella_202103_to_202202.png
    Saved Mar 2022 - Feb 2023 plot to ./visualiz