In [None]:
@echo off
setlocal enabledelayedexpansion

REM ====================================================================
REM Automated Chlorophyll Pipeline Runner with Enhanced Logging
REM
REM Changes to the script directory, verifies the Python interpreter and
REM the pipeline script, runs the script with stdout appended to the log
REM file and stderr appended to the error log, trims the log file to its
REM last 1000 lines, and exits with the pipeline script's exit code.
REM ====================================================================

REM Set paths - CHANGE THESE IF NEEDED
set SCRIPT_DIR=C:\Users\23755118\OneDrive - UWA\Documents\PhD_Varshani\CODING\chl_time
set SCRIPT_NAME=daily_chl_pipeline.py
set LOG_FILE=%SCRIPT_DIR%\automation_log.txt
set ERROR_LOG=%SCRIPT_DIR%\automation_errors.txt

REM Python path - UPDATE THIS WITH YOUR ACTUAL PYTHON PATH
REM Based on the error log, you're using Miniconda with py3_13 environment
REM Uncomment and use ONE of these options:
REM 

REM Option 1: If using Miniconda with py3_13 environment (RECOMMENDED FOR YOU)
set CONDA_PATH=C:\Users\23755118\AppData\Local\miniconda3
set CONDA_ENV=py3_13
set PYTHON_EXE=%CONDA_PATH%\envs\%CONDA_ENV%\python.exe

REM Option 2: If using base conda environment
REM set CONDA_PATH=C:\Users\23755118\AppData\Local\miniconda3
REM set PYTHON_EXE=%CONDA_PATH%\python.exe

REM Option 3: If Python is in PATH (currently not working for you)
REM set PYTHON_EXE=python

REM Change to script directory
cd /d "%SCRIPT_DIR%"
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Could not change to directory %SCRIPT_DIR% >> "%ERROR_LOG%"
    exit /b 1
)

REM Log start time
echo ================================================ >> "%LOG_FILE%"
echo Task started at: %date% %time% >> "%LOG_FILE%"
echo Directory: %CD% >> "%LOG_FILE%"
echo ================================================ >> "%LOG_FILE%"
echo. >> "%LOG_FILE%"

REM Check if Python is available
echo Checking Python... >> "%LOG_FILE%"
"%PYTHON_EXE%" --version >> "%LOG_FILE%" 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found at: %PYTHON_EXE% >> "%ERROR_LOG%"
    echo ERROR: Python not found at: %date% %time% >> "%LOG_FILE%"
    echo Tried to use: %PYTHON_EXE% >> "%LOG_FILE%"
    exit /b 1
)
echo Python found successfully >> "%LOG_FILE%"

REM Check if script exists
if not exist "%SCRIPT_NAME%" (
    echo ERROR: Script %SCRIPT_NAME% not found in %CD% >> "%ERROR_LOG%"
    echo ERROR: Script not found at: %date% %time% >> "%LOG_FILE%"
    exit /b 1
)

REM Activate conda environment if needed
REM Uncomment and modify these lines if you're using conda:
REM echo Activating conda environment... >> "%LOG_FILE%"
REM call "%CONDA_PATH%\Scripts\activate.bat" "%CONDA_ENV%" >> "%LOG_FILE%" 2>&1
REM if %ERRORLEVEL% NEQ 0 (
REM     echo ERROR: Failed to activate conda environment >> "%ERROR_LOG%"
REM     exit /b 1
REM )

REM Run Python script and capture output
REM stdout is appended to the log file, stderr to the error log
echo Running Python script... >> "%LOG_FILE%"
"%PYTHON_EXE%" "%SCRIPT_NAME%" >> "%LOG_FILE%" 2>> "%ERROR_LOG%"

REM Capture the exit code immediately, before any other command overwrites it
set SCRIPT_EXIT_CODE=%ERRORLEVEL%

REM Log completion status
echo. >> "%LOG_FILE%"
echo ================================================ >> "%LOG_FILE%"
if %SCRIPT_EXIT_CODE% EQU 0 (
    echo SUCCESS: Task completed at: %date% %time% >> "%LOG_FILE%"
    echo Exit Code: %SCRIPT_EXIT_CODE% >> "%LOG_FILE%"
) else (
    echo FAILURE: Task FAILED at: %date% %time% >> "%LOG_FILE%"
    echo Exit Code: %SCRIPT_EXIT_CODE% >> "%LOG_FILE%"
    echo Check %ERROR_LOG% for error details >> "%LOG_FILE%"
    echo ================================================ >> "%ERROR_LOG%"
    echo Task failed with exit code %SCRIPT_EXIT_CODE% at %date% %time% >> "%ERROR_LOG%"
    echo ================================================ >> "%ERROR_LOG%"
)
echo ================================================ >> "%LOG_FILE%"
echo. >> "%LOG_FILE%"
echo. >> "%LOG_FILE%"

REM Keep log files manageable (keep last 1000 lines)
REM The tail is written to a temp file first and only then moved over the log
if exist "%LOG_FILE%.tmp" del "%LOG_FILE%.tmp"
powershell -Command "Get-Content '%LOG_FILE%' -Tail 1000 | Set-Content '%LOG_FILE%.tmp'" 2>nul
if exist "%LOG_FILE%.tmp" (
    move /y "%LOG_FILE%.tmp" "%LOG_FILE%" >nul
)

REM endlocal and exit must share ONE line: the exit-code variable is expanded
REM when the line is parsed, i.e. before endlocal discards the local variables.
REM On separate lines the variable would already be empty at the exit command
REM and the caller would receive the status of the last command instead.
endlocal & exit /b %SCRIPT_EXIT_CODE%

hhh

In [None]:
#!/usr/bin/env python3
"""
daily_chl_pipeline.py

Daily Sentinel-3 OLCI (S3A/S3B) chlorophyll monitoring pipeline with gap detection.
"""

import os
import glob
import time
import shutil
import subprocess
from datetime import datetime, timedelta
import logging

import earthaccess
import xarray as xr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

# ---------- CONFIG ----------
# Search area forwarded as ``bounding_box`` to earthaccess.search_data.
# NOTE(review): presumably (min lon, min lat, max lon, max lat) in degrees — confirm against earthaccess docs.
BBOX = (115.4, -32.65, 115.8, -31.70)
# Location around which the nearest pixels are averaged to get one chlorophyll value.
TARGET_LAT = -32.20085
TARGET_LON = 115.77047

# Working directory; the CSV store, plots folder and gap report below all live under it.
DOWNLOAD_DIR = r"C:/Users/23755118/OneDrive - UWA/Documents/PhD_Varshani/CODING/chl_time"
STORE_CSV = os.path.join(DOWNLOAD_DIR, "chl_timeseries.csv")
PLOT_DIR = os.path.join(DOWNLOAD_DIR, "plots")
# Bookkeeping CSV read and rewritten by update_gap_report / check_and_fill_gaps.
GAP_REPORT = os.path.join(DOWNLOAD_DIR, "gap_report.csv")
# NOTE(review): presumably the retention period in days handed to cleanup_old_files — the caller is not visible here.
KEEP_DAYS = 30

# NOTE(review): these look like the switch and date window for a one-off historical
# download; where they are consumed is outside this view — confirm.
INITIAL_BULK_DOWNLOAD = True
BULK_START_DATE = "2025-10-01"
BULK_END_DATE = "2025-10-16"

# Gap detection: check_and_fill_gaps searches for missing days from GAP_CHECK_START_DATE onward.
# NOTE(review): ENABLE_GAP_DETECTION / MAX_GAPS_TO_FILL are presumably read by the entry point — confirm.
ENABLE_GAP_DETECTION = True
MAX_GAPS_TO_FILL = 5
GAP_CHECK_START_DATE = "2025-10-01"

# Google Drive upload switch and target folder (the id is a redacted placeholder here).
ENABLE_DRIVE_UPLOAD = False
DRIVE_FOLDER_ID = "***"

# Git publishing settings; the repository is the working directory itself.
ENABLE_GIT_PUSH = True
GIT_REPO_PATH = DOWNLOAD_DIR
# ``{date}`` is a str.format-style placeholder.
GIT_COMMIT_MESSAGE_TEMPLATE = "Auto-update chlorophyll data: {date}"
# ----------------------------

# Configure the root logger at INFO with timestamped lines. No stream or file is
# given, so logging's default handler is used, which writes to stderr.
logging.basicConfig(level=logging.INFO, format="%(asctime)s  %(levelname)s: %(message)s")
logger = logging.getLogger("chl_pipeline")


def get_yesterday_str(utc=True):
    """Return yesterday's calendar date formatted as ``YYYY-MM-DD``.

    Args:
        utc: When True (default) "yesterday" is taken relative to the current
            UTC time; when False, relative to the machine's local clock.

    Returns:
        str: The date one day before the reference time.
    """
    # Imported here because the module header only brings in datetime/timedelta.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # yields exactly the same calendar date once formatted.
    reference = datetime.now(timezone.utc) if utc else datetime.now()
    return (reference - timedelta(days=1)).strftime("%Y-%m-%d")


def earthdata_login_check():
    """Log in to Earthdata through ``earthaccess`` and return the login result.

    Returns:
        Whatever ``earthaccess.login()`` returns.

    Raises:
        Exception: Any error raised by ``earthaccess.login()`` is logged with
            its traceback and then re-raised unchanged.
    """
    logger.info("Logging in to Earthdata via earthaccess...")
    try:
        # Only the login call itself is guarded; the exception object is not
        # needed because logger.exception records the active traceback.
        session = earthaccess.login()
    except Exception:
        logger.exception("Earthdata login failed. Check credentials (.netrc) and network.")
        raise
    logger.info("Earthdata login OK.")
    return session


def check_satellite_data_availability(date_str, bbox, short_names=None):
    """Count the granules available for one day inside a bounding box.

    Logs in via ``earthdata_login_check`` and then searches every collection
    in *short_names* (defaulting to the two OLCI NRT short names below) for
    granules on *date_str*. A failure while searching one collection is logged
    as a warning and that collection is skipped.

    Args:
        date_str: Day to search; used as both ends of the temporal range.
        bbox: Bounding box forwarded unchanged to ``earthaccess.search_data``.
        short_names: Optional list of collection short names.

    Returns:
        tuple: ``(available, total_count, granule_info)`` where *available* is
        True when at least one granule was found, *total_count* is the sum
        over all collections, and *granule_info* lists
        ``{"satellite", "count"}`` dicts for collections with hits. If login
        (or anything outside the per-collection guard) fails, the error is
        logged and ``(False, 0, [])`` is returned.
    """
    collections = short_names
    if collections is None:
        collections = ["OLCIS3A_L2_EFR_OC_NRT", "OLCIS3B_L2_EFR_OC_NRT"]

    try:
        earthdata_login_check()
        hits_per_collection = []
        granule_total = 0

        for collection in collections:
            try:
                found = earthaccess.search_data(
                    short_name=collection,
                    temporal=(date_str, date_str),
                    bounding_box=bbox,
                )
                n_found = len(found or ())
                granule_total += n_found
                if n_found > 0:
                    hits_per_collection.append({"satellite": collection, "count": n_found})
                logger.info(f"  {collection}: {n_found} granules found for {date_str}")
            except Exception as e:
                logger.warning(f"Error checking {collection} for {date_str}: {e}")

        return granule_total > 0, granule_total, hits_per_collection
    except Exception as e:
        logger.error(f"Failed to check data availability for {date_str}: {e}")
        return False, 0, []


def identify_missing_days(csv_path, start_date_str, end_date_str=None):
    """List the calendar days in a date range that have no row in the CSV.

    Args:
        csv_path: Path of the timeseries CSV; it must contain a ``date`` column.
        start_date_str: First day of the range (anything ``pd.to_datetime``
            accepts).
        end_date_str: Last day of the range. When omitted, the current UTC
            time is used, so today's date is part of the range.

    Returns:
        pandas.DataFrame: Empty when the CSV does not exist or nothing is
        missing. Otherwise one row per missing day with columns ``date``
        (``datetime.date`` objects, ascending), ``status`` (always
        ``"missing"``) and ``checked`` (``NaT``).
    """
    if not os.path.exists(csv_path):
        logger.info("No existing CSV found.")
        return pd.DataFrame()

    df = pd.read_csv(csv_path)
    # Parse explicitly with errors="coerce": a single unparseable cell would
    # otherwise leave the column as plain strings and make the .dt accessor
    # raise. Bad cells become NaT and are dropped, as the caller expects.
    parsed_dates = pd.to_datetime(df["date"], errors="coerce").dropna()
    existing_dates = set(parsed_dates.dt.date)

    start_date = pd.to_datetime(start_date_str)
    if end_date_str:
        end_date = pd.to_datetime(end_date_str)
    else:
        # Naive UTC "now" (replacement for the deprecated datetime.utcnow()).
        end_date = pd.Timestamp.now(tz="UTC").tz_localize(None)

    full_range = pd.date_range(start=start_date, end=end_date, freq='D')
    missing_dates = sorted(set(full_range.date) - existing_dates)

    if not missing_dates:
        logger.info("No missing days found")
        return pd.DataFrame()

    logger.info(f"Found {len(missing_dates)} missing days in timeseries")
    return pd.DataFrame({'date': missing_dates, 'status': 'missing', 'checked': pd.NaT})


def update_gap_report(gap_df, date_str, available, count, status="checked"):
    """Insert or refresh the row for one date in the gap report CSV.

    The report is always (re)loaded from ``GAP_REPORT`` on disk, or started
    empty when that file does not exist. The *gap_df* argument is kept for
    backward compatibility only; its contents are never used.

    Args:
        gap_df: Ignored (see above).
        date_str: Day the row refers to.
        available: Whether data was found for that day.
        count: Number of granules found.
        status: Free-text status stored in the ``status`` column.

    Returns:
        pandas.DataFrame: The updated report, sorted by ``date``, exactly as
        written back to ``GAP_REPORT``.
    """
    columns = ['date', 'status', 'available', 'granule_count', 'last_checked']
    if os.path.exists(GAP_REPORT):
        report = pd.read_csv(GAP_REPORT, parse_dates=['date', 'last_checked'])
    else:
        report = pd.DataFrame(columns=columns)

    date_obj = pd.to_datetime(date_str)
    # Naive UTC timestamp (replacement for the deprecated datetime.utcnow()).
    checked_at = pd.Timestamp.now(tz="UTC").tz_localize(None)
    mask = report['date'] == date_obj

    if mask.any():
        report.loc[mask, 'status'] = status
        report.loc[mask, 'available'] = available
        report.loc[mask, 'granule_count'] = count
        report.loc[mask, 'last_checked'] = checked_at
    else:
        new_row = pd.DataFrame({
            'date': [date_obj],
            'status': [status],
            'available': [available],
            'granule_count': [count],
            'last_checked': [checked_at]
        })
        # Concatenating onto an empty frame is deprecated in recent pandas and
        # lets the empty object columns influence the result dtypes, so the
        # first row simply becomes the report.
        if report.empty:
            report = new_row
        else:
            report = pd.concat([report, new_row], ignore_index=True)

    report = report.sort_values('date')
    # NOTE(review): date_format applies to every datetime column, so
    # last_checked is stored with day precision only — confirm that is intended.
    report.to_csv(GAP_REPORT, index=False, date_format='%Y-%m-%d')
    return report


def check_and_fill_gaps(csv_path, bbox, download_dir, max_gaps=5):
    """Try to fill the first *max_gaps* missing days of the timeseries.

    Missing days come from ``identify_missing_days`` starting at
    ``GAP_CHECK_START_DATE``. For each of them (in order) the function:

    * skips the day if the gap report loaded at the start shows a check within
      the last 24 hours that found no data;
    * otherwise asks ``check_satellite_data_availability`` and, when data
      exists, downloads it with ``fetch_daily_files``, processes it with
      ``process_downloaded_files`` and re-reads the CSV to see whether the day
      is now present;
    * records the outcome through ``update_gap_report`` and sleeps 2 seconds.

    Args:
        csv_path: Timeseries CSV to inspect and update.
        bbox: Bounding box forwarded to the search/download helpers.
        download_dir: Directory handed to ``fetch_daily_files``.
        max_gaps: Maximum number of missing days taken from the head of the list.

    Returns:
        int: Number of days whose gap was confirmed filled.
    """
    logger.info("=== Starting gap detection and filling ===")

    missing_df = identify_missing_days(csv_path, GAP_CHECK_START_DATE)
    if missing_df.empty:
        logger.info("No gaps to fill")
        return 0

    # Snapshot of the report used only for the "checked recently" test below.
    gap_report = (
        pd.read_csv(GAP_REPORT, parse_dates=['date', 'last_checked'])
        if os.path.exists(GAP_REPORT)
        else pd.DataFrame()
    )

    filled_total = 0
    checked_total = 0

    for _, gap_row in missing_df.head(max_gaps).iterrows():
        date_str = gap_row['date'].strftime('%Y-%m-%d')
        logger.info(f"\n--- Checking gap for {date_str} ---")

        if not gap_report.empty:
            same_day = gap_report['date'] == pd.to_datetime(date_str)
            checked_lately = gap_report['last_checked'] > datetime.utcnow() - timedelta(hours=24)
            recent_check = gap_report[same_day & checked_lately]
            if not recent_check.empty and recent_check.iloc[0]['available'] == False:
                logger.info(f"  Skipping {date_str} - checked recently, no data")
                continue

        available, count, _granule_info = check_satellite_data_availability(date_str, bbox)
        checked_total += 1

        if not available:
            logger.info(f"  No data available for {date_str}")
            update_gap_report(gap_report, date_str, False, 0, status="unavailable")
        else:
            logger.info(f"  Data available for {date_str}! Downloading...")
            update_gap_report(gap_report, date_str, True, count, status="available")

            files = fetch_daily_files(date_str, bbox, download_dir)

            if not files:
                logger.warning(f"  ✗ Download failed for {date_str}")
                update_gap_report(gap_report, date_str, True, count, status="download_failed")
            else:
                logger.info(f"  Downloaded {len(files)} files")
                process_downloaded_files(files, TARGET_LAT, TARGET_LON, csv_path)

                # Confirm from disk that the day actually made it into the CSV.
                if os.path.exists(csv_path):
                    verify_df = pd.read_csv(csv_path, parse_dates=['date'])
                    if pd.to_datetime(date_str) in verify_df['date'].values:
                        logger.info(f"  ✓ Successfully filled gap for {date_str}")
                        update_gap_report(gap_report, date_str, True, count, status="filled")
                        filled_total += 1
                    else:
                        logger.warning(f"  ✗ No data extracted for {date_str}")
                        update_gap_report(gap_report, date_str, True, count, status="no_extraction")

        time.sleep(2)

    logger.info(f"\n=== Gap filling complete: {filled_total} filled, {checked_total} checked ===")
    return filled_total


def fetch_date_range_files(start_date, end_date, bbox, download_dir, short_names=None):
    """Download granules for a date range and list the ``.nc`` files on disk.

    After logging in, each collection in *short_names* (default: the two OLCI
    NRT short names below) is searched for the range and any hits are
    downloaded into *download_dir*. After each successful download the
    function waits 2 seconds and collects every ``*.nc`` file currently in
    *download_dir*. Errors for one collection are logged and do not stop the
    others.

    Args:
        start_date: Start of the temporal search range.
        end_date: End of the temporal search range.
        bbox: Bounding box forwarded to ``earthaccess.search_data``.
        download_dir: Target directory; created if missing.
        short_names: Optional list of collection short names.

    Returns:
        list: Sorted, de-duplicated absolute paths of the collected files.
    """
    collections = short_names
    if collections is None:
        collections = ["OLCIS3A_L2_EFR_OC_NRT", "OLCIS3B_L2_EFR_OC_NRT"]

    os.makedirs(download_dir, exist_ok=True)
    collected = []
    earthdata_login_check()

    for collection in collections:
        try:
            logger.info(f"Searching {collection} for {start_date} to {end_date}...")
            granules = earthaccess.search_data(
                short_name=collection,
                temporal=(start_date, end_date),
                bounding_box=bbox,
            )
            if not granules:
                logger.info(f"No results for {collection}")
                continue

            logger.info(f"Found {len(granules)} items. Downloading...")
            # Retry with the keyword form if the positional call is rejected.
            try:
                earthaccess.download(granules, download_dir)
            except TypeError:
                earthaccess.download(granules, path=download_dir)

            time.sleep(2)
            collected += glob.glob(os.path.join(download_dir, "*.nc"))
        except Exception as e:
            logger.exception(f"Error with {collection}: {e}")

    unique_paths = sorted({os.path.abspath(path) for path in collected})
    logger.info(f"Total files: {len(unique_paths)}")
    return unique_paths


def fetch_daily_files(date_str, bbox, download_dir, short_names=None):
    """Download granules for a single day and list the ``.nc`` files on disk.

    After logging in, each collection in *short_names* (default: the two OLCI
    NRT short names below) is searched for *date_str* and any hits are
    downloaded into *download_dir*. After each successful download the
    function waits 1 second and collects every ``*.nc`` file currently in
    *download_dir*. Errors for one collection are logged and do not stop the
    others.

    Args:
        date_str: Day to fetch; used as both ends of the temporal range.
        bbox: Bounding box forwarded to ``earthaccess.search_data``.
        download_dir: Target directory; created if missing.
        short_names: Optional list of collection short names.

    Returns:
        list: Sorted, de-duplicated absolute paths of the collected files.
    """
    collections = short_names
    if collections is None:
        collections = ["OLCIS3A_L2_EFR_OC_NRT", "OLCIS3B_L2_EFR_OC_NRT"]

    os.makedirs(download_dir, exist_ok=True)
    collected = []
    earthdata_login_check()

    for collection in collections:
        try:
            logger.info(f"Searching {collection} for {date_str}...")
            granules = earthaccess.search_data(
                short_name=collection,
                temporal=(date_str, date_str),
                bounding_box=bbox,
            )
            if not granules:
                logger.info(f"No results for {collection}")
                continue

            logger.info(f"Found {len(granules)} items. Downloading...")
            # Retry with the keyword form if the positional call is rejected.
            try:
                earthaccess.download(granules, download_dir)
            except TypeError:
                earthaccess.download(granules, path=download_dir)

            time.sleep(1)
            collected += glob.glob(os.path.join(download_dir, "*.nc"))
        except Exception as e:
            logger.exception(f"Error with {collection}: {e}")

    return sorted({os.path.abspath(path) for path in collected})


def _window_nanmean(values, row, col, before, after):
    """Return the NaN-ignoring mean of a window around ``(row, col)``.

    The window covers rows ``row - before`` up to (not including)
    ``row + after`` and the same span of columns, clipped to the array
    bounds. ``np.nan`` is returned when every cell in the window is NaN.
    """
    window = values[
        max(0, row - before):min(values.shape[0], row + after),
        max(0, col - before):min(values.shape[1], col + after),
    ]
    # Checking first avoids the RuntimeWarning np.nanmean emits for an
    # all-NaN slice; the result (NaN) is the same.
    if np.isnan(window).all():
        return np.nan
    return np.nanmean(window)


def extract_nearest_3x3_satellite_2d(dataset, target_lat, target_lon, var_name='chlor_a'):
    """
    Extract the mean of *var_name* around the pixel nearest to a target point.

    The nearest pixel is the one minimising the plain Euclidean distance in
    latitude/longitude values. A 3x3 window centred on it is tried first; if
    that yields no finite mean, a 4x4 window (two cells before, one after the
    pixel on each axis, 16 pixels) is tried. Windows are clipped at the array
    edges.

    When the coordinate arrays have a different shape from the data array,
    the nearest coordinate index is scaled up by the rounded shape ratio to
    find the data pixel.

    Args:
        dataset: Object exposing ``latitude.values``, ``longitude.values`` and
            ``dataset[var_name].values`` as 2-D arrays.
        target_lat: Target latitude, in the units of ``dataset.latitude``.
        target_lon: Target longitude, in the units of ``dataset.longitude``.
        var_name: Name of the data variable to average.

    Returns:
        float: The window mean, or ``np.nan`` when both windows hold no
        finite mean or any error occurs (errors are logged, not raised).
    """
    try:
        lat_coords = dataset.latitude.values
        lon_coords = dataset.longitude.values
        data_values = dataset[var_name].values

        distances = np.sqrt((lat_coords - target_lat) ** 2 + (lon_coords - target_lon) ** 2)
        nearest_row, nearest_col = np.unravel_index(np.nanargmin(distances), distances.shape)

        if lat_coords.shape == data_values.shape:
            row, col = nearest_row, nearest_col
            msg_3x3_ok = "3x3 extraction successful: {:.4f}"
            msg_retry = "3x3 all NaN, trying 4x4..."
            msg_4x4_ok = "4x4 extraction successful: {:.4f} (3x3 was all NaN)"
            msg_failed = "Both 3x3 and 4x4 extraction returned NaN"
        else:
            # coords subsampled => map the coordinate index onto the data grid
            scale_line = max(1, int(round(data_values.shape[0] / lat_coords.shape[0])))
            scale_pixel = max(1, int(round(data_values.shape[1] / lat_coords.shape[1])))
            row = min(data_values.shape[0] - 1, nearest_row * scale_line)
            col = min(data_values.shape[1] - 1, nearest_col * scale_pixel)
            msg_3x3_ok = "3x3 extraction (scaled) successful: {:.4f}"
            msg_retry = "3x3 (scaled) all NaN, trying 4x4..."
            msg_4x4_ok = "4x4 extraction (scaled) successful: {:.4f}"
            msg_failed = "Both 3x3 and 4x4 (scaled) returned NaN"

        # Try 3x3 first
        chl_mean_3x3 = _window_nanmean(data_values, row, col, 1, 2)
        if np.isfinite(chl_mean_3x3):
            logger.debug(msg_3x3_ok.format(chl_mean_3x3))
            return float(chl_mean_3x3)

        # Otherwise try 4x4 (16 pixels)
        logger.debug(msg_retry)
        chl_mean_4x4 = _window_nanmean(data_values, row, col, 2, 2)
        if np.isfinite(chl_mean_4x4):
            logger.info(msg_4x4_ok.format(chl_mean_4x4))
            return float(chl_mean_4x4)

        logger.warning(msg_failed)
        return np.nan

    except Exception as e:
        logger.exception(f"Failed to extract region: {e}")
        return np.nan


def _date_from_filename(fname):
    """Return the first ``YYYYMMDD`` token of *fname* as ``YYYY-MM-DD``, else None."""
    for token in fname.replace(".", "_").split("_"):
        if len(token) >= 8 and token[:8].isdigit():
            try:
                return datetime.strptime(token[:8], "%Y%m%d").strftime("%Y-%m-%d")
            except ValueError:
                # Eight digits that are not a real calendar date; keep looking.
                continue
    return None


def _read_store_or_empty(store_csv):
    """Load the CSV store if it exists, otherwise return an empty frame."""
    if os.path.exists(store_csv):
        return pd.read_csv(store_csv, parse_dates=["date"])
    return pd.DataFrame(columns=["date", "chlor_a"])


def process_downloaded_files(files, target_lat, target_lon, store_csv):
    """
    Extract one chlorophyll value per file and merge the results into the CSV store.

    For every path in *files* the date is taken from the first ``YYYYMMDD``
    token in the file name (falling back to yesterday, UTC, with a warning),
    the file is opened with xarray, and the mean around the target location
    is computed by ``extract_nearest_3x3_satellite_2d``. Files that fail or
    yield no finite value are logged and skipped. New values are de-duplicated
    by date (last one wins), merged over existing CSV rows (new rows win),
    sorted and written back to *store_csv*.

    Args:
        files: Iterable of NetCDF file paths; may span many days.
        target_lat: Latitude of the extraction point.
        target_lon: Longitude of the extraction point.
        store_csv: Path of the CSV store (columns ``date``, ``chlor_a``).

    Returns:
        pandas.DataFrame: The merged timeseries. When *files* is empty or no
        valid value was extracted, the existing store is returned unchanged
        (or an empty ``date``/``chlor_a`` frame if there is none).
    """
    if not files:
        logger.warning("No files provided to process_downloaded_files")
        return _read_store_or_empty(store_csv)

    rows = []
    for fpath in files:
        try:
            fname = os.path.basename(fpath)

            # Extract date from filename
            date_str = _date_from_filename(fname)
            if date_str is None:
                logger.warning(f"Could not extract date from filename: {fname}, using yesterday")
                date_str = (pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=1)).strftime("%Y-%m-%d")

            # Both handles start as None for every file so the cleanup below
            # never touches an unbound name or a handle left over from a
            # previous iteration, and they are closed even if extraction fails.
            datatree = None
            dataset = None
            try:
                try:
                    datatree = xr.open_datatree(fpath)
                    dataset = xr.merge(datatree.to_dict().values())
                except Exception:
                    # Fall back to a flat dataset when the tree cannot be opened or merged.
                    dataset = xr.open_dataset(fpath)

                chl_mean = extract_nearest_3x3_satellite_2d(dataset, target_lat, target_lon, var_name='chlor_a')
            finally:
                for handle in (dataset, datatree):
                    if handle is None:
                        continue
                    try:
                        handle.close()
                    except Exception:
                        # Closing is best-effort; a failure here must not discard the value.
                        logger.debug(f"Could not close a handle for {fname}", exc_info=True)

            # Only add rows with valid chlorophyll data
            if np.isfinite(chl_mean):
                rows.append((date_str, chl_mean, fname))
                logger.info(f"Processed {fname} -> {date_str}, chl={chl_mean:.4f}")
            else:
                logger.warning(f"Processed {fname} -> {date_str}, chl=NaN (no valid data in 3x3 or 4x4)")

        except Exception as e:
            logger.exception(f"Error processing {fpath}: {e}")

    if not rows:
        logger.warning("No valid rows extracted from files (all NaN or errors)")
        return _read_store_or_empty(store_csv)

    # Build DataFrame and append to CSV (deduplicate by date)
    df_new = pd.DataFrame(rows, columns=["date", "chlor_a", "source_file"])
    df_new["date"] = pd.to_datetime(df_new["date"], format="%Y-%m-%d", errors="coerce")
    df_new = df_new.dropna(subset=["date"])
    df_new = df_new.sort_values("date")
    df_new = df_new.drop_duplicates(subset=["date"], keep="last")

    logger.info(f"Extracted {len(df_new)} valid records from {len(files)} files")

    # Load existing CSV
    if os.path.exists(store_csv):
        df_old = pd.read_csv(store_csv, parse_dates=["date"])
        logger.info(f"Existing CSV has {len(df_old)} records")
        df_merged = pd.concat([df_old, df_new[["date", "chlor_a"]]])
        # New rows come last in the concat, so keep="last" lets them override old ones.
        df_merged = df_merged.drop_duplicates(subset=["date"], keep="last")
        df_merged = df_merged.sort_values("date")
        logger.info(f"After merge: {len(df_merged)} total records")
    else:
        df_merged = df_new[["date", "chlor_a"]].copy()
        logger.info(f"Creating new CSV with {len(df_merged)} records")

    # Save to CSV
    df_merged.to_csv(store_csv, index=False, date_format="%Y-%m-%d")
    logger.info(f"✓ CSV saved: {store_csv} ({len(df_merged)} rows)")

    # Verify CSV was written
    if os.path.exists(store_csv):
        verify_df = pd.read_csv(store_csv)
        logger.info(f"✓ CSV verified: {len(verify_df)} rows on disk")
    else:
        logger.error(f"✗ CSV file not found after save: {store_csv}")

    return df_merged


def _add_chl_traces(fig, frame, stem_width, normal_size, high_size, outline_width, text_size):
    """Draw stem-and-marker chlorophyll traces for *frame* on *fig*.

    Rows with ``chlor_a`` <= 5 are drawn in green and rows above 5 in red.
    Every row gets its own vertical line from 0 to its value (hidden from the
    legend and hover); each non-empty group then gets one marker trace that
    carries the legend entry and the hover text.
    """
    normal = frame[frame["chlor_a"] <= 5]
    high = frame[frame["chlor_a"] > 5]

    for group, stem_color in ((normal, "#90EE90"), (high, "#FF6B6B")):
        for _, row in group.iterrows():
            fig.add_trace(go.Scatter(
                x=[row["date"], row["date"]], y=[0, row["chlor_a"]],
                mode="lines", line=dict(color=stem_color, width=stem_width),
                showlegend=False, hoverinfo="skip"
            ))

    if not normal.empty:
        fig.add_trace(go.Scatter(
            x=normal["date"], y=normal["chlor_a"],
            mode="markers", name="Normal (≤5 mg/m³)",
            marker=dict(size=normal_size, color="#90EE90", line=dict(color="#228B22", width=outline_width)),
            hovertemplate="Date: %{x|%Y-%m-%d}<br>Chl-a: %{y:.4f} mg/m³<extra></extra>"
        ))

    if not high.empty:
        fig.add_trace(go.Scatter(
            x=high["date"], y=high["chlor_a"],
            mode="markers+text", name="High (>5 mg/m³)",
            marker=dict(size=high_size, color="#FF6B6B", line=dict(color="#8B0000", width=outline_width)),
            text="?", textfont=dict(size=text_size, color="white", family="Arial Black"),
            textposition="middle center",
            hovertemplate="Date: %{x|%Y-%m-%d}<br>Chl-a: %{y:.4f} mg/m³ ⚠️<extra></extra>"
        ))


def _build_window_figure(frame, **trace_style):
    """Return a figure for *frame*, or one with a "no data" note if it is empty."""
    fig = go.Figure()
    if frame.empty:
        fig.add_annotation(
            text="No data available for this period",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=14, color="gray")
        )
    else:
        _add_chl_traces(fig, frame, **trace_style)
    return fig


def _write_html_and_png(fig, html_path, png_path, width, height):
    """Write *fig* as HTML, then try a PNG export; a PNG failure is only logged."""
    pio.write_html(fig, file=html_path, include_plotlyjs='cdn')
    try:
        fig.write_image(png_path, width=width, height=height)
    except Exception as e:
        logger.warning(f"PNG save failed: {e}")


def generate_plots(csv_path, out_dir):
    """Render the 5-day, current-month and full-history chlorophyll plots.

    Reads *csv_path* (columns ``date`` and ``chlor_a``), drops rows without a
    chlorophyll value, and writes into *out_dir* (created if missing):

    * a 5-day plot ending at the latest date in the data (HTML + PNG),
    * a plot from the first day of that date's month (HTML + PNG),
    * an interactive full-history plot with range selector and slider (HTML).

    Args:
        csv_path: Path of the timeseries CSV.
        out_dir: Directory that receives the plot files.

    Returns:
        dict: Output paths keyed by ``5day_html``, ``5day_png``,
        ``month_html``, ``month_png`` and ``full_interactive``. The PNG keys
        are set even when the PNG export failed. An empty dict is returned
        when there is nothing to plot.
    """
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path, parse_dates=["date"])
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values("date").dropna(subset=["chlor_a"])

    if df.empty:
        logger.warning("No data for plotting")
        return {}

    latest_date = df['date'].max()
    out_paths = {}

    # 5-day plot
    start_5d = latest_date - timedelta(days=4)
    fig_5d = _build_window_figure(
        df[df["date"].between(start_5d, latest_date)],
        stem_width=2, normal_size=12, high_size=16, outline_width=2, text_size=10,
    )
    fig_5d.update_layout(
        title=f"5-Day Chlorophyll Trend (as of {latest_date.date()})",
        xaxis_title="Date", yaxis_title="Chlorophyll-a (mg/m³)",
        hovermode="x unified", template="plotly_white",
        height=500, width=900, showlegend=True,
        xaxis=dict(range=[start_5d, latest_date])  # Fixed x-axis range
    )

    p5_html = os.path.join(out_dir, f"chl_5day_{latest_date.date()}.html")
    p5_png = os.path.join(out_dir, f"chl_5day_{latest_date.date()}.png")
    _write_html_and_png(fig_5d, p5_html, p5_png, width=900, height=500)
    out_paths["5day_html"] = p5_html
    out_paths["5day_png"] = p5_png
    logger.info(f"5-day plot saved: {p5_html}")

    # Monthly plot
    month_start = latest_date.replace(day=1)
    fig_month = _build_window_figure(
        df[df["date"].between(month_start, latest_date)],
        stem_width=2, normal_size=10, high_size=14, outline_width=2, text_size=9,
    )
    fig_month.update_layout(
        title=f"{latest_date.strftime('%B %Y')} Chlorophyll Trend",
        xaxis_title="Date", yaxis_title="Chlorophyll-a (mg/m³)",
        hovermode="x unified", template="plotly_white",
        height=500, width=1000, showlegend=True,
        xaxis=dict(range=[month_start, latest_date])  # Fixed x-axis range
    )

    pmonth_html = os.path.join(out_dir, f"chl_month_{latest_date.strftime('%Y-%m')}.html")
    pmonth_png = os.path.join(out_dir, f"chl_month_{latest_date.strftime('%Y-%m')}.png")
    _write_html_and_png(fig_month, pmonth_html, pmonth_png, width=1000, height=500)
    out_paths["month_html"] = pmonth_html
    out_paths["month_png"] = pmonth_png
    logger.info(f"Monthly plot saved: {pmonth_html}")

    # Full interactive timeseries (df is known to be non-empty at this point)
    fig_full = go.Figure()
    _add_chl_traces(
        fig_full, df,
        stem_width=1.5, normal_size=8, high_size=12, outline_width=1.5, text_size=8,
    )

    # Set x-axis range from start of data to the current UTC date (midnight)
    data_start = df['date'].min()
    current_date = pd.Timestamp.now(tz="UTC").tz_localize(None).normalize()

    fig_full.update_layout(
        title="Interactive Chlorophyll Timeseries - Full Dataset",
        xaxis_title="Date", yaxis_title="Chlorophyll-a (mg/m³)",
        hovermode="x unified", template="plotly_white",
        height=600, width=1200,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        xaxis=dict(range=[data_start, current_date])  # Fixed range to current date
    )

    fig_full.update_xaxes(
        rangeselector=dict(buttons=[
            dict(count=7, label="7d", step="day", stepmode="backward"),
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ]),
        rangeslider=dict(visible=True),
        type="date"
    )

    pfull = os.path.join(out_dir, "chlorophyll_timeseries_interactive.html")
    pio.write_html(fig_full, file=pfull, include_plotlyjs='cdn')
    out_paths["full_interactive"] = pfull
    logger.info(f"Full plot saved: {pfull}")

    return out_paths


def upload_to_google_drive(local_path, folder_id):
    """Upload a local file into a Google Drive folder via pydrive2.

    Args:
        local_path: Path of the file to upload; its base name becomes the
            Drive file title.
        folder_id: ID of the Drive folder set as the uploaded file's parent.

    Returns:
        The value of the uploaded file's ``alternateLink`` field, or ``None``
        when pydrive2 cannot be imported or any step of the upload raises.
    """
    # pydrive2 is imported lazily so the rest of the pipeline still works
    # when the package is absent.
    try:
        from pydrive2.auth import GoogleAuth
        from pydrive2.drive import GoogleDrive
    except ImportError:
        logger.error("pydrive2 not installed")
        return None

    try:
        auth = GoogleAuth()
        auth.LocalWebserverAuth()
        client = GoogleDrive(auth)

        metadata = {
            'title': os.path.basename(local_path),
            'parents': [{'id': folder_id}],
        }
        remote_file = client.CreateFile(metadata)
        remote_file.SetContentFile(local_path)
        remote_file.Upload()

        link = remote_file.get('alternateLink')
        logger.info(f"Uploaded: {link}")
    except Exception as e:
        # Best effort: a failed upload is logged with traceback, never raised.
        logger.exception(f"Upload failed: {e}")
        return None
    return link


def cleanup_old_files(directory, keep_days=30):
    """Delete old entries from ``directory``, always keeping ``.nc`` files.

    Every path matched by ``<directory>/*`` whose modification time is more
    than ``keep_days`` days in the past is removed (directories recursively).
    Paths ending in ``.nc`` are never touched.

    Ages are compared as epoch seconds, which avoids the naive-UTC helpers
    ``datetime.utcnow()`` / ``datetime.utcfromtimestamp()`` that are
    deprecated since Python 3.12.

    Args:
        directory: Directory whose direct children are examined.
        keep_days: Age threshold in days; older entries are removed.

    Returns:
        None. The number of removed entries is written to the log.
    """
    import time  # local import: only this function needs it

    cutoff_ts = time.time() - timedelta(days=keep_days).total_seconds()
    removed = 0
    for path in glob.glob(os.path.join(directory, "*")):
        if path.endswith('.nc'):
            continue

        try:
            if os.path.getmtime(path) >= cutoff_ts:
                continue  # still within the retention window
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
            removed += 1
        except Exception:
            # Best effort: one undeletable entry must not stop the cleanup.
            logger.exception(f"Failed cleaning {path}")
    logger.info(f"Cleanup done. Removed {removed} files (excluding .nc)")


def git_commit_and_push(repo_path, files_to_add=None, commit_message=None):
    """Stage, commit and push changes in the Git repository at ``repo_path``.

    Every Git command is executed with ``cwd=repo_path``; the process-wide
    working directory is never changed, so there is nothing to restore on
    any exit path.

    Steps: verify ``repo_path`` is inside a work tree, set the repository's
    ``user.email`` / ``user.name``, stage files, commit if ``git status
    --porcelain`` reports changes, then push the current branch to
    ``origin`` (retrying once with ``-u`` if the first push fails).

    Args:
        repo_path: Directory inside the Git work tree to operate on.
        files_to_add: Iterable of paths to stage individually, or ``None``
            to stage everything with ``git add -A``.
        commit_message: Commit message; when ``None`` a message of the form
            ``"Auto-update: <local timestamp>"`` is generated.

    Returns:
        ``True`` when there was nothing to commit, or when the commit
        succeeded (a failed push is logged but still returns ``True``
        because the commit exists locally). ``False`` when ``repo_path`` is
        not a Git work tree, a Git command fails before or during the
        commit, the branch cannot be determined, a command times out, or
        any other exception is raised.
    """

    def run_git(*args, timeout=10):
        # cwd= scopes the command to the repository; errors="replace" keeps
        # undecodable bytes in git's output from raising.
        return subprocess.run(
            ["git", *args],
            cwd=repo_path, capture_output=True, text=True,
            errors="replace", timeout=timeout,
        )

    try:
        if run_git("rev-parse", "--is-inside-work-tree").returncode != 0:
            logger.error(f"Not a git repository: {repo_path}")
            return False

        run_git("config", "user.email", "automated@pipeline.local")
        run_git("config", "user.name", "Automated Pipeline")

        if files_to_add is None:
            staging_results = [run_git("add", "-A", timeout=30)]
        else:
            staging_results = [run_git("add", path) for path in files_to_add]
        for staged in staging_results:
            # Staging stays best effort, but failures are no longer silent.
            if staged.returncode != 0:
                logger.warning(f"git add failed: {staged.stderr.strip()}")

        status = run_git("status", "--porcelain")
        if status.returncode != 0:
            # Without this check a failed status looks like a clean tree.
            logger.error(f"Git status failed: {status.stderr}")
            return False
        if not status.stdout.strip():
            logger.info("No changes to commit")
            return True

        if commit_message is None:
            commit_message = f"Auto-update: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

        commit = run_git("commit", "-m", commit_message, timeout=30)
        if commit.returncode != 0:
            logger.error(f"Git commit failed: {commit.stderr}")
            return False
        logger.info(f"Git commit successful: {commit_message}")

        branch = run_git("rev-parse", "--abbrev-ref", "HEAD")
        if branch.returncode != 0:
            logger.error("Could not determine branch")
            return False
        current_branch = branch.stdout.strip()
        logger.info(f"Current branch: {current_branch}")

        push = run_git("push", "origin", current_branch, timeout=60)
        if push.returncode != 0:
            logger.warning(f"Push failed, trying with -u: {push.stderr}")
            push = run_git("push", "-u", "origin", current_branch, timeout=60)
            if push.returncode != 0:
                logger.error(f"Git push failed: {push.stderr}")
                logger.warning("Committed locally but push failed")
                # The commit exists locally, so this still counts as success.
                return True

        logger.info(f"Git push successful to {current_branch}")
        return True

    except subprocess.TimeoutExpired:
        logger.error("Git operation timed out")
        return False
    except Exception as e:
        logger.exception(f"Git operation failed: {e}")
        return False


def check_first_run():
    """Tell whether the pipeline has no stored records yet.

    Returns:
        ``True`` when ``STORE_CSV`` does not exist, cannot be read as CSV,
        or contains zero rows; ``False`` otherwise.
    """
    if os.path.exists(STORE_CSV):
        try:
            row_count = len(pd.read_csv(STORE_CSV))
        except Exception:
            # An unreadable store is treated the same as a missing one.
            return True
        return row_count == 0
    return True


def print_gap_summary():
    """Log a summary of the gap report stored at ``GAP_REPORT``.

    The summary contains the number of gaps per status, the count and date
    range of gaps whose status is ``unavailable``, ``missing`` or
    ``download_failed``, and how many gaps have a ``last_checked`` value
    within the last 24 hours. Nothing is returned; any error while reading
    or summarising the report is logged and swallowed.
    """
    if not os.path.exists(GAP_REPORT):
        logger.info("No gap report available")
        return

    banner = "=" * 60
    open_statuses = ['unavailable', 'missing', 'download_failed']

    try:
        report = pd.read_csv(GAP_REPORT, parse_dates=['date', 'last_checked'])

        if report.empty:
            logger.info("No gaps recorded")
            return

        logger.info("\n" + banner)
        logger.info("GAP SUMMARY REPORT")
        logger.info(banner)

        # Per-status tally.
        tally = report['status'].value_counts()
        logger.info("\nGap Status Distribution:")
        for label, n_gaps in tally.items():
            logger.info(f"  {label}: {n_gaps}")

        # Gaps that still have no data.
        still_open = report[report['status'].isin(open_statuses)]
        if not still_open.empty:
            logger.info(f"\nUnfilled gaps: {len(still_open)}")
            earliest = still_open['date'].min().date()
            latest = still_open['date'].max().date()
            logger.info(f"  Date range: {earliest} to {latest}")

        # Gaps re-checked within the last day.
        checked_at = report['last_checked']
        window_start = datetime.utcnow() - timedelta(hours=24)
        recently_checked = report[checked_at > window_start]
        if not recently_checked.empty:
            logger.info(f"\nGaps checked in last 24 hours: {len(recently_checked)}")

        logger.info(banner + "\n")

    except Exception as e:
        logger.error(f"Error printing gap summary: {e}")


def main():
    """Run one complete pass of the chlorophyll pipeline.

    Stages, in order:
      1. Ensure the download and plot directories exist.
      2. Acquire data: a bulk download over the configured date range when
         ``INITIAL_BULK_DOWNLOAD`` is set and this is the first run,
         otherwise a fetch for yesterday (UTC) with the outcome recorded in
         the gap report.
      3. Gap detection and summary (skipped on a bulk run).
      4. Plot generation from the stored CSV and optional Drive upload of
         the monthly HTML plot.
      5. Optional Git commit and push.
      6. Cleanup of old files in the download directory.
    """
    logger.info("=== Starting daily chlorophyll pipeline ===")
    for folder in (DOWNLOAD_DIR, PLOT_DIR):
        os.makedirs(folder, exist_ok=True)

    is_first_run = check_first_run()
    # The same condition selects the acquisition mode, gates gap detection
    # and picks the commit label, so evaluate it once.
    bulk_mode = INITIAL_BULK_DOWNLOAD and is_first_run
    rule = "=" * 60

    if bulk_mode:
        logger.info(rule)
        logger.info("FIRST RUN: Performing initial bulk download")
        logger.info(f"Downloading data from {BULK_START_DATE} to {BULK_END_DATE}")
        logger.info(rule)

        bulk_files = fetch_date_range_files(BULK_START_DATE, BULK_END_DATE, BBOX, DOWNLOAD_DIR)

        if bulk_files:
            logger.info(f"Bulk download complete. Processing {len(bulk_files)} files...")
            records = process_downloaded_files(bulk_files, TARGET_LAT, TARGET_LON, STORE_CSV)
            logger.info(f"Initial CSV populated with {len(records)} records")
        else:
            logger.warning("No files downloaded during bulk download")

        logger.info(rule)
        logger.info("IMPORTANT: Set INITIAL_BULK_DOWNLOAD = False in CONFIG")
        logger.info("for subsequent runs to enable daily mode")
        logger.info(rule)
    else:
        target_day = get_yesterday_str(utc=True)
        logger.info(f"Daily mode: Searching for data for {target_day}")

        daily_files = fetch_daily_files(target_day, BBOX, DOWNLOAD_DIR)
        if daily_files:
            process_downloaded_files(daily_files, TARGET_LAT, TARGET_LON, STORE_CSV)
            update_gap_report(pd.DataFrame(), target_day, True, len(daily_files), status="filled")
        else:
            logger.info("No files downloaded for yesterday. Checking availability...")
            available, count, _ = check_satellite_data_availability(target_day, BBOX)
            if available:
                logger.warning(f"Data available but download failed for {target_day}")
                update_gap_report(pd.DataFrame(), target_day, True, count, status="download_failed")
            else:
                logger.info(f"No satellite data available for {target_day}")
                update_gap_report(pd.DataFrame(), target_day, False, 0, status="unavailable")

    if ENABLE_GAP_DETECTION and not bulk_mode:
        logger.info("\n" + rule)
        logger.info("STARTING GAP DETECTION")
        logger.info(rule)
        gaps_filled = check_and_fill_gaps(STORE_CSV, BBOX, DOWNLOAD_DIR, MAX_GAPS_TO_FILL)
        logger.info(f"Gap detection complete: {gaps_filled} gaps filled")
        print_gap_summary()

    if os.path.exists(STORE_CSV):
        stored = pd.read_csv(STORE_CSV, parse_dates=["date"])
        if not stored.empty:
            out_paths = generate_plots(STORE_CSV, PLOT_DIR)

            wants_upload = ENABLE_DRIVE_UPLOAD and out_paths and "month_html" in out_paths
            if wants_upload:
                link = upload_to_google_drive(out_paths["month_html"], DRIVE_FOLDER_ID)
                if link:
                    logger.info(f"Monthly plot uploaded: {link}")

    if ENABLE_GIT_PUSH:
        logger.info("Attempting to commit and push changes to Git...")
        if bulk_mode:
            commit_label = "bulk-download"
        else:
            commit_label = get_yesterday_str(utc=True)
        commit_msg = GIT_COMMIT_MESSAGE_TEMPLATE.format(date=commit_label)

        pushed = git_commit_and_push(GIT_REPO_PATH, files_to_add=None, commit_message=commit_msg)
        if pushed:
            logger.info("Git commit and push completed successfully")
        else:
            logger.warning("Git commit/push had issues - check logs")

    cleanup_old_files(DOWNLOAD_DIR, KEEP_DAYS)

    logger.info("=== Pipeline finished ===")


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()