'''#old version, runs way too long!
import ee
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from sklearn.linear_model import LinearRegression

# Initialize Earth Engine
# Define service account and key file path
SERVICE_ACCOUNT = ''
KEY_PATH = ''
# Authenticate and initialize Earth Engine API
EE_CREDENTIALS = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, KEY_PATH)
ee.Initialize(EE_CREDENTIALS)

# Load CSV file with soil sample data
# Adjust the path to your CSV file
data = pd.read_csv('/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/date_location_lucas.csv')

# Define cloud masking function
def mask_clouds_sentinel2(image):
    scl = image.select('SCL')
    cloud_mask = scl.eq(3).Or(scl.eq(4)).Or(scl.eq(5)).Or(scl.eq(6))
    return image.updateMask(cloud_mask)

# Define function to calculate indices
def calculate_indices_sentinel2(image):
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')
    ndmi = image.normalizedDifference(['B8', 'B11']).rename('NDMI')
    bsi = image.expression(
        '((SWIR + RED) - (NIR + BLUE)) / ((SWIR + RED) + (NIR + BLUE))', {
            'SWIR': image.select('B11'),
            'RED': image.select('B4'),
            'NIR': image.select('B8'),
            'BLUE': image.select('B2')
        }).rename('BSI')
    soci = image.select('B2').divide(image.select('B3').multiply(image.select('B4'))).rename('SOCI')
    return image.addBands([ndvi, ndmi, bsi, soci])

# Define function to calculate trend (slope of the regression line)
def calculate_trend(values):
    if len(values) < 2 or any(v is None for v in values):
        return None
    x = np.arange(len(values)).reshape(-1, 1)
    y = np.array(values).reshape(-1, 1)
    model = LinearRegression().fit(x, y)
    return model.coef_[0][0]

# Function to fetch mean, std dev, and trend for indices over 5 years
def fetch_statistics(lat, lon, sample_date):
    # Parse date and define 5-year range
    end_date = datetime.strptime(sample_date, '%Y-%m-%d')
    start_date = end_date - timedelta(days=5*365)
    
    point = ee.Geometry.Point(lon, lat)
    sentinel2 = ee.ImageCollection('COPERNICUS/S2_SR') \
                .filterDate(start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')) \
                .filterBounds(point) \
                .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 60)) \
                .map(mask_clouds_sentinel2) \
                .map(calculate_indices_sentinel2) \
                .select(['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12', 
                         'NDVI', 'NDMI', 'BSI', 'SOCI'])  # Select specific bands to ensure consistency
    
    # Create a mean composite image of the collection
    composite = sentinel2.mean()
    
    indices = ['NDVI', 'NDMI', 'BSI', 'SOCI']
    stats = {}

    for index in indices:
        # Use reduceRegion on the composite image
        time_series = composite.select(index).reduceRegion(
            reducer=ee.Reducer.toList(),
            geometry=point.buffer(60),
            scale=10,
            maxPixels=1e6
        ).get(index).getInfo()
        
        # Calculate statistics
        mean_value = np.nanmean(time_series)
        stddev_value = np.nanstd(time_series)
        trend_value = calculate_trend(time_series)
        
        stats[f'{index}_mean'] = mean_value
        stats[f'{index}_stdDev'] = stddev_value
        stats[f'{index}_trend'] = trend_value
    
    return stats

# Process each row in the CSV file and append results
results = []
for _, row in data.iterrows():
    date = row['sample_date']
    lat = row['lat']
    lon = row['long']
    stats = fetch_statistics(lat, lon, date)
    stats['date'] = date
    stats['lat'] = lat
    stats['long'] = lon
    results.append(stats)

# Convert to DataFrame and save to CSV
df_results = pd.DataFrame(results)
df_results.to_csv('/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/indices_lucas.csv', index=False)


In [13]:
#new version, higher resolution, more efficient cloud masks / threshold, quarterly instead of monthly and using batches to run faster
import ee
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from concurrent.futures import ProcessPoolExecutor

# Initialize Earth Engine
# Define service account and key file path
SERVICE_ACCOUNT = ''
KEY_PATH = ''
# Authenticate and initialize Earth Engine API
EE_CREDENTIALS = ee.ServiceAccountCredentials(SERVICE_ACCOUNT, KEY_PATH)
ee.Initialize(EE_CREDENTIALS)

# Cloud masking function
def mask_clouds_sentinel2(image):
    scl = image.select('SCL')
    cloud_mask = scl.eq(3).Or(scl.eq(4)).Or(scl.eq(5)).Or(scl.eq(6))
    return image.updateMask(cloud_mask)

# Indices calculation function
def calculate_indices_sentinel2(image):
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')
    ndmi = image.normalizedDifference(['B8', 'B11']).rename('NDMI')
    bsi = image.expression(
        '((SWIR + RED) - (NIR + BLUE)) / ((SWIR + RED) + (NIR + BLUE))', {
            'SWIR': image.select('B11'),
            'RED': image.select('B4'),
            'NIR': image.select('B8'),
            'BLUE': image.select('B2')
        }).rename('BSI')
    soci = image.select('B2').divide(image.select('B3').multiply(image.select('B4'))).rename('SOCI')
    return image.addBands([ndvi, ndmi, bsi, soci])

# Quarterly data retrieval function
def fetch_quarterly_composites(lat, lon, sample_date):
    end_date = datetime.strptime(sample_date, '%Y-%m-%d')
    start_date = end_date - timedelta(days=5*365)
    date_ranges = pd.date_range(start=start_date, end=end_date, freq='QS')
    
    point = ee.Geometry.Point(lon, lat)
    data = []
    
    for start in date_ranges:
        sentinel2 = (ee.ImageCollection('COPERNICUS/S2_SR')
                     .filterDate(start.strftime('%Y-%m-%d'), (start + pd.DateOffset(months=3)).strftime('%Y-%m-%d'))
                     .filterBounds(point)
                     .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 60))  # 60% cloud threshold
                     .map(mask_clouds_sentinel2)
                     .map(calculate_indices_sentinel2))
        
        # Quarterly mean composite
        quarterly_composite = sentinel2.mean()  # Mean composite

        # Sample the indices at the location
        result = quarterly_composite.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point.buffer(60),  # 60m buffer
            scale=30  # 30m resolution
        ).getInfo()
        
        if result:
            result['date'] = start.strftime('%Y-%m-%d')
            data.append(result)
    
    return pd.DataFrame(data)

# Function to process a single batch of locations
def process_batch(batch, batch_index):
    results = []
    for _, row in batch.iterrows():
        date = row['date']
        lat = row['lat']
        lon = row['long']
        quarterly_data = fetch_quarterly_composites(lat, lon, date)
        quarterly_data['lat'] = lat
        quarterly_data['long'] = lon
        results.append(quarterly_data)
    
    # Combine results for this batch and save to a separate file
    batch_df = pd.concat(results, ignore_index=True)
    batch_df.to_csv(f'/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/sentinel_2/batches/output_batch_{batch_index}.csv', index=False)
    print(f"Batch {batch_index} processed and saved.")

# Load the initial CSV with 19,000 locations and set up checkpointing
data = pd.read_csv('/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/date_location_lucas.csv')
batch_size = 500
batches = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]

# Check if a checkpoint file exists to resume from a specific batch
try:
    with open("checkpoint.txt", "r") as f:
        start_batch = int(f.read().strip()) + 1  # Start from the next batch
except FileNotFoundError:
    start_batch = 0  # Start from the first batch if no checkpoint exists

# Run batches in parallel and save progress to checkpoint file after each batch
def run_batches_with_checkpointing():
    with ProcessPoolExecutor() as executor:
        for i, batch in enumerate(batches[start_batch:], start=start_batch):
            future = executor.submit(process_batch, batch, i)
            future.result()  # Wait for batch to complete
            
            # Save the completed batch index to the checkpoint file
            with open("checkpoint.txt", "w") as f:
                f.write(str(i))
            print(f"Checkpoint updated. Batch {i} completed.")

run_batches_with_checkpointing()

# Combine all batch files into a single CSV after processing is complete
all_batches = [pd.read_csv(f'/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/sentinel_2/batches/output_batch_{i}.csv') for i in range(len(batches))]
final_df = pd.concat(all_batches, ignore_index=True)
final_df.to_csv('/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/sentinel_2/indices_lucas.csv', index=False)
print("All batches combined and saved to final_output.csv.")


Process SpawnProcess-1:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.12/concurrent/futures/process.py", line 251, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_batch' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.