In [125]:
import pandas as pd
import os
import glob

# Directory where the CSV files are located
csv_dir = '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices'  # Replace with your path

# List to store all dataframes
df_list = []

# Function to extract the identifier from the filename
def extract_identifier(filename):
    base_name = os.path.basename(filename).replace('.csv', '')  # Remove the .csv extension
    year = base_name.split('_')[-2]
    month = int(base_name.split('_')[-1])  # Convert month directly to int
    quarter = (month - 1) // 3 + 1  # Calculate the quarter
    return f"{year}_Q{quarter}"

# Read all CSV files, add the identifier column, and append to the list
for file in glob.glob(os.path.join(csv_dir, '*.csv')):
    df = pd.read_csv(file)
    df['quarter'] = extract_identifier(file)
    df_list.append(df)

# Concatenate all dataframes
final_df = pd.concat(df_list, axis=0, join='outer', ignore_index=True)

# Display the structure of the final DataFrame
final_df.head()


Unnamed: 0,system:index,BSI,NDMI,NDVI,SOCI,sample_date,.geo,quarter,lat,long
0,0,-0.020134,0.071468,0.085867,6.9e-05,2018-12-20,"{""type"":""Point"",""coordinates"":[22.72922449,45....",2018_Q1,,
1,1,-0.001685,0.063842,0.098104,6.1e-05,2018-12-19,"{""type"":""Point"",""coordinates"":[28.198854880000...",2018_Q1,,
2,2,-0.115698,0.341321,0.018842,5.1e-05,2018-12-19,"{""type"":""Point"",""coordinates"":[28.333738630000...",2018_Q1,,
3,3,-0.09499,0.273026,0.047869,5.6e-05,2018-12-19,"{""type"":""Point"",""coordinates"":[23.314151240000...",2018_Q1,,
4,4,0.011718,0.078319,0.130311,6.4e-05,2018-12-19,"{""type"":""Point"",""coordinates"":[28.181462420000...",2018_Q1,,


In [127]:
# Ensure .geo column is treated as a string and remove any extra whitespace
final_df['.geo'] = final_df['.geo'].astype(str).str.strip()

In [129]:
# Dictionary to store unique locations per file
location_counts = {}

# Check unique locations in each file
for file in glob.glob(os.path.join(csv_dir, '*.csv')):
    df = pd.read_csv(file)
    unique_locs = df['.geo'].astype(str).nunique()  # Unique locations in this file
    location_counts[file] = unique_locs

# Display the unique location count per file to find discrepancies
print("Unique location counts per file:")
print(location_counts)

Unique location counts per file:
{'/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2018_01.csv': 10931, '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2016_10.csv': 10931, '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2016_04.csv': 10931, '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_01.csv': 18984, '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2016_07.csv': 18984, '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2018_07.csv': 10931, '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_07.csv': 18984, '/Users/maxson

In [133]:
import pandas as pd

# Load one of the correctly formatted files to get the standard 10,931 locations
reference_file = '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2018_01.csv'
reference_df = pd.read_csv(reference_file)
reference_df['.geo'] = reference_df['.geo'].astype(str).str.strip()  # Standardize `.geo` values
standard_locations = set(reference_df['.geo'].unique())

print(f"Standard number of unique locations: {len(standard_locations)}")  # Should be 10,931


Standard number of unique locations: 10931


In [137]:
import os

# List of affected files with extra locations
affected_files = [
    '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_01.csv',
    '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_04.csv',
    '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_07.csv',
    '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_10.csv',
    '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2016_07.csv'
]

# Process each affected file
for file in affected_files:
    # Load the file and standardize `.geo` values
    df = pd.read_csv(file)
    df['.geo'] = df['.geo'].astype(str).str.strip()
    
    # Filter to keep only standard locations
    df_filtered = df[df['.geo'].isin(standard_locations)]
    
    # Save the filtered data back to a new CSV file
    output_file = os.path.join('/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/filtered', os.path.basename(file))
    df_filtered.to_csv(output_file, index=False)
    
    # Confirm the number of unique locations after filtering
    print(f"{file}: {df_filtered['.geo'].nunique()} unique locations (after filtering)")


/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_01.csv: 10931 unique locations (after filtering)
/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_04.csv: 10931 unique locations (after filtering)
/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_07.csv: 10931 unique locations (after filtering)
/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2014_10.csv: 10931 unique locations (after filtering)
/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2016_07.csv: 9893 unique locations (after filtering)


In [139]:
import pandas as pd
import os
import glob

# Directory where the CSV files are located
csv_dir = '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices' 

# List of files with higher-than-expected unique locations
affected_files = [
    'output_all_locations_2014_01.csv',
    'output_all_locations_2014_04.csv',
    'output_all_locations_2014_07.csv',
    'output_all_locations_2014_10.csv',
    'output_all_locations_2016_07.csv'
]

# Check for duplicates in each affected file
for file_name in affected_files:
    file_path = os.path.join(csv_dir, file_name)
    df = pd.read_csv(file_path)
    
    # Standardize `.geo` values
    df['.geo'] = df['.geo'].astype(str).str.strip()
    
    # Find duplicate `.geo` values
    duplicate_geo = df[df.duplicated(subset=['.geo'], keep=False)]
    
    # Check if duplicates are identical or contain differing values
    if not duplicate_geo.empty:
        identical_duplicates = duplicate_geo[duplicate_geo.duplicated(keep='first')]
        differing_duplicates = duplicate_geo.drop(identical_duplicates.index)
        
        print(f"\nFile: {file_name}")
        print(f"Total duplicates: {len(duplicate_geo['.geo'].unique())}")
        print(f"Identical duplicates: {len(identical_duplicates['.geo'].unique())}")
        print(f"Differing duplicates: {len(differing_duplicates['.geo'].unique())}")
        
        # Show sample of differing duplicates, if any
        if not differing_duplicates.empty:
            print("\nSample of differing duplicate entries:")
            print(differing_duplicates.groupby('.geo').head())
    else:
        print(f"\nFile: {file_name} has no duplicates.")



File: output_all_locations_2014_01.csv has no duplicates.

File: output_all_locations_2014_04.csv has no duplicates.

File: output_all_locations_2014_07.csv has no duplicates.

File: output_all_locations_2014_10.csv has no duplicates.

File: output_all_locations_2016_07.csv has no duplicates.


In [143]:
import pandas as pd
import os
import glob

# Directory where the CSV files are located (all non-affected files were also copied to the filtered folder)
csv_dir = '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/filtered'  # Adjust if needed

# Load the standard set of locations from a correctly formatted file
reference_file = '/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/quarterly_comps_indices/output_all_locations_2018_01.csv'
reference_df = pd.read_csv(reference_file)
reference_df['.geo'] = reference_df['.geo'].astype(str).str.strip()
standard_locations = set(reference_df['.geo'].unique())

# Function to extract the identifier from the filename
def extract_identifier(filename):
    base_name = os.path.basename(filename).replace('.csv', '')  # Remove the .csv extension
    year = base_name.split('_')[-2]
    month = int(base_name.split('_')[-1])  # Convert month directly to int
    quarter = (month - 1) // 3 + 1  # Calculate the quarter
    return f"{year}_Q{quarter}"

# List to store each DataFrame
df_list = []

# Read each file, add the quarter identifier, and append to the list
for file in glob.glob(os.path.join(csv_dir, '*.csv')):
    df = pd.read_csv(file)
    df['.geo'] = df['.geo'].astype(str).str.strip()  # Standardize `.geo` values
    df['quarter'] = extract_identifier(file)
    
    # Filter to include only standard locations
    df = df[df['.geo'].isin(standard_locations)]
    df_list.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(df_list, axis=0, join='outer', ignore_index=True)

# Check the shape and number of unique `.geo` entries after filtering
print(f"Total rows in concatenated data: {len(final_df)}")
print(f"Unique locations in concatenated data (after strict filtering): {final_df['.geo'].nunique()}")


Total rows in concatenated data: 214468
Unique locations in concatenated data (after strict filtering): 10931


In [145]:
#aggregate mean, std and trend
from scipy.stats import linregress

# Function to calculate trend across quarters
def calculate_trend(series):
    series = series.dropna()
    if len(series) > 1:
        x = range(len(series))
        slope, _, _, _, _ = linregress(x, series)
        return slope
    else:
        return None

# Group by location and calculate mean, std, and trend across quarters for each index
grouped = final_df.groupby('.geo').agg({
    'NDVI': ['mean', 'std', calculate_trend],
    'NDMI': ['mean', 'std', calculate_trend],
    'BSI': ['mean', 'std', calculate_trend],
    'SOCI': ['mean', 'std', calculate_trend]
})

# Flatten multi-index columns for easier access
grouped.columns = ['_'.join(col).replace('calculate_trend', 'trend') for col in grouped.columns.values]
agg_df = grouped.reset_index()

# Display the resulting DataFrame
print("Aggregated DataFrame:")
print(agg_df.head())
print("Missing values in the final aggregated DataFrame:")
print(agg_df.isna().sum())


Aggregated DataFrame:
                                                .geo  NDVI_mean  NDVI_std  \
0  {"type":"Point","coordinates":[-0.003159007999...   0.134840  0.054608   
1  {"type":"Point","coordinates":[-0.003219756,52...   0.239838  0.057186   
2  {"type":"Point","coordinates":[-0.004928459000...   0.233253  0.088611   
3  {"type":"Point","coordinates":[-0.006392204000...   0.295358  0.094453   
4  {"type":"Point","coordinates":[-0.011575600999...   0.234918  0.054999   

   NDVI_trend  NDMI_mean  NDMI_std  NDMI_trend  BSI_mean   BSI_std  BSI_trend  \
0   -0.002187  -0.022124  0.048395   -0.002115  0.059456  0.032955   0.001738   
1   -0.002217   0.132141  0.045441    0.001040 -0.063177  0.027550  -0.000641   
2    0.000396   0.099711  0.069891    0.002635 -0.037607  0.056264  -0.002425   
3    0.005872   0.186234  0.069075   -0.000386 -0.102043  0.048325  -0.000171   
4   -0.002956   0.132636  0.035401   -0.000529 -0.059274  0.025155   0.000839   

   SOCI_mean  SOCI_std    SO

In [151]:
import json
import pandas as pd

# Function to extract latitude and longitude from GeoJSON data
def extract_lat_long(geo_json_str):
    try:
        geo_data = json.loads(geo_json_str)
        # Assuming GeoJSON is of type Point: {"type": "Point", "coordinates": [longitude, latitude]}
        if geo_data['type'] == 'Point':
            longitude, latitude = geo_data['coordinates']
            return latitude, longitude
        else:
            return None, None  # In case it’s not a Point type
    except (json.JSONDecodeError, KeyError, TypeError):
        return None, None  # In case of any parsing issues

# Apply the function to extract lat and long and create new columns in agg_df
agg_df['lat'], agg_df['long'] = zip(*agg_df['.geo'].apply(extract_lat_long))

agg_df

Unnamed: 0,.geo,NDVI_mean,NDVI_std,NDVI_trend,NDMI_mean,NDMI_std,NDMI_trend,BSI_mean,BSI_std,BSI_trend,SOCI_mean,SOCI_std,SOCI_trend,lat,long
0,"{""type"":""Point"",""coordinates"":[-0.003159007999...",0.134840,0.054608,-0.002187,-0.022124,0.048395,-0.002115,0.059456,0.032955,0.001738,0.000059,0.000010,-5.068301e-07,41.411819,-0.003159
1,"{""type"":""Point"",""coordinates"":[-0.003219756,52...",0.239838,0.057186,-0.002217,0.132141,0.045441,0.001040,-0.063177,0.027550,-0.000641,0.000078,0.000015,-3.556697e-07,52.751181,-0.003220
2,"{""type"":""Point"",""coordinates"":[-0.004928459000...",0.233253,0.088611,0.000396,0.099711,0.069891,0.002635,-0.037607,0.056264,-0.002425,0.000079,0.000013,-4.581567e-07,42.011973,-0.004928
3,"{""type"":""Point"",""coordinates"":[-0.006392204000...",0.295358,0.094453,0.005872,0.186234,0.069075,-0.000386,-0.102043,0.048325,-0.000171,0.000077,0.000013,8.747049e-07,47.966599,-0.006392
4,"{""type"":""Point"",""coordinates"":[-0.011575600999...",0.234918,0.054999,-0.002956,0.132636,0.035401,-0.000529,-0.059274,0.025155,0.000839,0.000068,0.000009,-4.444866e-07,43.792581,-0.011576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10926,"{""type"":""Point"",""coordinates"":[9.984522829,54....",0.271837,0.117279,0.006997,0.206005,0.071890,-0.001006,-0.109503,0.031705,-0.001009,0.000074,0.000019,9.487893e-07,54.624020,9.984523
10927,"{""type"":""Point"",""coordinates"":[9.985930727,50....",0.208370,0.102867,-0.002781,0.174325,0.076662,0.003807,-0.080351,0.046170,-0.002301,0.000064,0.000022,-9.973288e-07,50.417982,9.985931
10928,"{""type"":""Point"",""coordinates"":[9.9866519580000...",0.313676,0.129995,-0.002506,0.232764,0.048592,0.000028,-0.139178,0.026784,0.000284,0.000076,0.000017,-2.862957e-07,47.827606,9.986652
10929,"{""type"":""Point"",""coordinates"":[9.986707895,47....",0.295762,0.132434,-0.000806,0.245628,0.067698,0.001888,-0.138162,0.025542,-0.000944,0.000074,0.000018,-3.367315e-07,47.611616,9.986708


In [167]:
del agg_df['.geo']

In [165]:
# Reset the index and drop the old index (this creates a default integer index)
agg_df = agg_df.reset_index(drop=True)

# Add `loc_id` column based on the current index
agg_df['loc_id'] = agg_df.index

# Set `loc_id` as the index
agg_df = agg_df.set_index('loc_id')

# Reorder columns to place `lat`, `long` at the front
columns_order = ['lat', 'long'] + [col for col in agg_df.columns if col not in ['lat', 'long']]
agg_df = agg_df[columns_order]

# Verify the structure
agg_df.head()

Unnamed: 0_level_0,lat,long,.geo,NDVI_mean,NDVI_std,NDVI_trend,NDMI_mean,NDMI_std,NDMI_trend,BSI_mean,BSI_std,BSI_trend,SOCI_mean,SOCI_std,SOCI_trend
loc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,41.411819,-0.003159,"{""type"":""Point"",""coordinates"":[-0.003159007999...",0.13484,0.054608,-0.002187,-0.022124,0.048395,-0.002115,0.059456,0.032955,0.001738,5.9e-05,1e-05,-5.068301e-07
1,52.751181,-0.00322,"{""type"":""Point"",""coordinates"":[-0.003219756,52...",0.239838,0.057186,-0.002217,0.132141,0.045441,0.00104,-0.063177,0.02755,-0.000641,7.8e-05,1.5e-05,-3.556697e-07
2,42.011973,-0.004928,"{""type"":""Point"",""coordinates"":[-0.004928459000...",0.233253,0.088611,0.000396,0.099711,0.069891,0.002635,-0.037607,0.056264,-0.002425,7.9e-05,1.3e-05,-4.581567e-07
3,47.966599,-0.006392,"{""type"":""Point"",""coordinates"":[-0.006392204000...",0.295358,0.094453,0.005872,0.186234,0.069075,-0.000386,-0.102043,0.048325,-0.000171,7.7e-05,1.3e-05,8.747049e-07
4,43.792581,-0.011576,"{""type"":""Point"",""coordinates"":[-0.011575600999...",0.234918,0.054999,-0.002956,0.132636,0.035401,-0.000529,-0.059274,0.025155,0.000839,6.8e-05,9e-06,-4.444866e-07


In [171]:
# Save the final dataframe to a CSV file
agg_df.to_csv('/Users/maxsonntag/Documents/GitHub/SOC_predictor/data/satellite_data/landsat_indices_agged.csv', index=False)