In [None]:
import pandas as pd
from pathlib import Path

In [3]:
stats_dir = Path("/dss/dsstbyfs02/pn49cu/pn49cu-dss-0006/Sen12Landslides/tests/stats")

In [34]:
SATELLITE = "s1dsc_raw"
df = pd.read_csv(stats_dir / f"{SATELLITE}_stats.csv")

In [35]:
def extract_inventory(file_path):
    """
    Split the file name (assumed to be the last part of the path) on underscores.
    Use the first item unless the first item is 'usa' (case-insensitive), then use the first two.
    """
    # Extract the file name from a path (if needed)
    file_name = file_path.split('/')[-1]  # adjust if your file col is just the name
    parts = file_name.split('_')
    if parts[0].lower() == 'usa' and len(parts) >= 2:
        return '_'.join(parts[:2])
    else:
        return parts[0]

In [36]:
df['inventory'] = df['file'].apply(extract_inventory)
df


Unnamed: 0,file,variable,mean,std,min,max,p1,p99,inventory
0,chimanimani_s1dsc_1000,VH,0.041324,0.023640,0.000092,0.313566,0.006729,0.117842,chimanimani
1,chimanimani_s1dsc_1000,VV,0.159791,0.089406,0.005678,1.000000,0.034633,0.459131,chimanimani
2,chimanimani_s1dsc_1000,DEM,1123.133900,38.414238,1031.000000,1195.000000,1041.000000,1192.000000,chimanimani
3,chimanimani_s1dsc_1001,VH,0.043825,0.025878,0.000362,0.447135,0.007489,0.129908,chimanimani
4,chimanimani_s1dsc_1001,VV,0.178488,0.105135,0.005028,1.000000,0.036030,0.534934,chimanimani
...,...,...,...,...,...,...,...,...,...
45847,newzealand_s1dsc_998,VV,0.103260,0.066750,0.001244,1.000000,0.017375,0.339504,newzealand
45848,newzealand_s1dsc_998,DEM,362.178280,41.772778,275.000000,481.000000,278.000000,453.000000,newzealand
45849,newzealand_s1dsc_999,VH,0.019839,0.016043,0.000005,0.228592,0.001036,0.076388,newzealand
45850,newzealand_s1dsc_999,VV,0.088335,0.068414,0.000342,1.000000,0.013462,0.343398,newzealand


In [37]:
inventory_without_dem = df[df['variable'] != 'DEM']

In [38]:
inventory_stats = inventory_without_dem.groupby('inventory').agg({
    'mean': ['mean', 'std'],
    'std': ['mean', 'std'],
    'min': ['min', 'max'],
    'max': ['min', 'max'],
    'p1': ['mean', 'std'],
    'p99': ['mean', 'std']
}).reset_index()

In [39]:
print("Inventory-level summary statistics:")
inventory_stats.to_csv(stats_dir / f"{SATELLITE}_inventory_stats.csv", index=False)
inventory_stats

Inventory-level summary statistics:


Unnamed: 0_level_0,inventory,mean,mean,std,std,min,min,max,max,p1,p1,p99,p99
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,min,max,min,max,mean,std,mean,std
0,chimanimani,0.095897,0.061477,0.060057,0.038726,0.0,0.015683,0.166256,1.0,0.017246,0.013752,0.306301,0.205084
1,china,0.13202,0.0878,0.123242,0.076178,0.0,0.011547,0.299035,1.0,0.009928,0.010716,0.604795,0.355944
2,hiroshima,0.102481,0.092835,0.085168,0.07095,0.0,0.013557,0.010011,1.0,0.013339,0.014452,0.419492,0.338242
3,hokkaido,0.097037,0.067278,0.075097,0.052122,0.0,0.007963,0.014235,1.0,0.013014,0.011678,0.387116,0.289095
4,indonesia,0.145263,0.092937,0.094904,0.061157,0.0,0.014655,0.019942,1.0,0.023762,0.016973,0.494112,0.330258
5,italy,0.095829,0.07126,0.072738,0.054727,0.0,0.019748,0.013607,1.0,0.014383,0.013183,0.366603,0.281429
6,itogon,0.125459,0.077554,0.098275,0.058779,0.0,0.012451,0.430003,1.0,0.015723,0.011239,0.511185,0.316913
7,kyrgyzstan1,0.076913,0.061694,0.071567,0.057333,0.0,0.013103,0.014337,1.0,0.008089,0.008937,0.371603,0.315768
8,kyrgyzstan2,0.065938,0.051834,0.066397,0.052491,0.0,0.006423,0.109252,1.0,0.005538,0.005919,0.348846,0.299678
9,lanaodelnorte,0.127843,0.103256,0.081167,0.06243,0.0,0.015526,0.021607,1.0,0.024069,0.02166,0.430386,0.342434


In [40]:
inventory_global = df.groupby('inventory')['mean'].agg(['mean', 'std']).reset_index()
inventory_global = inventory_global.rename(columns={'mean': 'global_mean', 'std': 'global_std'})

# Merge back with the original dataframe
df_merge = pd.merge(df, inventory_global, on='inventory', how='left')

# Now compute the z-score for each file's mean value.
df_merge['mean_zscore'] = (df_merge['mean'] - df_merge['global_mean']) / df_merge['global_std']

# Print rows where the z-score is above a threshold (say, abs(z) > 3)
anomalies = df_merge[df_merge['mean_zscore'].abs() > 3]
print("Anomalies based on mean z-scores (|z| > 3):")
anomalies.to_csv(stats_dir / f"{SATELLITE}_anomalies.csv", index=False)
anomalies

Anomalies based on mean z-scores (|z| > 3):


Unnamed: 0,file,variable,mean,std,min,max,p1,p99,inventory,global_mean,global_std,mean_zscore
2858,china_s1dsc_23,DEM,4326.4766,211.902910,3931.0,4790.0,3957.0,4767.0,china,711.786003,1179.313824,3.065079
2879,china_s1dsc_25,DEM,4411.4746,206.647600,3980.0,4828.0,4006.0,4807.0,china,711.786003,1179.313824,3.137154
3185,china_s1dsc_465,DEM,4292.7150,157.843220,3863.0,4641.0,3926.0,4583.0,china,711.786003,1179.313824,3.036451
3230,china_s1dsc_514,DEM,4275.9062,195.633100,3740.0,4724.0,3818.0,4670.0,china,711.786003,1179.313824,3.022198
3290,china_s1dsc_560,DEM,4619.1050,134.756330,4334.0,4906.0,4366.0,4892.0,china,711.786003,1179.313824,3.313214
...,...,...,...,...,...,...,...,...,...,...,...,...
45455,newzealand_s1dsc_2579,DEM,1274.2828,94.307570,1028.0,1470.0,1066.0,1456.0,newzealand,189.269261,321.052849,3.379548
45665,newzealand_s1dsc_2754,DEM,1201.2102,102.270584,941.0,1433.0,977.0,1400.0,newzealand,189.269261,321.052849,3.151945
45668,newzealand_s1dsc_2755,DEM,1249.9381,97.690315,994.0,1464.0,1019.0,1440.0,newzealand,189.269261,321.052849,3.303720
45821,newzealand_s1dsc_923,DEM,1286.0248,92.102270,1068.0,1492.0,1095.0,1476.0,newzealand,189.269261,321.052849,3.416121
