In [140]:
import netCDF4

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.patches as mpatches

import numpy as np
import pandas as pd

import geopandas as gpd
import pycaret

from datetime import timezone
UTC = timezone.utc

from tqdm import tqdm
from shapely.geometry import Point
from tqdm import tqdm
from shapely.geometry import Point
from sklearn.decomposition import PCA
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [141]:
world = gpd.read_file("./data/land_vs_water_country_borders/ne_110m_admin_0_countries.shp")

In [142]:
surface_type_dict = {
    -1:"Ocean",
    0:"NaN",
    1:"Artifical",
    2:"Barely vegetated",
    3:"Inland water",
    4:"Crop",
    5:"Grass",
    6:"Shrub",
    7:"Forest"
}

ddm_antennas = {
    0: 'None',
    1: 'Zenith',
    2: 'LHCP',
    3: 'RHCP',
}

In [None]:

def add_seconds(time, seconds):
    timestamp = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
    new_timestamp = timestamp + timedelta(seconds=seconds)
    return new_timestamp.strftime("%Y-%m-%d %H:%M:%S")

def is_land(lat, lon):
    """Restituisce True se il punto è su terra, False se è in mare/oceano"""
    point = Point(lon, lat)  # Geopandas usa (lon, lat), non (lat, lon)
    return any(world.contains(point))

def check_ocean_and_land(lst):
    has_ocean = -1 in lst
    has_land = any(1 <= num <= 7 for num in lst)
    #if has_ocean and has_land:
    #    print(f"Sample {lst} has at least one ocean and one land point")
    return has_ocean and has_land

def fill_and_filter(arr):
    mask_all_nan = np.all(np.isnan(arr), axis=(2, 3))  # Shape (N, M), True se la matrice è tutta NaN

    arr_filled = arr.copy()  # Facciamo una copia per non modificare l'originale

    for i in range(arr.shape[0]):  # Iteriamo sui campioni
        nan_indices = np.where(mask_all_nan[i])[0]  # Indici delle matrici M completamente NaN
        if len(nan_indices) > 0:  
            # Troviamo una matrice valida da cui prendere la media
            valid_indices = np.where(~mask_all_nan[i])[0]  
            if len(valid_indices) > 0:
                mean_matrix = np.nanmean(arr[i, valid_indices, :, :], axis=0)  # Media sulle matrici valide
                arr_filled[i, nan_indices, :, :] = mean_matrix  # Riempimento
            # Se tutte le matrici M di un campione sono NaN, lo segneremo per l'eliminazione

    # Identificare i campioni da eliminare (se TUTTE le matrici M sono NaN)
    mask_discard = np.all(mask_all_nan, axis=1)  
    arr_filtered = arr_filled[~mask_discard]  # Manteniamo solo i campioni validi

    return arr_filtered, list(np.where(mask_discard.astype(int) == 1)[0])

In [144]:
def preprocess(f):
    
    raw_counts = f.variables['raw_counts']
    raw_counts = np.array(raw_counts)

    # Calcolo distanza tra il punto speculare e l'aereo
    ac_alt_2d = np.repeat(np.array(f.variables['ac_alt'])[:, np.newaxis], 20, axis=1)
    distance_2d = (ac_alt_2d - f.variables['sp_alt'][:]) / np.cos(np.deg2rad(f.variables['sp_inc_angle'][:]))

    # Seleziona gli indici dove sp_rx_gain_copol > 5, sp_rx_gain_xpol > 5 e ddm_snr > 0 e distanza tra punto speculare e antenna > 2000 e < 10000
    copol = f.variables['sp_rx_gain_copol'][:]
    xpol = f.variables['sp_rx_gain_xpol'][:]
    snr = f.variables['ddm_snr'][:]
    dist = distance_2d[:]

    keep_mask = (copol >= 5) & (xpol >= 5) & (snr > 0) & ((dist >= 2000) & (dist <= 10000)) & (~np.isnan(copol.data) & ~np.isnan(xpol.data) & ~np.isnan(snr.data) & ~np.isnan(dist.data))
    to_keep_indices = np.argwhere(keep_mask)
    #discard_indices =  np.argwhere(~keep_mask)


    filtered_raw_counts = [raw_counts[i, j] for i, j in to_keep_indices]
    #filtered_raw_counts_arr = np.array(filtered_raw_counts)

    output_array = np.full(raw_counts.shape, np.nan, dtype=np.float32)

    # Inserisci i dati filtrati nelle posizioni di to_keep_indices
    for idx, (i, j) in enumerate(to_keep_indices):
        output_array[i, j] = filtered_raw_counts[idx]

    raw_counts_filtered = output_array.copy()

    ddm_data_dict = {
        #'L1a Power DDM': l1a_power_ddm,
        #'BRCS': brcs,
        'Raw_Counts': raw_counts_filtered.reshape(raw_counts_filtered.shape[0]*raw_counts_filtered.shape[1], raw_counts_filtered.shape[2], raw_counts_filtered.shape[3]),
        #'Effective Scatter': eff_scatter,
        #'Surface Reflectivity': surface_reflectivity
    }
    keep_indices = np.where(
        np.all(~np.isnan(ddm_data_dict['Raw_Counts']), axis=(1, 2)) & (np.sum(ddm_data_dict['Raw_Counts'], axis=(1, 2)) > 0)
    )[0] # remove nan ddms and ddms which are all zeros (esp. for raw counts)
    fit_data = np.array([ddm_data_dict['Raw_Counts'][f].ravel() for f in keep_indices])

    surface_types = f.variables["sp_surface_type"][:]
    surface_types = np.nan_to_num(surface_types, nan=0)

    surface_types_unravelled = surface_types.ravel()

    # Land vs Water labels (same for all variables)
    label_data = surface_types_unravelled
    label_data = [label_data[l] for l in range(len(label_data)) if l in keep_indices]

    return fit_data, label_data

In [145]:
import os
ROOT_DIR = './sample_data/'
netcdf_file_list = os.listdir(ROOT_DIR)

f = netCDF4.Dataset(f'{ROOT_DIR}{netcdf_file_list[3]}')

In [146]:
fit_data, label_data = preprocess(f)

In [147]:
fit_data.shape

(4777, 200)

In [148]:
full_data = []
full_labels = []
for file_name in tqdm(netcdf_file_list, desc="Processing files"):
    if not file_name.endswith('.nc'):
        continue
    f = netCDF4.Dataset(f'{ROOT_DIR}{file_name}')
    data, labels = preprocess(f)
    full_data.append(data)
    full_labels.append(labels)
    #print(f"Processed {file_name}")

Processing files: 100%|██████████| 10/10 [00:11<00:00,  1.14s/it]


In [149]:
full_data = np.vstack(full_data)
full_labels = np.hstack(full_labels)

In [150]:
full_data.shape, full_labels.shape

((54244, 200), (54244,))

In [151]:
model_search = True

import os
os.environ["PYCARET_CUSTOM_LOGGING_LEVEL"] = "CRITICAL"

if model_search:
    scaler = MinMaxScaler()
    #fit_data_scaled, discard_indices = fill_and_filter(ddm_data_dict['Raw_Counts'])
    #fit_data_scaled = fit_data_scaled.reshape(fit_data_scaled.shape[0], -1)
    fit_data_scaled = scaler.fit_transform(full_data)


    # Land vs Water labels (same for all variables)
    #label_data = [1 if is_land(lat, lon) else 0 for lat, lon in zip(lats, lons)]
    #label_data = [label_data[l] for l in range(len(label_data)) if l not in discard_indices]


    from pycaret.classification import *
    clf = setup(data=fit_data_scaled, 
            target=full_labels, 
            pca=True,
            pca_method='incremental',
            use_gpu=True    
            )             

    # Confronto di vari modelli
    best_models = compare_models(n_select=5)  # Seleziona i 5 migliori modelli

    # Estrazione del modello migliore
    best_model = best_models[0]
    print(f"Il modello migliore è: {best_model}")

    # Tuning degli iperparametri per il modello migliore
    tuned_model = tune_model(best_model, 
                            optimize='Accuracy',     
                            n_iter=10,          
                            search_library='optuna', 
                            search_algorithm='tpe', 
                            choose_better=True)    

    print("Valutazione del modello ottimizzato:")
    evaluate_model(tuned_model)

    # Stampa i parametri migliori del modello
    best_params = tuned_model.get_params()
    print("Migliori iperparametri trovati:")
    for param, value in best_params.items():
        print(f"{param}: {value}")

    # Finalizzazione del modello
    final_model = finalize_model(tuned_model)

    # Salvataggio del modello
    save_model(final_model, 'best_classification_model')

    # Per caricare il modello in futuro:
    # loaded_model = load_model('best_classification_model')

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A500 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A500 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.00000

Unnamed: 0,Description,Value
0,Session id,5286
1,Target,target
2,Target type,Multiclass
3,Target mapping,"-1.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 5.0: 5, 6.0: 6, 7.0: 7"
4,Original data shape,"(54244, 201)"
5,Transformed data shape,"(54244, 201)"
6,Transformed train set shape,"(37970, 201)"
7,Transformed test set shape,"(16274, 201)"
8,Numeric features,200
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A500 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A500 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.00000

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.944,0.8449,0.944,0.9298,0.9344,0.6459,0.6538,8.983
lr,Logistic Regression,0.9408,0.0,0.9408,0.9177,0.9263,0.6083,0.6228,13.643
rf,Random Forest Classifier,0.9393,0.8676,0.9393,0.9185,0.9235,0.577,0.6016,23.347
svm,SVM - Linear Kernel,0.9356,0.0,0.9356,0.9086,0.9182,0.5447,0.5717,7.402
qda,Quadratic Discriminant Analysis,0.9275,0.0,0.9275,0.9031,0.9128,0.5012,0.5199,8.426
dt,Decision Tree Classifier,0.9151,0.7632,0.9151,0.9178,0.9164,0.5342,0.5343,22.923
ridge,Ridge Classifier,0.9142,0.0,0.9142,0.8725,0.8826,0.2357,0.3251,7.208
ada,Ada Boost Classifier,0.8904,0.0,0.8904,0.8564,0.8689,0.1936,0.2062,57.442
nb,Naive Bayes,0.8564,0.8411,0.8564,0.9146,0.8803,0.3404,0.3525,6.79


Processing:   0%|          | 0/73 [00:00<?, ?it/s]

KeyboardInterrupt: 