In [None]:
import netCDF4
import numpy as np
import pandas as pd
import geopandas as gpd
import pycaret
from datetime import timezone, datetime, timedelta
from tqdm import tqdm
from shapely.geometry import Point
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

from pycaret.classification import *


class SurfaceTypeUtils:
    """Static lookup tables that translate integer codes into readable names."""

    # Surface-type code -> name, for codes -1 through 7 in ascending order.
    # The spelling "Artifical" is kept as-is because it is a runtime label.
    # NOTE(review): presumably these are the codes stored in `sp_surface_type` — confirm.
    surface_type_dict = dict(zip(
        range(-1, 8),
        (
            "Ocean",
            "NaN",
            "Artifical",
            "Barely vegetated",
            "Inland water",
            "Crop",
            "Grass",
            "Shrub",
            "Forest",
        ),
    ))

    # Antenna code -> antenna name, for codes 0 through 3.
    ddm_antennas = dict(enumerate(('None', 'Zenith', 'LHCP', 'RHCP')))


In [None]:
# Load the country-borders shapefile.
# NOTE(review): despite its name, this variable holds the object returned by
# gpd.read_file (the loaded layer), not a path string. GeoUtils.__init__ calls
# gpd.read_file on its argument, so it expects a path, whereas GeoHelper.__init__
# stores its argument unchanged — confirm which class this value is meant for.
# The absolute path is machine-specific.
world_shapefile_path = gpd.read_file('D:/GREAT/machine_learning/data/land_vs_water_country_borders/ne_110m_admin_0_countries.shp')

In [None]:
class GeoUtils:
    """Geographic and array helpers backed by a world-borders layer.

    The layer is read with ``gpd.read_file`` from the location passed to the
    constructor and is queried by :meth:`is_land`.
    """

    def __init__(self, world_shapefile_path):
        """Read the borders layer found at ``world_shapefile_path``."""
        self.world = gpd.read_file(world_shapefile_path)

    @staticmethod
    def add_seconds(time, seconds):
        """Shift a ``"%Y-%m-%d %H:%M:%S"`` timestamp string by ``seconds``.

        Args:
            time: Timestamp string in the format above.
            seconds: Offset passed to ``timedelta(seconds=...)``.

        Returns:
            The shifted timestamp, formatted the same way.
        """
        fmt = "%Y-%m-%d %H:%M:%S"
        shifted = datetime.strptime(time, fmt) + timedelta(seconds=seconds)
        return shifted.strftime(fmt)

    def is_land(self, lat, lon):
        """Return True when at least one geometry of the layer contains the point."""
        # The point is built with longitude first, then latitude.
        location = Point(lon, lat)
        hits = self.world.contains(location)
        return any(hits)

    @staticmethod
    def check_ocean_and_land(lst):
        """Tell whether ``lst`` holds both the ocean code and a land code.

        The ocean code is -1; any value between 1 and 7 (inclusive) counts as land.
        """
        ocean_present = -1 in lst
        land_present = False
        for code in lst:
            if 1 <= code <= 7:
                land_present = True
                break
        return ocean_present and land_present

    @staticmethod
    def fill_and_filter(arr):
        """Patch all-NaN slices with the mean of their siblings and drop empty rows.

        ``arr`` is indexed as ``arr[row, slot, :, :]``. A slot whose trailing 2-D
        matrix is entirely NaN is replaced by the element-wise ``nanmean`` of the
        non-empty slots of the same row. Rows in which every slot is empty are
        removed from the result.

        Returns:
            A pair ``(filtered, dropped)``: the patched array without the empty
            rows, and the list of indices of the rows that were removed.
        """
        all_nan = np.isnan(arr).all(axis=(2, 3))
        filled = arr.copy()
        for row, row_nan in enumerate(all_nan):
            # Nothing to patch, or nothing to patch with.
            if not row_nan.any() or row_nan.all():
                continue
            filled[row, row_nan] = np.nanmean(arr[row, ~row_nan], axis=0)
        dropped = all_nan.all(axis=1)
        return filled[~dropped], list(np.flatnonzero(dropped))

In [None]:

class GeoHelper:
    """Timestamp and array helpers bundled with an already-loaded world layer.

    The constructor stores ``world`` unchanged. Both static methods have the
    same bodies as the ones defined on ``GeoUtils``.
    """

    def __init__(self, world):
        """Keep a reference to ``world`` on the instance."""
        self.world = world

    @staticmethod
    def add_seconds(time, seconds):
        """Return ``time`` moved by ``seconds``, keeping the string format.

        ``time`` must follow ``"%Y-%m-%d %H:%M:%S"``; the result uses the same
        format.
        """
        pattern = "%Y-%m-%d %H:%M:%S"
        moment = datetime.strptime(time, pattern)
        moment = moment + timedelta(seconds=seconds)
        return moment.strftime(pattern)

    @staticmethod
    def fill_and_filter(arr):
        """Fill empty slots from their row's valid slots, then drop empty rows.

        With ``arr`` indexed as ``arr[row, slot, :, :]``, a slot is empty when
        its 2-D matrix is NaN everywhere. Empty slots receive the ``nanmean``
        (taken across slots) of the non-empty slots in the same row. Rows made
        only of empty slots are left out of the returned array.

        Returns:
            ``(filtered, dropped)`` where ``dropped`` lists the indices of the
            rows that were left out.
        """
        empty_slot = np.isnan(arr).all(axis=(2, 3))
        patched = arr.copy()
        for row_idx in range(arr.shape[0]):
            empty_cols = np.flatnonzero(empty_slot[row_idx])
            valid_cols = np.flatnonzero(~empty_slot[row_idx])
            # Only rows mixing empty and valid slots can be patched.
            if empty_cols.size == 0 or valid_cols.size == 0:
                continue
            replacement = np.nanmean(arr[row_idx, valid_cols], axis=0)
            patched[row_idx, empty_cols] = replacement
        row_is_empty = empty_slot.all(axis=1)
        kept_rows = patched[~row_is_empty]
        dropped_rows = list(np.flatnonzero(row_is_empty))
        return kept_rows, dropped_rows


In [None]:
class NetCDFPreprocessor:
    """Turn a directory of NetCDF files into a feature matrix and a label vector.

    Every retained sample contributes one row: its flattened ``raw_counts`` map,
    labelled with the matching ``sp_surface_type`` value.
    """

    # Row width (number of flattened raw_counts bins) a file's feature matrix
    # must have for the file to be used by process_all_files.
    EXPECTED_FEATURES = 200

    def __init__(self, root_dir):
        """Remember ``root_dir`` and list its entries.

        Args:
            root_dir: Directory that contains the ``.nc`` files.
        """
        self.root_dir = root_dir
        # os.listdir makes no ordering guarantee; sorting keeps the row order of
        # the assembled dataset stable from one run to the next.
        self.netcdf_file_list = sorted(os.listdir(root_dir))

    def preprocess(self, f):
        """Extract the usable samples of one open dataset.

        Args:
            f: Object exposing a ``variables`` mapping with the entries
                ``raw_counts``, ``ac_alt``, ``sp_alt``, ``sp_inc_angle``,
                ``sp_rx_gain_copol``, ``sp_rx_gain_xpol``, ``ddm_snr`` and
                ``sp_surface_type``. ``raw_counts`` is indexed as
                ``[i, j, :, :]`` and the per-sample variables as ``[i, j]``.

        Returns:
            ``(fit_data, label_data)``. ``fit_data`` is a 2-D array with one
            flattened ``raw_counts`` map per kept sample (an empty 1-D array
            when no sample is kept); ``label_data`` is the list of surface-type
            values of those same samples, in the same order.
        """
        raw_counts = np.array(f.variables['raw_counts'])

        # Distance between the specular point and the aircraft. Broadcasting the
        # altitude column against sp_alt avoids hard-coding the number of
        # columns (the original repeated it exactly 20 times).
        ac_alt = np.array(f.variables['ac_alt'])
        sp_alt = f.variables['sp_alt'][:]
        inc_angle = f.variables['sp_inc_angle'][:]
        dist = (ac_alt[:, np.newaxis] - sp_alt) / np.cos(np.deg2rad(inc_angle))

        # Keep samples with copol gain >= 5, xpol gain >= 5, SNR > 0 and a
        # distance within [2000, 10000], none of which may be NaN.
        copol = f.variables['sp_rx_gain_copol'][:]
        xpol = f.variables['sp_rx_gain_xpol'][:]
        snr = f.variables['ddm_snr'][:]

        # np.ma.getdata returns the underlying values for masked arrays and the
        # array itself for plain ndarrays (whose `.data` is a raw buffer).
        not_nan = (
            ~np.isnan(np.ma.getdata(copol))
            & ~np.isnan(np.ma.getdata(xpol))
            & ~np.isnan(np.ma.getdata(snr))
            & ~np.isnan(np.ma.getdata(dist))
        )
        keep_mask = (
            (copol >= 5) & (xpol >= 5) & (snr > 0)
            & ((dist >= 2000) & (dist <= 10000))
            & not_nan
        )
        # Masked entries count as "do not keep", which is also how
        # np.argwhere treats a masked array.
        keep = np.ma.filled(keep_mask, False)

        # Rejected samples stay NaN; kept samples are copied in one vectorised
        # assignment instead of a Python loop over index pairs.
        raw_counts_filtered = np.full(raw_counts.shape, np.nan, dtype=np.float32)
        raw_counts_filtered[keep] = raw_counts[keep]

        n_i, n_j, n_rows, n_cols = raw_counts_filtered.shape
        flat_counts = raw_counts_filtered.reshape(n_i * n_j, n_rows, n_cols)

        # A sample is usable when its map has no NaN and a strictly positive sum.
        keep_indices = np.where(
            np.all(~np.isnan(flat_counts), axis=(1, 2)) & (np.sum(flat_counts, axis=(1, 2)) > 0)
        )[0]
        fit_data = np.array([flat_counts[idx].ravel() for idx in keep_indices])

        surface_types = f.variables["sp_surface_type"][:]
        surface_types = np.nan_to_num(surface_types, nan=0)
        labels_flat = surface_types.ravel()
        # keep_indices is sorted, so direct indexing yields the same labels as
        # scanning every position with a membership test, in linear time.
        label_data = [labels_flat[idx] for idx in keep_indices]

        return fit_data, label_data

    def process_all_files(self, load_and_save_data: bool = True):
        """Build the full dataset, or reload it from the parquet cache.

        Args:
            load_and_save_data: When true (the default), every ``.nc`` file of
                ``root_dir`` is preprocessed and the result is written to
                ``processed_data/fit_data_multiclass.parquet`` and
                ``processed_data/labels_multiclass.parquet``. When false, those
                two files are read back instead.

        Returns:
            ``(data, labels)`` as numpy arrays.

        Raises:
            ValueError: If no file yields a feature matrix that is
                ``EXPECTED_FEATURES`` columns wide.
        """
        if load_and_save_data:
            full_data = []
            full_labels = []
            for file_name in tqdm(self.netcdf_file_list, desc="Processing files"):
                if not file_name.endswith('.nc'):
                    continue
                # The context manager closes the file handle even if
                # preprocessing fails; os.path.join copes with a root_dir that
                # lacks a trailing separator.
                with netCDF4.Dataset(os.path.join(self.root_dir, file_name)) as f:
                    data, labels = self.preprocess(f)
                full_data.append(data)
                full_labels.append(labels)

            # Keep only files whose feature matrix is 2-D with the expected width.
            valid_indices = [
                i for i, arr in enumerate(full_data)
                if arr.ndim == 2 and arr.shape[1] == self.EXPECTED_FEATURES
            ]
            if not valid_indices:
                raise ValueError(
                    f"No usable samples found in '{self.root_dir}': no file produced a "
                    f"feature matrix with {self.EXPECTED_FEATURES} columns."
                )

            # Apply the same selection to data and labels so they stay aligned.
            full_data_clean = [full_data[i] for i in valid_indices]
            full_labels_clean = [full_labels[i] for i in valid_indices]

            full_data_clean_flat = np.vstack(full_data_clean)
            full_labels_clean_flat = np.hstack(full_labels_clean)

            import pyarrow as pa
            import pyarrow.parquet as pq

            # Create the processed_data folder if it does not exist.
            os.makedirs('processed_data', exist_ok=True)

            # Save the features as compressed parquet. data_page_size is left at
            # its default: it is a size in bytes, not a row count.
            fit_data_df = pd.DataFrame(full_data_clean_flat)
            table_fit = pa.Table.from_pandas(fit_data_df, preserve_index=False)
            pq.write_table(
                table_fit,
                'processed_data/fit_data_multiclass.parquet',
                compression='zstd',
                use_dictionary=True,
            )
            del fit_data_df

            # Save the labels as compressed parquet.
            labels_df = pd.DataFrame(full_labels_clean_flat, columns=['label'])
            table_labels = pa.Table.from_pandas(labels_df, preserve_index=False)
            pq.write_table(
                table_labels,
                'processed_data/labels_multiclass.parquet',
                compression='zstd',
                use_dictionary=True,
            )

            del labels_df
        else:
            import polars as pl

            # Read the parquet files with polars.
            full_data_pl = pl.read_parquet('processed_data/fit_data_multiclass.parquet')
            full_labels_pl = pl.read_parquet('processed_data/labels_multiclass.parquet')

            # Convert to numpy arrays.
            full_data_clean_flat = full_data_pl.to_numpy()
            full_labels_clean_flat = full_labels_pl['label'].to_numpy()

        return full_data_clean_flat, full_labels_clean_flat
       
            
    
class ModelTrainer:
    """Search, tune and persist a PyCaret classification model.

    Attributes set on the instance:
        data: Feature matrix given to the constructor.
        labels: Target values given to the constructor.
        final_model: Result of ``finalize_model`` after :meth:`train`, else None.
        scaler: The ``MinMaxScaler`` fitted in :meth:`train`, else None.
    """

    def __init__(self, data, labels):
        """Store the training data; nothing is fitted yet."""
        self.data = data
        self.labels = labels
        self.final_model = None
        self.scaler = None

    def train(self, model_search=True, session_id=None):
        """Run the model search, tune the best candidate and save it.

        Args:
            model_search: When false the method only sets the logging
                environment variable and returns without training.
            session_id: Forwarded to PyCaret's ``setup``; pass an integer to
                make the experiment repeatable. ``None`` keeps PyCaret's
                default.

        Returns:
            None. The finalized model is stored in ``self.final_model`` and
            written to ``best_classification_model`` through ``save_model``.
        """
        os.environ["PYCARET_CUSTOM_LOGGING_LEVEL"] = "CRITICAL"
        if not model_search:
            return

        scaler = MinMaxScaler()
        fit_data_scaled = scaler.fit_transform(self.data)
        # Keep the fitted scaler: the model is trained on scaled features, so
        # new data must go through the same transform before prediction.
        # NOTE(review): the scaler is fitted on all rows before `setup` runs,
        # so any hold-out split made by PyCaret has already influenced the
        # scaling — confirm whether that is acceptable.
        self.scaler = scaler

        setup(data=fit_data_scaled,
              target=self.labels,
              pca=True,
              pca_method='incremental',
              use_gpu=True,
              session_id=session_id
              )
        best_models = compare_models(n_select=5)
        best_model = best_models[0]
        print(f"Il modello migliore è: {best_model}")
        tuned_model = tune_model(best_model,
                                 optimize='Recall',
                                 n_iter=10,
                                 search_library='optuna',
                                 search_algorithm='tpe',
                                 choose_better=True)
        print("Valutazione del modello ottimizzato:")
        evaluate_model(tuned_model)
        best_params = tuned_model.get_params()
        print("Migliori iperparametri trovati:")
        for param, value in best_params.items():
            print(f"{param}: {value}")
        self.final_model = finalize_model(tuned_model)
        save_model(self.final_model, 'best_classification_model')
        # To reload later: loaded_model = load_model('best_classification_model')

In [None]:
# Toggle: True reloads the parquet files saved by an earlier run; False rebuilds
# the dataset from the NetCDF files.
read_from_backup = False
if read_from_backup:
    import polars as pl

    # Read the parquet files with polars
    fit_data_pl = pl.read_parquet('processed_data/fit_data_multiclass.parquet')
    labels_pl = pl.read_parquet('processed_data/labels_multiclass.parquet')

    # Convert to numpy arrays
    fit_data = fit_data_pl.to_numpy()
    labels = labels_pl['label'].to_numpy()
else:
    # process_all_files() is called with its default argument, which takes the
    # process-and-save branch and writes the two parquet files read above.
    # The absolute root_dir is machine-specific.
    preprocessor = NetCDFPreprocessor(root_dir='D:/GREAT/machine_learning/data/Rongowai/')
    fit_data, labels = preprocessor.process_all_files()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified subsampling: the "test" part (test_size=0.90) is discarded, so the
# retained sample is the remaining 10% of the rows, with class proportions
# preserved by stratify=labels. random_state fixes the draw.
fit_data_sample, _, labels_sample, _ = train_test_split(
    fit_data, labels, test_size=0.90, stratify=labels, random_state=42
)

In [None]:
# Sanity check: the sampled features and labels should have the same length.
len(fit_data_sample), len(labels_sample)

In [None]:

# Train on the stratified sample; model_search=True runs the full
# compare / tune / finalize / save sequence defined in ModelTrainer.train.
model_trainer = ModelTrainer(data=fit_data_sample, labels=labels_sample)
model_trainer.train(model_search=True)