In [1]:
import netCDF4
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
from tqdm import tqdm
from shapely.geometry import Point
from scipy.stats import skew, kurtosis, entropy
from scipy.fft import fft
from sklearn.preprocessing import MinMaxScaler
import os
from pycaret.classification import setup, compare_models, tune_model, finalize_model, save_model, plot_model, evaluate_model, dashboard, save_experiment, blend_models, get_config
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed

In [2]:
class DDMFeatureExtractor:
    """Derive a flat set of hand-crafted numeric features from 1-D DDM rows."""

    def __init__(self):
        pass

    @staticmethod
    def gini(array):
        """Return the Gini coefficient of a 1-D collection of values.

        The values are sorted ascending and combined with the rank-weighted
        formula ``sum((2*i - n - 1) * v_i) / (n * sum(v))`` for ranks
        ``i = 1..n``.
        """
        values = np.sort(array)
        n = values.shape[0]
        ranks = np.arange(1, n + 1)
        return np.sum((2 * ranks - n - 1) * values) / (n * np.sum(values))

    def extract_ddm_features(self, fit_data: np.ndarray) -> list:
        """
        Extract features from DDM data.

        Args:
            fit_data: iterable of rows; each row is converted to a 1-D
                float64 array. Rows are split into 3 and 5 segments, so rows
                shorter than 5 samples are presumably unsupported (an empty
                segment makes ``np.max`` fail) - TODO confirm with callers.

        Returns:
            A list with one dict per input row, mapping feature name to
            value. Key insertion order is the same for every row.
        """
        features = []

        for row in tqdm(fit_data, desc="Extracting DDM features"):
            f = {}
            x = np.array(row, dtype=np.float64) + 1e-10  # avoid log(0)

            # Values reused by several features below; computed once per row.
            positions = np.arange(len(x))
            lowest = np.min(x)
            highest = np.max(x)
            total = np.sum(x)

            # 1. General statistics
            f['mean'] = np.mean(x)
            f['std'] = np.std(x)
            f['min'] = lowest
            f['max'] = highest
            f['median'] = np.median(x)
            f['range'] = highest - lowest
            f['skew'] = skew(x)
            f['kurtosis'] = kurtosis(x)
            f['entropy'] = entropy(x)
            f['gini'] = self.gini(x)

            # 2. Positional
            f['peak_index'] = np.argmax(x)
            f['peak_value'] = highest
            f['center_of_mass'] = np.sum(positions * x) / total
            f['inertia'] = np.sum(((positions - f['center_of_mass']) ** 2) * x)

            # 3. Segmentation in thirds
            for i, part in enumerate(np.array_split(x, 3), start=1):
                f[f'sum_third_{i}'] = np.sum(part)
                f[f'mean_third_{i}'] = np.mean(part)
                f[f'max_third_{i}'] = np.max(part)

            # 3.1 Segmentation in 5 windows
            for i, w in enumerate(np.array_split(x, 5), start=1):
                f[f'mean_w{i}'] = np.mean(w)
                f[f'std_w{i}'] = np.std(w)
                f[f'max_w{i}'] = np.max(w)

            # 4. Statistics of the first differences
            dx = np.diff(x)
            f['mean_diff'] = np.mean(dx)
            f['std_diff'] = np.std(dx)
            f['max_diff'] = np.max(dx)
            f['min_diff'] = np.min(dx)
            f['n_positive_diff'] = np.sum(dx > 0)
            f['n_negative_diff'] = np.sum(dx < 0)
            f['n_zero_diff'] = np.sum(dx == 0)

            # 5. Autocorrelations (lag 1-3): correlation between the row and
            #    a copy of itself shifted by `lag` samples.
            for lag in range(1, 4):
                ac = np.corrcoef(x[:-lag], x[lag:])[0, 1] if len(x) > lag else np.nan
                f[f'autocorr_lag{lag}'] = ac

            # 6. FFT magnitude; only the first half of the spectrum is used.
            spectrum = np.abs(fft(x))
            half_spectrum = spectrum[:len(spectrum) // 2]
            f['fft_peak_freq'] = np.argmax(half_spectrum)
            f['fft_max'] = np.max(half_spectrum)
            f['fft_median'] = np.median(half_spectrum)
            f['fft_mean'] = np.mean(half_spectrum)

            features.append(f)
        return features

In [49]:
class ModelTrainer:
    """Search, tune, evaluate and persist PyCaret binary classifiers.

    ``data`` is the feature table and ``labels`` a DataFrame whose first
    column holds the class of each row (the per-class sampling groups on
    ``labels.iloc[:, 0]``).
    """

    # Estimators skipped by compare_models.
    _EXCLUDED_MODELS = ('gbc', 'dummy', 'qda', 'lda', 'nb', 'svm')  # Exclude slowest models

    # Diagnostic plots, in display order:
    # (PyCaret plot id, progress message, error-message prefix).
    # The messages are runtime output and are kept verbatim.
    _DIAGNOSTIC_PLOTS = (
        ('confusion_matrix',
         "Creazione della matrice di confusione del modello...",
         "Errore durante la creazione della matrice di confusione"),
        ('feature_all',
         "Creazione del grafico delle feature del modello...",
         "Errore durante la creazione del grafico delle feature"),
        ('feature',
         "Creazione del grafico delle feature del modello (top 20)...",
         "Errore durante la creazione del grafico delle feature (top 20)"),
        ('pipeline',
         "Creazione del grafico pipeline del modello...",
         "Errore durante la creazione del grafico pipeline"),
        ('auc',
         "Creazione curva auc...",
         "Errore durante la creazione della curva AUC"),
        ('vc',
         "Creazione del grafico di vc...",
         "Errore durante la creazione del grafico di VC"),
        ('class_report',
         "Creazione del report di classificazione del modello...",
         "Errore durante la creazione del report di classificazione"),
        ('pr',
         "Creazione del grafico PR del modello...",
         "Errore durante la creazione del grafico PR"),
        ('calibration',
         "Calibrazione del modello...",
         "Errore durante la creazione del grafico di calibrazione"),
    )

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        # Set by search_and_train_single_model once a model has been finalised.
        self.final_model = None
        # Set by train_ensemble_model once the blended model has been finalised.
        self.final_ensembled_model = None

    def visualize_model_performances(self, model):
        """Show the PyCaret evaluation UI and a fixed series of diagnostic plots.

        Every plot is drawn twice: once saved to disk (``save=True``) and once
        displayed (``save=False``). Each step is best-effort: a failure is
        printed and the remaining plots are still attempted.
        """
        try:
            print("Valutazione del modello...")
            evaluate_model(model)
        except Exception as e:
            print(f"Errore durante la valutazione del modello: {e}")

        for plot_name, progress_message, error_prefix in self._DIAGNOSTIC_PLOTS:
            try:
                print(progress_message)
                plot_model(model, plot=plot_name, save=True)
                plot_model(model, plot=plot_name, save=False)
            except Exception as e:
                print(f"{error_prefix}: {e}")

    def _select_training_data(self, n_sample_per_class):
        """Return ``(features_df, labels_df)`` with a fresh 0..n-1 index.

        ``n_sample_per_class <= 0`` keeps every row; otherwise exactly
        ``n_sample_per_class`` rows are drawn per class (seeded with 42).
        """
        if n_sample_per_class <= 0:
            print("No sampling done, using all data.")
            return self.data.reset_index(drop=True), self.labels.reset_index(drop=True)

        # Group on the first label column and sample inside each group; level 1
        # of the resulting MultiIndex carries the original row labels.
        sampled_indices = (
            self.labels.groupby(self.labels.iloc[:, 0])
            .apply(lambda x: x.sample(n=n_sample_per_class, random_state=42))
            .index.get_level_values(1)
        )
        features_df = self.data.loc[sampled_indices].reset_index(drop=True)
        labels_df = self.labels.loc[sampled_indices].reset_index(drop=True)
        return features_df, labels_df

    @staticmethod
    def _target_series(labels_df):
        """Return the target as a Series named 'label' (first column as fallback)."""
        if 'label' in labels_df.columns:
            return labels_df['label']
        return labels_df.iloc[:, 0].rename('label')

    def _setup_and_compare(self, features_df, labels_df):
        """Initialise the PyCaret experiment and return the top 3 models by accuracy."""
        # Min-max scaling is requested from PyCaret instead of being fitted by
        # hand on the whole table: the scaler then becomes part of the
        # experiment pipeline (fitted on the training split, stored by
        # save_model) rather than a throw-away object fitted on all rows.
        # Passing the DataFrame also keeps the original feature names.
        setup(data=features_df,
              target=self._target_series(labels_df),
              normalize=True,
              normalize_method='minmax',
              #pca=True,
              #pca_method='incremental',
              use_gpu=True,
              feature_selection=True,
              n_features_to_select=.4,
              )
        return compare_models(n_select=3,
                              exclude=list(self._EXCLUDED_MODELS),
                              sort='Accuracy',
                              )

    def search_and_train_single_model(self, model_search=True, n_sample_per_class: int = 0):
        """Pick, tune, finalise and save the single best model.

        Args:
            model_search: when False only the data selection runs and nothing
                is trained.
            n_sample_per_class: rows to draw per class; ``<= 0`` (the default)
                uses all rows.

        Side effects: stores the finalised model in ``self.final_model``,
        saves 'best_binary_classification_model' and
        'binary_classification_experiment', then draws the diagnostic plots.
        """
        os.environ["PYCARET_CUSTOM_LOGGING_LEVEL"] = "CRITICAL"

        features_df, labels_df = self._select_training_data(n_sample_per_class)
        if n_sample_per_class > 0:
            print(f"Training data dimensions: {features_df.shape}")
            print(f"Labels dimension: {labels_df.shape}")

        if not model_search:
            return

        best_models = self._setup_and_compare(features_df, labels_df)
        print(f"Best model is: {best_models[0]}")

        print("Fine tuning the best model...")
        tuned_model = tune_model(best_models[0],
                                 optimize='Accuracy',
                                 n_iter=10,
                                 search_library='optuna',
                                 search_algorithm='tpe',
                                 choose_better=True)
        print("Trained model evalutation:")

        print("Best hyperparameters:")
        for param, value in tuned_model.get_params().items():
            print(f"{param}: {value}")

        self.final_model = finalize_model(tuned_model)

        # Saving trained model
        save_model(self.final_model, 'best_binary_classification_model')
        print("Final model saved as 'best_binary_classification_model'.")

        # Only checks that the experiment pipeline can be retrieved.
        try:
            get_config("pipeline")
        except Exception as e:
            print(f"Error during config data retrieval: {e}")
        else:
            print("Pipeline configuration correctly saved.")

        # Save the experiment
        save_experiment('binary_classification_experiment')
        print("Experiment saved as 'binary_classification_experiment'.")
        self.visualize_model_performances(self.final_model)

    def train_ensemble_model(self, n_sample_per_class: int = 0):
        """Blend the three best models into a soft-voting ensemble and save it.

        Args:
            n_sample_per_class: rows to draw per class; ``<= 0`` (the default)
                uses all rows.

        Side effects: draws the diagnostic plots for the blended model, stores
        the finalised ensemble in ``self.final_ensembled_model`` and saves
        'best_binary_classification_ensembled_model' and
        'binary_classification_ensembled_experiment'.
        """
        os.environ["PYCARET_CUSTOM_LOGGING_LEVEL"] = "CRITICAL"

        features_df, labels_df = self._select_training_data(n_sample_per_class)
        if n_sample_per_class > 0:
            print(f"Training data dimensions: {features_df.shape}")
            print(f"Labels dimensions: {labels_df.shape}")

        best_models = self._setup_and_compare(features_df, labels_df)

        print("Ensembling the best models...")
        best_models = [model for model in best_models if model is not None]
        print(f"Model selected for ensembling: {best_models}")
        ensembled_models = blend_models(best_models,
                                        method='soft',
                                        fold=5,
                                        optimize='Accuracy',
                                        )
        if ensembled_models is None:
            print("No ensembled models created. Please check the model selection and blending process.")
            return

        # Only checks that the experiment pipeline can be retrieved.
        try:
            get_config("pipeline")
        except Exception as e:
            print(f"Error during the retrieval of the configurations info: {e}")
        else:
            print("Pipeline config correcly saved.")

        self.visualize_model_performances(ensembled_models)

        self.final_ensembled_model = finalize_model(ensembled_models)
        save_model(self.final_ensembled_model, 'best_binary_classification_ensembled_model')
        save_experiment('binary_classification_ensembled_experiment')

In [4]:
import json

# Machine-specific location of the exported data dictionary; adjust when
# running on another host.
json_path = r"E:\data\geo_k_compressed\full_data_dict.json"
# Read the whole file as text and parse it in one go.
with open(json_path, mode="r") as f:
    full_data_dict = json.loads(f.read())

In [5]:
def dict_to_numpy(dizionario):
    """
    Flatten a per-file dictionary of compressed arrays into NumPy arrays.

    Args:
        dizionario: {"file_name": {"compressed_data": [...], "labels": [...]}}

    Returns:
        data_matrix: NumPy array with one row per entry of every
            "compressed_data" list, in dictionary order, i.e.
            (n_samples, n_features) when all entries have the same length
        labels_array: NumPy array with one label per row of data_matrix;
            rows without a matching label get None, surplus labels are
            ignored
    """
    all_data = []
    all_labels = []

    for nome_file, contenuto in dizionario.items():
        compressed_data = contenuto["compressed_data"]
        labels = contenuto["labels"]

        # Warn when the number of labels does not match the number of arrays
        if len(labels) != len(compressed_data):
            print(f"Attenzione: {nome_file} ha {len(compressed_data)} array ma {len(labels)} labels")

        n_rows = len(compressed_data)
        all_data.extend(compressed_data)
        # Keep labels aligned with rows: drop surplus labels, pad missing ones.
        all_labels.extend(labels[:n_rows])
        all_labels.extend([None] * (n_rows - len(labels)))

    # Convert to NumPy arrays (rows are samples)
    data_matrix = np.array(all_data)
    labels_array = np.array(all_labels)

    return data_matrix, labels_array

In [6]:
# Flatten the loaded dictionary into one data array and one label array.
full_data, full_labels = dict_to_numpy(full_data_dict)

In [7]:
# Display both shapes; dict_to_numpy appends one label per data row, so the
# first dimensions should match.
full_data.shape, full_labels.shape

((2555904, 20), (2555904,))

In [40]:
features_extractor = DDMFeatureExtractor()


def extract_ddm_features_row(row):
    """Run the shared extractor on one row, wrapped as a single-row batch.

    Returns whatever the extractor returns for that batch (a one-element
    list), which the following cells unwrap.
    """
    single_row_batch = np.array([row])
    return features_extractor.extract_ddm_features(single_row_batch)


# One joblib task per row of full_data, spread over 12 loky workers; the
# progress bar tracks task submission.
combined_features = Parallel(n_jobs=12, backend="loky")(
    delayed(extract_ddm_features_row)(row)
    for row in tqdm(full_data, desc="Estrazione features")
)

Estrazione features: 100%|██████████| 2555904/2555904 [06:27<00:00, 6588.51it/s]


In [41]:
# Feature names, taken in insertion order from the first extracted row; they
# become the column order of the feature matrix built below.
FEATURES = list(combined_features[0][0])

In [42]:
# Unwrap each non-empty list result to its first element; anything else is
# kept as-is.
flat_features = [
    row[0] if (isinstance(row, list) and row) else row
    for row in combined_features
]


In [None]:


# Replace the nested per-row results with a dense 2-D array whose columns
# follow FEATURES; the intermediates are deleted to release memory.
del combined_features
combined_features = np.array([
    [feature_row[name] for name in FEATURES]
    for feature_row in flat_features
])
del flat_features

# Check for NaN and infinite values: keep only rows in which every feature is
# finite and strictly below the largest representable float64 magnitude.
mask_finite = np.logical_and(
    np.isfinite(combined_features).all(axis=1),
    (np.abs(combined_features) < np.finfo(np.float64).max).all(axis=1),
)

fit_data_with_features_clean = combined_features[mask_finite]
labels_clean = full_labels[mask_finite]


In [44]:
# Shapes after the finite-value filter; both arrays were indexed with the
# same mask, so their first dimensions match.
fit_data_with_features_clean.shape, labels_clean.shape

((2555904, 52), (2555904,))

In [45]:
# Saving features and labels
save = True
if save:
    # Machine-specific output folder, declared once so both files are
    # guaranteed to land in the directory that was just created.
    output_dir = 'C:/Users/atogni/Desktop/rongowai/geo-k-compression_model/binary_classification/data_w_features'
    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(fit_data_with_features_clean, columns=FEATURES).to_parquet(f'{output_dir}/combined_features.parquet', index=False)
    # Name the label column explicitly: an unnamed column gets the integer
    # name 0, which is not a valid (string) Parquet column name, and the
    # trainer in this notebook reads the target from a column called 'label'.
    pd.DataFrame(labels_clean, columns=['label']).to_parquet(f'{output_dir}/labels_binary.parquet', index=False)

In [None]:
fit_data_with_features_df = pd.DataFrame(fit_data_with_features_clean, columns=FEATURES)
# ModelTrainer reads the target with labels_df['label'], so the column must
# carry that name (it was '0', which raises KeyError on a fresh run).
labels_clean_df = pd.DataFrame(labels_clean, columns=['label'])
# Drop the array now that the DataFrame exists; re-running this cell
# therefore requires re-running the filtering cell first.
del fit_data_with_features_clean
fit_data_with_features_df.head()

Unnamed: 0,mean,std,min,max,median,range,skew,kurtosis,entropy,gini,...,n_positive_diff,n_negative_diff,n_zero_diff,autocorr_lag1,autocorr_lag2,autocorr_lag3,fft_peak_freq,fft_max,fft_median,fft_mean
0,2.067257,1.556976,1e-10,5.397673,1.914673,5.397673,0.486536,-0.51207,2.651709,0.421555,...,10.0,9.0,0.0,0.235397,0.52755,0.021427,0.0,41.345137,4.968898,9.691896
1,0.839392,0.533016,1e-10,1.738595,0.912359,1.738595,-0.253523,-0.945614,2.708911,0.359373,...,8.0,11.0,0.0,-0.033208,0.221307,0.182736,0.0,16.787842,2.502832,3.759962
2,1.926672,1.369952,1e-10,4.614893,1.711636,4.614893,0.255423,-0.800592,2.673219,0.400862,...,8.0,11.0,0.0,0.232553,0.473585,0.089278,0.0,38.533433,5.81585,8.855536
3,1.417735,0.894406,1e-10,2.867296,1.418138,2.867296,-0.229625,-0.881034,2.712801,0.35478,...,7.0,12.0,0.0,0.033329,0.285966,0.205698,0.0,28.354698,3.75374,6.290839
4,2.081495,1.555314,1e-10,5.358918,1.863334,5.358918,0.456955,-0.558682,2.654891,0.418659,...,10.0,9.0,0.0,0.237994,0.52954,0.028895,0.0,41.629903,5.207048,9.691243


In [47]:
# Inspect the label frame that is handed to ModelTrainer.
labels_clean_df

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1
...,...
2555899,1
2555900,1
2555901,1
2555902,1


In [51]:
# Bind the feature table and its labels to a trainer; nothing is trained yet.
model_trainer = ModelTrainer(data=fit_data_with_features_df, labels=labels_clean_df)


In [None]:
# Draw 250000 rows per class, then compare, tune, finalise and save the best
# single model.
model_trainer.search_and_train_single_model(model_search=True, n_sample_per_class=250000)

Training data dimensions: (500000, 52)
Labels dimension: (500000, 1)
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [In

Unnamed: 0,Description,Value
0,Session id,1753
1,Target,label
2,Target type,Binary
3,Original data shape,"(500000, 53)"
4,Transformed data shape,"(500000, 21)"
5,Transformed train set shape,"(350000, 21)"
6,Transformed test set shape,"(150000, 21)"
7,Numeric features,52
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8811,0.9465,0.921,0.853,0.8857,0.7623,0.7647,3.566
rf,Random Forest Classifier,0.8796,0.9455,0.9193,0.8517,0.8842,0.7592,0.7616,13.542
xgboost,Extreme Gradient Boosting,0.8735,0.9413,0.9115,0.8471,0.8781,0.7469,0.7491,1.811
knn,K Neighbors Classifier,0.8631,0.9208,0.8841,0.8485,0.8659,0.7262,0.7268,10.67
catboost,CatBoost Classifier,0.8581,0.9282,0.9056,0.827,0.8645,0.7162,0.7195,6.346
lightgbm,Light Gradient Boosting Machine,0.858,0.9287,0.9061,0.8265,0.8645,0.716,0.7193,2.211
dt,Decision Tree Classifier,0.8113,0.8113,0.8073,0.8139,0.8106,0.6227,0.6227,11.036
lr,Logistic Regression,0.8039,0.8831,0.8628,0.7719,0.8148,0.6078,0.6121,2.007
ada,Ada Boost Classifier,0.7985,0.8763,0.8408,0.7752,0.8067,0.597,0.5992,34.105
ridge,Ridge Classifier,0.7943,0.8777,0.8729,0.7543,0.8093,0.5885,0.596,1.392


Best model is: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=1753, verbose=0,
                     warm_start=False)
Fine tuning the best model...


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

KeyboardInterrupt: 