In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform, randint
import warnings
import netCDF4
import numpy as np
import pandas as pd
import geopandas as gpd

from datetime import datetime, timedelta
from tqdm import tqdm
from shapely.geometry import Point

from scipy.stats import skew, kurtosis, entropy

from scipy.fft import fft
from sklearn.preprocessing import MinMaxScaler


import os

from pycaret.classification import *

import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

class LightGBMBinaryPipeline:
    """End-to-end workflow around a LightGBM binary classifier.

    Stages: preprocessing with a stratified train/validation/test split,
    randomized hyper-parameter search, final fit with early stopping,
    evaluation, and diagnostic plots.

    NOTE(review): the metrics in `evaluate_model` use scikit-learn's default
    binary averaging, so the labels are assumed to contain exactly two
    classes - confirm against the data that is actually passed in.
    """

    def __init__(self, fit_data, labels_data, test_size=0.2, val_size=0.2, random_state=42):
        """
        Store the inputs and initialise every attribute the pipeline fills later.

        Args:
            fit_data: DataFrame or array holding the features
            labels_data: Series or array holding the labels
            test_size: fraction of the data held out as test set
            val_size: fraction of the remaining data held out as validation set
            random_state: seed used for the splits, the search and the model
        """
        self.fit_data = fit_data
        self.labels_data = labels_data
        self.test_size = test_size
        self.val_size = val_size
        self.random_state = random_state

        # Models and results
        self.best_model = None
        self.best_params = None
        self.train_scores = {}
        self.val_scores = {}
        self.test_scores = {}

        # Preprocessed data
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None

        # Categorical features. `categorical_indices` is created here so that
        # methods called before preprocess_data() do not hit AttributeError.
        self.categorical_features = []
        self.categorical_indices = []
        self.label_encoders = {}

        # Training history
        self.training_history = None

    def _categorical_fit_kwargs(self):
        """Return the fit() keyword arguments that declare the categorical columns."""
        # LightGBM ignores `categorical_feature` when it is given to the
        # estimator constructor; it has to be passed to fit().
        if self.categorical_indices:
            return {'categorical_feature': list(self.categorical_indices)}
        return {}

    def _best_iteration(self):
        """Return the best boosting iteration of the current model, or None."""
        # The sklearn wrapper exposes `best_iteration_`; a raw Booster (as
        # installed by load_model) exposes `best_iteration`.
        for attr in ('best_iteration_', 'best_iteration'):
            value = getattr(self.best_model, attr, None)
            if value is not None:
                return value
        return None

    def preprocess_data(self):
        """Label-encode categorical columns and split the data into train/val/test."""
        print("🔄 Preprocessing dei dati...")

        # Convert to DataFrame if needed
        if not isinstance(self.fit_data, pd.DataFrame):
            self.fit_data = pd.DataFrame(self.fit_data)

        # Work on a copy so the caller's data is left untouched
        X = self.fit_data.copy()

        # Identify and encode categorical features; state is reset so that a
        # second call does not keep encoders from a previous run.
        self.categorical_features = []
        self.label_encoders = {}
        categorical_indices = []

        for idx, col in enumerate(X.columns):
            if X[col].dtype == 'object' or X[col].dtype.name == 'category':
                self.categorical_features.append(col)
                categorical_indices.append(idx)
                # Label encoding for LightGBM
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))
                self.label_encoders[col] = le

        # Positional indices are what gets handed to LightGBM
        self.categorical_indices = categorical_indices

        print(f"📊 Feature categoriche identificate: {self.categorical_features}")
        print(f"📊 Indici feature categoriche: {categorical_indices}")

        # Train+val / test split
        X_temp, self.X_test, y_temp, self.y_test = train_test_split(
            X, self.labels_data,
            test_size=self.test_size,
            stratify=self.labels_data,
            random_state=self.random_state
        )

        # Train / validation split
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_temp, y_temp,
            test_size=self.val_size,
            stratify=y_temp,
            random_state=self.random_state
        )

        print("📈 Dimensioni datasets:")
        print(f"  - Train: {self.X_train.shape}")
        print(f"  - Validation: {self.X_val.shape}")
        print(f"  - Test: {self.X_test.shape}")

        # Class statistics
        print(f"📊 Distribuzione classi (Train): {pd.Series(self.y_train).value_counts().to_dict()}")

    def define_search_space(self):
        """Return the parameter distributions sampled by the randomized search."""
        param_distributions = {
            'n_estimators': randint(50, 1000),
            'learning_rate': uniform(0.01, 0.2),
            'max_depth': randint(3, 15),
            'num_leaves': randint(10, 300),
            'min_child_samples': randint(5, 100),
            'min_child_weight': uniform(1e-3, 10),
            'subsample': uniform(0.6, 0.4),  # 0.6 to 1.0
            'colsample_bytree': uniform(0.6, 0.4),  # 0.6 to 1.0
            'reg_alpha': uniform(0, 10),
            'reg_lambda': uniform(0, 10),
            'min_split_gain': uniform(0, 1),
            'subsample_freq': randint(0, 10),
            'class_weight': [None, 'balanced'],
        }
        return param_distributions

    def hyperparameter_search(self, n_iter=50, cv_folds=5, scoring='roc_auc'):
        """
        Run a randomized hyper-parameter search on the training split.

        Stores the winning parameters in `best_params` and the refitted
        estimator in `best_model`.

        Args:
            n_iter: number of parameter settings sampled by RandomizedSearchCV
            cv_folds: number of stratified cross-validation folds
            scoring: scikit-learn scoring name used to rank candidates
        """
        print(f"🔍 Avvio ricerca iperparametri ({n_iter} iterazioni, {cv_folds}-fold CV)...")

        # Base model
        base_model = LGBMClassifier(
            random_state=self.random_state,
            objective='binary',
            metric='auc',
            boosting_type='gbdt',
            verbose=-1,
            n_jobs=-1,
        )

        # Search space
        param_distributions = self.define_search_space()

        # Stratified cross-validation
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.random_state)

        random_search = RandomizedSearchCV(
            estimator=base_model,
            param_distributions=param_distributions,
            n_iter=n_iter,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            random_state=self.random_state,
            verbose=1
        )

        # Extra keyword arguments are forwarded to LGBMClassifier.fit
        random_search.fit(self.X_train, self.y_train, **self._categorical_fit_kwargs())

        # Keep the best parameters and the refitted estimator
        self.best_params = random_search.best_params_
        self.best_model = random_search.best_estimator_

        print("✅ Ricerca completata!")
        print(f"🏆 Migliore score CV: {random_search.best_score_:.4f}")
        print("🎯 Migliori parametri:")
        for param, value in self.best_params.items():
            print(f"  - {param}: {value}")

    def train_final_model(self):
        """Fit the final model on the training split with early stopping on validation."""
        print("🚀 Addestramento modello finale...")

        if self.best_params is None:
            print("⚠️  Nessun iperparametro trovato. Uso parametri di default.")
            self.best_model = LGBMClassifier(
                random_state=self.random_state,
                objective='binary',
                metric='auc',
                verbose=-1,
            )

        # Early stopping and periodic logging go through callbacks: the
        # `early_stopping_rounds` / `verbose` fit keywords are rejected by
        # LightGBM >= 4.
        self.best_model.fit(
            self.X_train,
            self.y_train,
            eval_set=[(self.X_val, self.y_val)],
            eval_names=['validation'],
            eval_metric='auc',
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=100),
            ],
            **self._categorical_fit_kwargs()
        )

        # Keep the evaluation history for plot_training_history
        if hasattr(self.best_model, 'evals_result_'):
            self.training_history = self.best_model.evals_result_

        print("✅ Addestramento completato!")
        print(f"📊 Best iteration: {self._best_iteration()}")

    def evaluate_model(self):
        """Score the model on train/validation/test and return a dict of metric dicts."""
        print("📊 Valutazione del modello...")

        datasets = {
            'train': (self.X_train, self.y_train),
            'validation': (self.X_val, self.y_val),
            'test': (self.X_test, self.y_test)
        }

        all_scores = {}

        for name, (X, y) in datasets.items():
            # Hard predictions and probability of the positive class
            y_pred = self.best_model.predict(X)
            y_pred_proba = self.best_model.predict_proba(X)[:, 1]

            scores = {
                'accuracy': accuracy_score(y, y_pred),
                'precision': precision_score(y, y_pred),
                'recall': recall_score(y, y_pred),
                'f1': f1_score(y, y_pred),
                'roc_auc': roc_auc_score(y, y_pred_proba)
            }

            all_scores[name] = scores

            print(f"\n📈 Metriche {name.upper()}:")
            for metric, value in scores.items():
                print(f"  - {metric.upper()}: {value:.4f}")

        # Store the results on the instance
        self.train_scores = all_scores['train']
        self.val_scores = all_scores['validation']
        self.test_scores = all_scores['test']

        return all_scores

    def plot_feature_importance(self, top_n=20, importance_type='gain'):
        """Plot the `top_n` most important features for the given importance type."""
        if self.best_model is None:
            print("⚠️  Modello non ancora addestrato!")
            return

        # Works for both the sklearn wrapper and a raw Booster
        booster = getattr(self.best_model, 'booster_', self.best_model)

        if hasattr(self.X_train, 'columns'):
            feature_names = self.X_train.columns
        elif self.X_train is not None:
            feature_names = [f'feature_{i}' for i in range(self.X_train.shape[1])]
        else:
            feature_names = booster.feature_name()

        # Ask the booster for the requested type; `feature_importances_`
        # would always return the estimator's own importance_type and make
        # the plot title wrong.
        importance = booster.feature_importance(importance_type=importance_type)

        # DataFrame used for plotting
        fi_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False).head(top_n)

        plt.figure(figsize=(12, 8))
        sns.barplot(data=fi_df, x='importance', y='feature')
        plt.title(f'Top {top_n} Feature Importance (LightGBM - {importance_type})')
        plt.xlabel('Importance Score')
        plt.tight_layout()
        plt.show()

        # Additional plot using LightGBM's own helper (best effort)
        try:
            lgb.plot_importance(self.best_model, max_num_features=top_n, importance_type=importance_type, figsize=(12, 8))
            plt.title(f'LightGBM Native Feature Importance ({importance_type})')
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"⚠️  Impossibile creare il plot nativo di LightGBM: {e}")

    def plot_training_history(self):
        """Plot every recorded evaluation metric against the boosting iteration."""
        if self.training_history is None:
            print("⚠️  Storia del training non disponibile!")
            return

        plt.figure(figsize=(12, 6))

        for dataset_name, metrics in self.training_history.items():
            for metric_name, values in metrics.items():
                plt.plot(values, label=f'{dataset_name}_{metric_name}')

        plt.title('Training History - LightGBM')
        plt.xlabel('Iterations')
        plt.ylabel('Metric Value')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Mark the best iteration with a vertical line when it is known
        best_iteration = self._best_iteration()
        if best_iteration is not None:
            plt.axvline(x=best_iteration, color='red', linestyle='--',
                        label=f'Best Iteration ({best_iteration})')
            plt.legend()

        plt.tight_layout()
        plt.show()

    def plot_confusion_matrices(self):
        """Draw the confusion matrix of each split side by side."""
        datasets = {
            'Train': (self.X_train, self.y_train),
            'Validation': (self.X_val, self.y_val),
            'Test': (self.X_test, self.y_test)
        }

        fig, axes = plt.subplots(1, 3, figsize=(15, 4))

        for idx, (name, (X, y)) in enumerate(datasets.items()):
            y_pred = self.best_model.predict(X)
            cm = confusion_matrix(y, y_pred)

            sns.heatmap(cm, annot=True, fmt='d', ax=axes[idx], cmap='Blues')
            axes[idx].set_title(f'Confusion Matrix - {name}')
            axes[idx].set_xlabel('Predicted')
            axes[idx].set_ylabel('Actual')

        plt.tight_layout()
        plt.show()

    def plot_tree_structure(self, tree_index=0):
        """Render one tree of the model; prints a hint instead of raising on failure."""
        try:
            lgb.plot_tree(self.best_model, tree_index=tree_index, figsize=(20, 15), show_info=['split_gain'])
            plt.title(f'LightGBM Tree Structure (Tree {tree_index})')
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"⚠️  Impossibile visualizzare l'albero: {e}")
            print("Assicurati di avere graphviz installato: pip install graphviz")

    def get_classification_reports(self):
        """Print scikit-learn's classification report for each split."""
        datasets = {
            'Train': (self.X_train, self.y_train),
            'Validation': (self.X_val, self.y_val),
            'Test': (self.X_test, self.y_test)
        }

        for name, (X, y) in datasets.items():
            y_pred = self.best_model.predict(X)
            print(f"\n📋 CLASSIFICATION REPORT - {name.upper()}")
            print("=" * 50)
            print(classification_report(y, y_pred))

    def get_model_complexity_info(self):
        """Print the main capacity-related settings of the current model."""
        if self.best_model is None:
            print("⚠️  Modello non ancora addestrato!")
            return

        print("\n🌲 INFORMAZIONI MODELLO LIGHTGBM:")
        print(f"  - Numero alberi: {self.best_model.n_estimators}")
        print(f"  - Learning rate: {self.best_model.learning_rate}")
        print(f"  - Max depth: {self.best_model.max_depth}")
        print(f"  - Num leaves: {self.best_model.num_leaves}")
        print(f"  - Min child samples: {self.best_model.min_child_samples}")
        print(f"  - Subsample: {self.best_model.subsample}")
        print(f"  - Feature fraction: {self.best_model.colsample_bytree}")
        best_iteration = self._best_iteration()
        if best_iteration is not None:
            print(f"  - Best iteration: {best_iteration}")
        print(f"  - Objective: {self.best_model.objective}")
        print(f"  - Boosting type: {self.best_model.boosting_type}")

    def analyze_overfitting(self):
        """Compare train/validation/test AUC; requires evaluate_model() to have run."""
        print("\n🔍 ANALISI OVERFITTING:")

        train_auc = self.train_scores['roc_auc']
        val_auc = self.val_scores['roc_auc']
        test_auc = self.test_scores['roc_auc']

        train_val_gap = train_auc - val_auc
        train_test_gap = train_auc - test_auc

        print(f"  - AUC Train: {train_auc:.4f}")
        print(f"  - AUC Validation: {val_auc:.4f}")
        print(f"  - AUC Test: {test_auc:.4f}")
        print(f"  - Gap Train-Val: {train_val_gap:.4f}")
        print(f"  - Gap Train-Test: {train_test_gap:.4f}")

        if train_val_gap > 0.05:
            print("  ⚠️  Possibile overfitting (gap > 0.05)")
            print("  💡 Considera: early stopping più aggressivo, regolarizzazione, meno iterazioni")
        elif train_val_gap < 0.01:
            print("  ✅ Buon bilanciamento bias-variance")
        else:
            print("  ⚖️  Overfitting moderato, accettabile")

    def save_model(self, filepath):
        """Write the trained booster to `filepath` in LightGBM's model format."""
        if self.best_model is None:
            print("⚠️  Nessun modello da salvare!")
            return

        try:
            # A model restored by load_model is already a Booster
            booster = getattr(self.best_model, 'booster_', self.best_model)
            booster.save_model(filepath)
            print(f"✅ Modello salvato in: {filepath}")
        except Exception as e:
            print(f"❌ Errore nel salvataggio: {e}")

    def load_model(self, filepath):
        """
        Load a saved model into `best_model`.

        NOTE(review): the loaded object is a raw `lgb.Booster`, not an
        `LGBMClassifier`; methods that call `predict_proba` (e.g.
        evaluate_model) will not work with it.
        """
        try:
            self.best_model = lgb.Booster(model_file=filepath)
            print(f"✅ Modello caricato da: {filepath}")
        except Exception as e:
            print(f"❌ Errore nel caricamento: {e}")

    def run_complete_pipeline(self, n_iter=50, cv_folds=5):
        """
        Run every stage in order and return `(best_model, best_params, scores)`.

        Args:
            n_iter: number of settings sampled by the hyper-parameter search
            cv_folds: number of cross-validation folds used by the search
        """
        print("🚀 AVVIO PIPELINE COMPLETA LIGHTGBM")
        print("=" * 50)

        # 1. Preprocessing
        self.preprocess_data()

        # 2. Hyper-parameter search
        self.hyperparameter_search(n_iter=n_iter, cv_folds=cv_folds)

        # 3. Final training
        self.train_final_model()

        # 4. Evaluation
        scores = self.evaluate_model()

        # 5. Model information
        self.get_model_complexity_info()

        # 6. Overfitting analysis
        self.analyze_overfitting()

        # 7. Detailed reports
        self.get_classification_reports()

        # 8. Plots
        self.plot_feature_importance()
        self.plot_training_history()
        self.plot_confusion_matrices()

        # 9. Tree plot (optional, needs graphviz)
        print("\n🌳 Visualizzazione struttura albero (può richiedere graphviz)...")
        self.plot_tree_structure(tree_index=0)

        print("\n🎉 PIPELINE COMPLETATA CON SUCCESSO!")
        print(f"🏆 Performance finale (Test Set): AUC = {self.test_scores['roc_auc']:.4f}")

        return self.best_model, self.best_params, scores

# UTILITY FUNCTIONS PER LIGHTGBM
def compare_lgb_importance_types(model, feature_names, top_n=15):
    """
    Plot LightGBM's 'split' and 'gain' feature importance side by side.

    Args:
        model: fitted model accepted by `lgb.plot_importance`
        feature_names: not used by the function; kept so existing calls keep working
        top_n: maximum number of features shown per panel
    """
    importance_types = ['split', 'gain']

    fig, axes = plt.subplots(1, len(importance_types), figsize=(15, 8))
    if len(importance_types) == 1:
        # A single panel comes back as a bare Axes rather than a sequence
        axes = [axes]

    for ax, imp_type in zip(axes, importance_types):
        try:
            lgb.plot_importance(model, max_num_features=top_n,
                                importance_type=imp_type, ax=ax)
            ax.set_title(f'Feature Importance ({imp_type})')
        except Exception:
            # Best effort: a failing panel shows a placeholder instead of
            # aborting the figure. Only Exception is caught so that Ctrl-C
            # and SystemExit still propagate.
            ax.text(0.5, 0.5, f'Errore plot {imp_type}',
                    ha='center', va='center', transform=ax.transAxes)

    plt.tight_layout()
    plt.show()

def lgb_hyperparameter_ranges_analysis(pipeline):
    """
    Print the tuned value of each known hyper-parameter with a one-line description.

    Parameters missing from `pipeline.best_params` are skipped; when no
    parameters are available a warning is printed and nothing else happens.
    """
    best = pipeline.best_params
    if best is None:
        print("⚠️  Nessun parametro da analizzare!")
        return

    print("\n📊 ANALISI IPERPARAMETRI OTTIMALI:")
    print("=" * 40)

    # (parameter name, description) pairs, listed in the order they are printed
    described_params = (
        ('n_estimators', 'Numero di alberi nel boosting'),
        ('learning_rate', 'Tasso di apprendimento (eta)'),
        ('max_depth', 'Profondità massima alberi'),
        ('num_leaves', 'Numero max foglie per albero'),
        ('min_child_samples', 'Campioni minimi per foglia'),
        ('subsample', 'Frazione campioni per training'),
        ('colsample_bytree', 'Frazione features per albero'),
        ('reg_alpha', 'Regolarizzazione L1'),
        ('reg_lambda', 'Regolarizzazione L2'),
    )

    for name, meaning in described_params:
        if name not in best:
            continue
        print(f"  - {name}: {best[name]}")
        print(f"    └─ {meaning}")



    


In [None]:
class NetCDFPreprocessor:
    """Turns a directory of NetCDF files into a sampled sample matrix plus labels.

    `preprocess` handles one open dataset; `process_all_files` loops over the
    directory, writes per-chunk and final parquet files, and returns the
    sampled arrays.
    """

    # Flattened row length a file must produce to be kept by process_all_files.
    EXPECTED_ROW_LENGTH = 200
    # Directory receiving the per-chunk and the final parquet files.
    OUTPUT_DIR = 'processed_data/multiclass'

    def __init__(self, root_dir):
        """List the entries of `root_dir`; non-`.nc` names are skipped later."""
        self.root_dir = root_dir
        self.netcdf_file_list = os.listdir(root_dir)

    def preprocess(self, f):
        """
        Extract the usable rows and their labels from one open dataset.

        Args:
            f: object exposing a `variables` mapping (e.g. a `netCDF4.Dataset`).

        Returns:
            (fit_data, label_data): `fit_data` is a float32 array with one
            flattened `raw_counts` map per kept entry; `label_data` is a list
            with the matching `sp_surface_type` values.
        """
        raw_counts = np.array(f.variables['raw_counts'])

        # Distance between the specular point and the aircraft. Broadcasting
        # the altitude column replaces the former hard-coded repeat of 20.
        ac_alt = np.array(f.variables['ac_alt'])[:, np.newaxis]
        dist = (ac_alt - f.variables['sp_alt'][:]) / np.cos(np.deg2rad(f.variables['sp_inc_angle'][:]))

        copol = f.variables['sp_rx_gain_copol'][:]
        xpol = f.variables['sp_rx_gain_xpol'][:]
        snr = f.variables['ddm_snr'][:]

        # Keep entries with both gains >= 5, SNR > 0, distance within
        # [2000, 10000] and no NaN in any of the four quantities.
        keep_mask = (
            (copol >= 5) & (xpol >= 5) & (snr > 0)
            & ((dist >= 2000) & (dist <= 10000))
            & (~np.isnan(np.ma.getdata(copol)) & ~np.isnan(np.ma.getdata(xpol))
               & ~np.isnan(np.ma.getdata(snr)) & ~np.isnan(np.ma.getdata(dist)))
        )
        # Masked entries count as "do not keep", matching what np.argwhere
        # did on the masked array in the previous element-wise version.
        keep = np.ma.filled(keep_mask, False)

        # Rejected entries stay NaN so the all-finite test below drops them.
        raw_counts_filtered = np.full(raw_counts.shape, np.nan, dtype=np.float32)
        raw_counts_filtered[keep] = raw_counts[keep]

        n_entries = raw_counts_filtered.shape[0] * raw_counts_filtered.shape[1]
        ddm_maps = raw_counts_filtered.reshape(
            n_entries, raw_counts_filtered.shape[2], raw_counts_filtered.shape[3]
        )
        keep_indices = np.where(
            np.all(~np.isnan(ddm_maps), axis=(1, 2)) & (np.sum(ddm_maps, axis=(1, 2)) > 0)
        )[0]

        row_length = ddm_maps.shape[1] * ddm_maps.shape[2]
        fit_data = ddm_maps.reshape(n_entries, row_length)[keep_indices]

        surface_types = np.nan_to_num(f.variables["sp_surface_type"][:], nan=0)
        # Fancy indexing replaces the former per-element `l in keep_indices` scan.
        label_data = list(surface_types.ravel()[keep_indices])

        return fit_data, label_data

    @staticmethod
    def _write_parquet(frame, path):
        """Write `frame` to `path` as zstd-compressed parquet without the index."""
        table = pa.Table.from_pandas(frame, preserve_index=False)
        pq.write_table(table, path, compression='zstd', use_dictionary=True)

    def process_all_files(self, chunk_size: int = 100, sample_fraction: float = 0.2, remove_chunks: bool = True):
        """
        Preprocess every `.nc` file, sample each chunk and persist the result.

        For each chunk all samples of the least frequent class(es) are kept
        and the remaining classes are sub-sampled with a stratified split.

        Args:
            chunk_size: number of files grouped into one chunk (the code
                slices the per-file list, not individual samples).
            sample_fraction: fraction of the non-rare samples kept per chunk.
            remove_chunks: delete the intermediate chunk parquet files at the end.

        Returns:
            (data, labels): stacked sampled rows and their labels.

        Raises:
            ValueError: if `chunk_size` is not positive or no file yields usable data.
        """
        # Imported here because the notebook's import cell does not provide it.
        from collections import Counter

        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

        full_data = []
        full_labels = []
        for file_name in tqdm(self.netcdf_file_list, desc="Processing files"):
            if not file_name.endswith('.nc'):
                continue
            try:
                # The context manager closes the dataset even when preprocess fails.
                with netCDF4.Dataset(os.path.join(self.root_dir, file_name)) as f:
                    data, labels = self.preprocess(f)
            except Exception as e:
                print(f"Error processing file {file_name}: {e}")
                continue
            full_data.append(data)
            full_labels.append(labels)

        # Keep only files that produced non-empty rows of the expected length
        valid_indices = [
            i for i, arr in enumerate(full_data)
            if arr.ndim == 2 and arr.shape[0] > 0 and arr.shape[1] == self.EXPECTED_ROW_LENGTH
        ]
        full_data_clean = [full_data[i] for i in valid_indices]
        full_labels_clean = [full_labels[i] for i in valid_indices]
        # Released once, before the loop: deleting inside the loop failed on
        # the second chunk because the names were already gone.
        del full_data, full_labels

        if not full_data_clean:
            raise ValueError(f"No usable NetCDF data found in {self.root_dir!r}")

        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

        full_data_sampled = []
        full_labels_sampled = []

        num_chunks = int(np.ceil(len(full_data_clean) / chunk_size))
        for idx in range(num_chunks):
            start = idx * chunk_size
            end = min((idx + 1) * chunk_size, len(full_data_clean))
            chunk_data = np.vstack(full_data_clean[start:end])
            chunk_labels = np.hstack(full_labels_clean[start:end])
            print(f"Distribuzione etichette nel chunk: {Counter(chunk_labels)}")

            print(f"Chunk {idx + 1}/{num_chunks} processed with shape {chunk_data.shape} and labels shape {chunk_labels.shape}")
            print('-' * 50)

            # Persist every chunk as its own pair of parquet files
            self._write_parquet(
                pd.DataFrame(chunk_data),
                f'{self.OUTPUT_DIR}/fit_data_chunk_{idx}.parquet',
            )
            self._write_parquet(
                pd.DataFrame(chunk_labels, columns=['label']),
                f'{self.OUTPUT_DIR}/labels_chunk_{idx}.parquet',
            )

            # Least represented class(es) of this chunk
            label_counts = Counter(chunk_labels)
            min_count = min(label_counts.values())
            rare_classes = [cls for cls, count in label_counts.items() if count == min_count]

            # Keep every sample of the rare classes
            rare_indices = np.isin(chunk_labels, rare_classes)
            X_rare = chunk_data[rare_indices]
            y_rare = chunk_labels[rare_indices]

            # Sub-sample the remaining classes
            other_indices = ~rare_indices
            X_other = chunk_data[other_indices]
            y_other = chunk_labels[other_indices]

            if y_other.size == 0:
                # Every class ties for rarest: nothing left to sub-sample
                X_sampled, y_sampled = X_rare, y_rare
            else:
                _, X_other_sampled, _, y_other_sampled = train_test_split(
                    X_other, y_other,
                    test_size=sample_fraction,
                    stratify=y_other,
                    random_state=42
                )
                X_sampled = np.vstack([X_rare, X_other_sampled])
                y_sampled = np.hstack([y_rare, y_other_sampled])

            print(f"Distribuzione etichette prima del campionamento nel chunk {idx + 1}: {Counter(chunk_labels)}")
            print(f"Distribuzione etichette dopo campionamento nel chunk {idx + 1}: {Counter(y_sampled)}")

            full_data_sampled.append(X_sampled)
            full_labels_sampled.append(y_sampled)

        full_data_sampled_stratified = np.vstack(full_data_sampled)
        full_labels_sampled_stratified = np.hstack(full_labels_sampled)

        print(f"Distribuzione totale etichette dopo stratificazione: {Counter(full_labels_sampled_stratified)}")

        print(f"Shape of sampled data after chunking and sampling: {np.array(full_data_sampled_stratified).shape}")
        print(f"Shape of sampled labels after chunking and sampling: {np.array(full_labels_sampled_stratified).shape}")

        # Final dataset, read back by the notebook's cache branch
        self._write_parquet(
            pd.DataFrame(full_data_sampled_stratified),
            f'{self.OUTPUT_DIR}/fit_data_multiclass.parquet',
        )
        self._write_parquet(
            pd.DataFrame(full_labels_sampled_stratified, columns=['label']),
            f'{self.OUTPUT_DIR}/labels_multiclass.parquet',
        )

        # Remove all chunk parquet files if the flag is set
        if remove_chunks:
            try:
                for fname in os.listdir(self.OUTPUT_DIR):
                    if fname.startswith('fit_data_chunk_') or fname.startswith('labels_chunk_'):
                        os.remove(os.path.join(self.OUTPUT_DIR, fname))
                print("All chunk files removed.")
            except Exception as e:
                print(f"Error removing chunk files: {e}")

        return full_data_sampled_stratified, full_labels_sampled_stratified

In [None]:
class DDMFeatureExtractor:
    def __init__(self):
        pass
    
    def create_ddm_features_MORE(self, fit_data: np.ndarray) -> pd.DataFrame:
        

        """
        Estrae features dettagliate da raw_counts DDM (shape: n_samples x 200)
        """

        def gini(array):
            """Calcola il coefficiente di Gini (disuguaglianza)"""
            array = np.sort(array)
            index = np.arange(1, array.shape[0] + 1)
            return (np.sum((2 * index - array.shape[0] - 1) * array)) / (array.shape[0] * np.sum(array))
        

        features = []

        for row in tqdm(fit_data, desc="Extracting DDM features"):
            f = {}
            x = np.array(row, dtype=np.float32) + 1e-10  # evita log(0)

            # 1. Statistiche base
            f['mean'] = np.mean(x)
            f['std'] = np.std(x)
            f['min'] = np.min(x)
            f['max'] = np.max(x)
            f['median'] = np.median(x)
            f['range'] = np.max(x) - np.min(x)
            f['skew'] = skew(x)
            f['kurtosis'] = kurtosis(x)
            f['entropy'] = entropy(x)
            f['gini'] = gini(x)

            # 2. Posizionali
            f['peak_index'] = np.argmax(x)
            f['peak_value'] = np.max(x)
            f['center_of_mass'] = np.sum(np.arange(len(x)) * x) / np.sum(x)
            f['inertia'] = np.sum(((np.arange(len(x)) - f['center_of_mass'])**2) * x)

            # 3. Segmentazione
            thirds = np.array_split(x, 3)
            for i, part in enumerate(thirds):
                f[f'sum_third_{i+1}'] = np.sum(part)
                f[f'mean_third_{i+1}'] = np.mean(part)
                f[f'max_third_{i+1}'] = np.max(part)
            
            windows = np.array_split(x, 5)
            for i, w in enumerate(windows):
                f[f'mean_w{i+1}'] = np.mean(w)
                f[f'std_w{i+1}'] = np.std(w)
                f[f'max_w{i+1}'] = np.max(w)

            # 4. Derivate e cambiamenti
            dx = np.diff(x)
            f['mean_diff'] = np.mean(dx)
            f['std_diff'] = np.std(dx)
            f['max_diff'] = np.max(dx)
            f['min_diff'] = np.min(dx)
            f['n_positive_diff'] = np.sum(dx > 0)
            f['n_negative_diff'] = np.sum(dx < 0)
            f['n_zero_diff'] = np.sum(dx == 0)

            # 5. Autocorrelazioni (lag 1-3)
            for lag in range(1, 4):
                ac = np.corrcoef(x[:-lag], x[lag:])[0, 1] if len(x) > lag else np.nan
                f[f'autocorr_lag{lag}'] = ac

            # 6. FFT (spettro frequenze)
            spectrum = np.abs(fft(x))
            half_spectrum = spectrum[:len(spectrum)//2]  # simmetrico
            f['fft_peak_freq'] = np.argmax(half_spectrum)
            f['fft_max'] = np.max(half_spectrum)
            f['fft_median'] = np.median(half_spectrum)
            f['fft_mean'] = np.mean(half_spectrum)

            features.append(f)

        return pd.DataFrame(features)

In [None]:
# Directory holding the raw NetCDF files; only read when the cache is bypassed.
ROOT_DIR = 'D:/data/RONGOWAI_L1_SDR_V1.0/'

# True  -> load the sampled dataset cached as parquet by a previous run.
# False -> rebuild it from the raw NetCDF files.
read_from_backup = True
if read_from_backup:
    # Read the cached parquet files (with pandas, despite the `_pl` suffix)
    fit_data_pl = pd.read_parquet('processed_data/multiclass/fit_data_multiclass.parquet')
    labels_pl = pd.read_parquet('processed_data/multiclass/labels_multiclass.parquet')

    # Convert to numpy arrays
    fit_data = fit_data_pl.to_numpy()
    labels = labels_pl['label'].to_numpy()
else:
    # Without this branch `fit_data` / `labels` were never defined and the
    # following cells failed with NameError.
    # NOTE(review): chunk_size and sample_fraction are placeholder values -
    # confirm the settings used to produce the cached files.
    fit_data, labels = NetCDFPreprocessor(ROOT_DIR).process_all_files(
        chunk_size=100,
        sample_fraction=0.2,
        remove_chunks=True,
    )

In [None]:
# Compute the summary features for every row and append them as extra
# columns to the right of the original values.
features_extractor = DDMFeatureExtractor()
ddm_features = features_extractor.create_ddm_features_MORE(fit_data)
fit_data_with_features = np.concatenate((fit_data, ddm_features.to_numpy()), axis=1)
fit_data_with_features.shape

In [None]:
# Build the pipeline from the loaded samples and labels.
# NOTE(review): the raw `fit_data` is passed here, while the cell above
# computes `fit_data_with_features` and nothing visible uses it - confirm
# which matrix is intended. The labels are read from a file named
# "multiclass", whereas the pipeline is configured for a binary objective -
# confirm the labels hold exactly two classes.
pipeline = LightGBMBinaryPipeline(fit_data, labels)

# Run every stage; n_iter is kept small for a quicker test run.
best_model, best_params, final_scores = pipeline.run_complete_pipeline(cv_folds=5, n_iter=30)

print(f"\n🎯 RISULTATI FINALI:")
print(f"Migliori parametri: {best_params}")
print(f"Score finale: {final_scores}")

# Additional analysis of the tuned hyper-parameters
lgb_hyperparameter_ranges_analysis(pipeline)

# Side-by-side comparison of importance types (needs named columns)
train_columns = getattr(pipeline.X_train, 'columns', None)
if train_columns is not None:
    compare_lgb_importance_types(best_model, train_columns)

# Optional: persist the model
# pipeline.save_model('best_lightgbm_model.txt')