In [2]:
import netCDF4
import numpy as np
import pandas as pd
import geopandas as gpd

from datetime import datetime, timedelta
from tqdm import tqdm
from shapely.geometry import Point

from scipy.stats import skew, kurtosis, entropy

from scipy.fft import fft
from sklearn.preprocessing import MinMaxScaler


import os

from pycaret.classification import *

import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
import catboost as cb

In [3]:
class DDMFeatureExtractor:
    """Derives hand-crafted summary features from flattened DDM raw-count vectors."""

    def __init__(self):
        pass

    def create_ddm_features_MORE(self, fit_data: np.ndarray) -> pd.DataFrame:
        """
        Extract detailed per-sample features from raw DDM counts.

        Args:
            fit_data: 2-D array-like with one sample per row (the original
                notes cite a shape of n_samples x 200). A single 1-D numpy
                vector is accepted and treated as one sample.

        Returns:
            DataFrame with one row per sample. Columns, in order: basic
            statistics, positional descriptors, per-third and per-window
            aggregates, first-difference statistics, lag 1-3
            autocorrelations and FFT magnitude summaries.
        """
        # A lone 1-D vector would otherwise be iterated scalar by scalar and
        # fail on len() of a 0-d array; promote it to a one-row matrix.
        if isinstance(fit_data, np.ndarray) and fit_data.ndim == 1:
            fit_data = fit_data[np.newaxis, :]

        # The position vector and the Gini weights depend only on the row
        # length, so they are built once per distinct length instead of once
        # (or twice) per row.
        index_cache = {}
        features = []

        for row in tqdm(fit_data, desc="Extracting DDM features"):
            # Small offset: the original intent is to avoid log(0); it also
            # keeps the sums used as denominators non-zero for all-zero rows.
            x = np.array(row, dtype=np.float32) + 1e-10

            n = len(x)
            if n not in index_cache:
                index_cache[n] = (np.arange(n), 2 * np.arange(1, n + 1) - n - 1)
            positions, gini_weights = index_cache[n]

            features.append(self._row_features(x, positions, gini_weights))

        return pd.DataFrame(features)

    @staticmethod
    def _row_features(x, positions, gini_weights):
        """Compute the feature dict for one sample vector `x`.

        `positions` is 0..len(x)-1 and `gini_weights` is 2*i - len(x) - 1 for
        i = 1..len(x); both are supplied by the caller so they can be reused.
        """
        n = len(x)
        f = {}

        # 1. Basic statistics
        lowest = np.min(x)
        highest = np.max(x)
        f['mean'] = np.mean(x)
        f['std'] = np.std(x)
        f['min'] = lowest
        f['max'] = highest
        f['median'] = np.median(x)
        f['range'] = highest - lowest
        f['skew'] = skew(x)
        f['kurtosis'] = kurtosis(x)
        f['entropy'] = entropy(x)
        # Gini coefficient (inequality of the values), computed on sorted data
        ordered = np.sort(x)
        f['gini'] = np.sum(gini_weights * ordered) / (n * np.sum(ordered))

        # 2. Positional descriptors
        f['peak_index'] = np.argmax(x)
        f['peak_value'] = highest
        f['center_of_mass'] = np.sum(positions * x) / np.sum(x)
        # Second moment about the centre of mass (not normalised by the total)
        f['inertia'] = np.sum(((positions - f['center_of_mass'])**2) * x)

        # 3. Segment aggregates: three and five near-equal contiguous chunks
        for i, part in enumerate(np.array_split(x, 3)):
            f[f'sum_third_{i+1}'] = np.sum(part)
            f[f'mean_third_{i+1}'] = np.mean(part)
            f[f'max_third_{i+1}'] = np.max(part)

        for i, w in enumerate(np.array_split(x, 5)):
            f[f'mean_w{i+1}'] = np.mean(w)
            f[f'std_w{i+1}'] = np.std(w)
            f[f'max_w{i+1}'] = np.max(w)

        # 4. First differences
        dx = np.diff(x)
        f['mean_diff'] = np.mean(dx)
        f['std_diff'] = np.std(dx)
        f['max_diff'] = np.max(dx)
        f['min_diff'] = np.min(dx)
        f['n_positive_diff'] = np.sum(dx > 0)
        f['n_negative_diff'] = np.sum(dx < 0)
        f['n_zero_diff'] = np.sum(dx == 0)

        # 5. Autocorrelation at lags 1-3 (NaN when the vector is too short)
        for lag in range(1, 4):
            ac = np.corrcoef(x[:-lag], x[lag:])[0, 1] if n > lag else np.nan
            f[f'autocorr_lag{lag}'] = ac

        # 6. FFT magnitude spectrum; only the first half is kept because the
        #    spectrum of a real signal is symmetric
        spectrum = np.abs(fft(x))
        half_spectrum = spectrum[:len(spectrum)//2]
        f['fft_peak_freq'] = np.argmax(half_spectrum)
        f['fft_max'] = np.max(half_spectrum)
        f['fft_median'] = np.median(half_spectrum)
        f['fft_mean'] = np.mean(half_spectrum)

        return f

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import catboost as cb
from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform, randint
import warnings
# NOTE(review): called without a category or module filter, so every warning
# is hidden for the rest of the session — consider narrowing the filter.
warnings.filterwarnings('ignore')

class CatBoostBinaryPipeline:
    """
    End-to-end workflow for a binary CatBoost classifier: stratified
    train/validation/test split, randomized hyper-parameter search, final
    training against a validation set, metric evaluation and diagnostic plots.
    """

    def __init__(self, fit_data, labels_data, test_size=0.2, val_size=0.2, random_state=42):
        """
        Store the raw inputs and the split configuration.

        Args:
            fit_data: DataFrame or array holding the features.
            labels_data: Series or array holding the labels.
            test_size: size of the test set (passed to train_test_split).
            val_size: size of the validation set, taken from what remains
                after the test split.
            random_state: seed for reproducibility.
        """
        self.fit_data = fit_data
        self.labels_data = labels_data
        self.test_size = test_size
        self.val_size = val_size
        self.random_state = random_state

        # Model and results
        self.best_model = None
        self.best_params = None
        self.train_scores = {}
        self.val_scores = {}
        self.test_scores = {}

        # Split data, filled in by preprocess_data()
        self.X_train = None
        self.X_val = None
        self.X_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None

        # Names of the categorical feature columns
        self.categorical_features = []

    # ------------------------------------------------------------------ helpers
    def _ensure_split(self):
        """Run preprocess_data() on demand so steps can be called individually."""
        if self.X_train is None:
            self.preprocess_data()

    def _model_ready(self):
        """Return True when a model exists; otherwise print the warning and return False."""
        if self.best_model is None:
            print("⚠️  Modello non ancora addestrato!")
            return False
        return True

    def _splits(self):
        """Return the (name, X, y) triples for train, validation and test, in that order."""
        return [
            ('train', self.X_train, self.y_train),
            ('validation', self.X_val, self.y_val),
            ('test', self.X_test, self.y_test),
        ]

    # ------------------------------------------------------------------- steps
    def preprocess_data(self):
        """Detect categorical columns and split the data into train/val/test."""
        print("🔄 Preprocessing dei dati...")

        # Convert to DataFrame if needed
        if not isinstance(self.fit_data, pd.DataFrame):
            self.fit_data = pd.DataFrame(self.fit_data)

        # Columns of dtype object or category are treated as categorical
        self.categorical_features = [
            col for col in self.fit_data.columns
            if self.fit_data[col].dtype == 'object' or self.fit_data[col].dtype.name == 'category'
        ]

        print(f"📊 Feature categoriche identificate: {self.categorical_features}")

        # First split: hold out the test set
        X_temp, self.X_test, y_temp, self.y_test = train_test_split(
            self.fit_data, self.labels_data,
            test_size=self.test_size,
            stratify=self.labels_data,
            random_state=self.random_state
        )

        # Second split: carve the validation set out of the remainder
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_temp, y_temp,
            test_size=self.val_size,
            stratify=y_temp,
            random_state=self.random_state
        )

        print(f"📈 Dimensioni datasets:")
        print(f"  - Train: {self.X_train.shape}")
        print(f"  - Validation: {self.X_val.shape}")
        print(f"  - Test: {self.X_test.shape}")

        # Class balance of the training split
        print(f"📊 Distribuzione classi (Train): {pd.Series(self.y_train).value_counts().to_dict()}")

    def define_search_space(self):
        """
        Return the hyper-parameter distributions for RandomizedSearchCV.

        scipy's ``uniform(loc, scale)`` samples from [loc, loc + scale] and
        ``randint(low, high)`` from the integers low..high-1.

        `bagging_temperature` is deliberately not sampled: CatBoost only
        accepts it with the Bayesian bootstrap, which in turn does not accept
        `subsample`, so sampling both makes every candidate fail. The
        bootstrap type is pinned to Bernoulli, which supports `subsample`.
        """
        param_distributions = {
            'iterations': randint(100, 1000),
            'learning_rate': uniform(0.01, 0.2),   # 0.01 to 0.21
            'depth': randint(3, 10),               # 3 to 9
            'l2_leaf_reg': uniform(1, 10),         # 1 to 11
            'border_count': randint(32, 255),
            'random_strength': uniform(0, 10),
            'bootstrap_type': ['Bernoulli'],       # single choice, required by subsample
            'subsample': uniform(0.6, 0.4),        # 0.6 to 1.0
        }
        return param_distributions

    def hyperparameter_search(self, n_iter=50, cv_folds=5, scoring='roc_auc'):
        """
        Run the randomized hyper-parameter search on the training split.

        Stores the winning parameters in `best_params` and the refitted
        estimator in `best_model`.

        Args:
            n_iter: number of parameter settings sampled by RandomizedSearchCV.
            cv_folds: number of stratified cross-validation folds.
            scoring: scoring metric passed to RandomizedSearchCV.
        """
        self._ensure_split()
        print(f"🔍 Avvio ricerca iperparametri ({n_iter} iterazioni, {cv_folds}-fold CV)...")

        # Base model; the sampled parameters are set on clones of it
        base_model = CatBoostClassifier(
            random_state=self.random_state,
            verbose=False,
            cat_features=self.categorical_features,
            eval_metric='AUC',
            early_stopping_rounds=50
        )

        param_distributions = self.define_search_space()

        # Stratified cross-validation
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.random_state)

        random_search = RandomizedSearchCV(
            estimator=base_model,
            param_distributions=param_distributions,
            n_iter=n_iter,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            random_state=self.random_state,
            verbose=1
        )

        random_search.fit(self.X_train, self.y_train)

        # Keep the best parameters and the estimator refitted with them
        self.best_params = random_search.best_params_
        self.best_model = random_search.best_estimator_

        print(f"✅ Ricerca completata!")
        print(f"🏆 Migliore score CV: {random_search.best_score_:.4f}")
        print(f"🎯 Migliori parametri:")
        for param, value in self.best_params.items():
            print(f"  - {param}: {value}")

    def train_final_model(self):
        """
        Fit `best_model` on the training split with the validation split as
        eval set. Falls back to a default CatBoostClassifier when no search
        has been run.
        """
        self._ensure_split()
        print("🚀 Addestramento modello finale...")

        if self.best_params is None:
            print("⚠️  Nessun iperparametro trovato. Uso parametri di default.")
            self.best_model = CatBoostClassifier(
                random_state=self.random_state,
                verbose=False,
                cat_features=self.categorical_features,
                eval_metric='AUC'
            )

        # CatBoost Pool objects for the two splits
        train_pool = Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_features
        )

        val_pool = Pool(
            data=self.X_val,
            label=self.y_val,
            cat_features=self.categorical_features
        )

        # The validation pool is the eval set. early_stopping_rounds is only
        # configured on the model coming from hyperparameter_search(); the
        # default fallback model above does not set it.
        self.best_model.fit(
            train_pool,
            eval_set=val_pool,
            verbose=100,
            plot=False
        )

        print("✅ Addestramento completato!")

    def evaluate_model(self):
        """
        Score the model on the train, validation and test splits.

        Returns:
            dict mapping 'train' / 'validation' / 'test' to a dict with
            accuracy, precision, recall, f1 and roc_auc.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        if self.best_model is None:
            raise RuntimeError(
                "Modello non ancora addestrato: eseguire hyperparameter_search() "
                "o train_final_model() prima di evaluate_model()."
            )

        print("📊 Valutazione del modello...")

        all_scores = {}

        for name, X, y in self._splits():
            # Hard predictions and positive-class probabilities
            y_pred = self.best_model.predict(X)
            y_pred_proba = self.best_model.predict_proba(X)[:, 1]

            scores = {
                'accuracy': accuracy_score(y, y_pred),
                'precision': precision_score(y, y_pred),
                'recall': recall_score(y, y_pred),
                'f1': f1_score(y, y_pred),
                'roc_auc': roc_auc_score(y, y_pred_proba)
            }

            all_scores[name] = scores

            print(f"\n📈 Metriche {name.upper()}:")
            for metric, value in scores.items():
                print(f"  - {metric.upper()}: {value:.4f}")

        # Keep the per-split results on the instance
        self.train_scores = all_scores['train']
        self.val_scores = all_scores['validation']
        self.test_scores = all_scores['test']

        return all_scores

    def plot_feature_importance(self, top_n=20):
        """Plot the `top_n` most important features as a horizontal bar chart."""
        if not self._model_ready():
            return

        # Use the DataFrame column names when available, generic names otherwise
        feature_names = self.X_train.columns if hasattr(self.X_train, 'columns') else [f'feature_{i}' for i in range(self.X_train.shape[1])]
        importance = self.best_model.get_feature_importance()

        fi_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False).head(top_n)

        plt.figure(figsize=(10, 8))
        sns.barplot(data=fi_df, x='importance', y='feature')
        plt.title(f'Top {top_n} Feature Importance')
        plt.xlabel('Importance Score')
        plt.tight_layout()
        plt.show()

    def plot_confusion_matrices(self):
        """Draw the confusion matrices for the three splits side by side."""
        if not self._model_ready():
            return

        fig, axes = plt.subplots(1, 3, figsize=(15, 4))

        for idx, (name, X, y) in enumerate(self._splits()):
            y_pred = self.best_model.predict(X)
            cm = confusion_matrix(y, y_pred)

            sns.heatmap(cm, annot=True, fmt='d', ax=axes[idx], cmap='Blues')
            axes[idx].set_title(f'Confusion Matrix - {name.capitalize()}')
            axes[idx].set_xlabel('Predicted')
            axes[idx].set_ylabel('Actual')

        plt.tight_layout()
        plt.show()

    def get_classification_reports(self):
        """Print the detailed classification report for each split."""
        if not self._model_ready():
            return

        for name, X, y in self._splits():
            y_pred = self.best_model.predict(X)
            print(f"\n📋 CLASSIFICATION REPORT - {name.upper()}")
            print("=" * 50)
            print(classification_report(y, y_pred))

    def run_complete_pipeline(self, n_iter=50, cv_folds=5):
        """
        Run every step in order: preprocessing, hyper-parameter search, final
        training, evaluation, reports and plots.

        Args:
            n_iter: number of sampled settings for the search.
            cv_folds: number of cross-validation folds for the search.

        Returns:
            Tuple (best_model, best_params, scores) where `scores` is the
            dict returned by evaluate_model().
        """
        print("🚀 AVVIO PIPELINE COMPLETA CATBOOST")
        print("=" * 50)

        # 1. Preprocessing
        self.preprocess_data()

        # 2. Hyper-parameter search
        self.hyperparameter_search(n_iter=n_iter, cv_folds=cv_folds)

        # 3. Final training
        self.train_final_model()

        # 4. Evaluation
        scores = self.evaluate_model()

        # 5. Detailed reports
        self.get_classification_reports()

        # 6. Plots
        self.plot_feature_importance()
        self.plot_confusion_matrices()

        print("\n🎉 PIPELINE COMPLETATA CON SUCCESSO!")
        print(f"🏆 Performance finale (Test Set): AUC = {self.test_scores['roc_auc']:.4f}")

        return self.best_model, self.best_params, scores

In [None]:
# Root folder of the raw dataset.
# NOTE(review): absolute local path, and not referenced by any cell visible here.
ROOT_DIR= 'D:/data/RONGOWAI_L1_SDR_V1.0/'

# When True, load the cached feature matrix and labels from parquet.
# NOTE(review): when False nothing below defines `fit_data` / `labels`, so
# later cells fail with NameError unless they are defined elsewhere.
read_from_backup = True
if read_from_backup:
    #import polars as pl

    # Read the parquet files (with pandas; the `_pl` suffix and the
    # commented-out import above look like leftovers from a polars version)
    fit_data_pl = pd.read_parquet('processed_data/multiclass/fit_data_multiclass.parquet')
    labels_pl = pd.read_parquet('processed_data/multiclass/labels_multiclass.parquet')

    # Convert to numpy arrays
    fit_data = fit_data_pl.to_numpy()
    labels = labels_pl['label'].to_numpy()

In [10]:
# Build the hand-crafted feature table, one row per sample in `fit_data`.
# NOTE(review): the recorded run below was interrupted at 11% of ~1.8M rows
# with about 17 minutes still projected — consider caching the result to disk.
features_extractor = DDMFeatureExtractor()
ddm_features = features_extractor.create_ddm_features_MORE(fit_data)

Extracting DDM features:  11%|█         | 190424/1795932 [02:03<17:20, 1542.55it/s]


KeyboardInterrupt: 

In [None]:
# Append the engineered features as extra columns to the raw DDM values.
# Requires `fit_data` and `ddm_features` from the loading and extraction
# cells; the recorded NameError below comes from running this without them.
fit_data_with_features = np.hstack([fit_data, ddm_features.values])
fit_data_with_features.shape

NameError: name 'fit_data' is not defined

In [None]:
# Run the full search/train/evaluate pipeline.
# NOTE(review): this passes the raw `fit_data`, not `fit_data_with_features`,
# so the engineered DDM features are not used — confirm this is intended.
# NOTE(review): `labels` is read from a file named "labels_multiclass" while
# the pipeline is binary — confirm the labels contain exactly two classes.
pipeline = CatBoostBinaryPipeline(fit_data, labels)
best_model, best_params, scores = pipeline.run_complete_pipeline()