In [2]:
import os

import pandas as pd
from oversampling_multiclass_extra import (
    split_dataset,
    apply_smote_multiclass,
    lightgbm_plot_confusion_matrices,
    train_lightgbm_multiclass_extra
)


Esperimento con il modello LightGBM e il secondo set di feature

In [3]:
# Location of the exp1 feature table.
# The original absolute path is kept as the default so the existing setup keeps
# working unchanged; set the EXP1_FEATURES_CSV environment variable to point the
# notebook at a copy of the file on another machine.
csv_file_path_exp1 = os.environ.get(
    'EXP1_FEATURES_CSV',
    'C:/Users/frees/Documents/GitHub/UnderwaterClassification/features_extraction/exp1_audio_features.csv',
)
# Load the whole table; filtering happens in the following cell.
df_exp1 = pd.read_csv(csv_file_path_exp1)

In [4]:
# Strip the file extension from 'File Name'.
# Only a trailing ".xxx" (a dot followed by three alphanumerics) is removed. On such
# names this gives the same result as the previous `.str[:-4]` slice, but running the
# cell again no longer chops four more characters off already-cleaned names.
# NOTE(review): assumes every file name carries a three-character extension — confirm.
df_exp1['File Name'] = df_exp1['File Name'].str.replace(r'\.[A-Za-z0-9]{3}$', '', regex=True)
# Keep only the rows whose 'Class' is 'Target'.
df_exp1 = df_exp1[df_exp1['Class'] == 'Target'].copy()
# Add a "Parent" column holding the part of the file name that precedes "_seg",
# so that segments of the same file can be kept together.
df_exp1['Parent'] = df_exp1['File Name'].str.extract(r'^(.*?)(?=_seg)')

# Keep only the subclasses that have at least 10 distinct parents.
parent_counts = df_exp1.groupby('Subclass')['Parent'].nunique()
subclasses_to_keep = parent_counts[parent_counts >= 10].index
df_exp1 = df_exp1[df_exp1['Subclass'].isin(subclasses_to_keep)].copy()

print(f"Dimensione totale dopo il filtraggio: {df_exp1.shape[0]} campioni")

Dimensione totale dopo il filtraggio: 43089 campioni


In [5]:
# Split the filtered frame with the project helper. It returns seven values, unpacked
# here in order: features for train/val/test, labels for train/val/test, and the
# subclass encoder (roles taken from the variable names — confirm against the helper).
(
    X_train_imputed_exp1,
    X_val_imputed_exp1,
    X_test_imputed_exp1,
    y_train_encoded_exp1,
    y_val_encoded_exp1,
    y_test_encoded_exp1,
    subclass_encoder_exp1,
) = split_dataset(df_exp1)


Train size: 33872 (78.61%)
Val size:   4403 (10.22%)
Test size:  4814 (11.17%)

Distribuzione di 'Subclass' nel train:
Subclass
Passengership    8830
Tanker           8744
Tug              8213
Cargo            7802
Vessel            283
Name: count, dtype: int64

Distribuzione di 'Subclass' nel val:
Subclass
Passengership    1291
Tanker           1276
Tug               956
Cargo             862
Vessel             18
Name: count, dtype: int64

Distribuzione di 'Subclass' nel test:
Subclass
Passengership    1607
Tanker           1201
Cargo            1007
Tug               993
Vessel              6
Name: count, dtype: int64


In [6]:
# Apply SMOTE to the training split.
# NOTE(review): the meaning of the positional `1` is defined by
# apply_smote_multiclass — confirm against the helper.
X_train_resampled_exp1, y_train_resampled_exp1 = apply_smote_multiclass(
    X_train_imputed_exp1,
    y_train_encoded_exp1,
    1,
)


Class distribution after SMOTE:
1    8830
2    8830
0    8830
3    8830
4    8830
Name: count, dtype: int64


In [7]:
# Remove the columns that are not needed, to avoid feature-name errors.
# errors='ignore' makes the drop a no-op for columns that are already absent.
X_val_imputed_exp1, X_test_imputed_exp1 = (
    split_frame.drop(columns=["Class", "File Name", "Parent", "Subclass"], errors='ignore')
    for split_frame in (X_val_imputed_exp1, X_test_imputed_exp1)
)


In [8]:
# Fit the LightGBM model through the project helper, passing the resampled training
# data followed by the validation and test splits; the returned model is kept for the
# confusion-matrix cell below.
lightgbm_model_exp1 = train_lightgbm_multiclass_extra(
    X_train_resampled_exp1,
    y_train_resampled_exp1,
    X_val_imputed_exp1,
    y_val_encoded_exp1,
    X_test_imputed_exp1,
    y_test_encoded_exp1,
)

Fitting 1 folds for each of 48 candidates, totalling 48 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7140
[LightGBM] [Info] Number of data points in the train set: 48553, number of used features: 28
[LightGBM] [Info] Start training from score -1.611355
[LightGBM] [Info] Start training from score -1.568044
[LightGBM] [Info] Start training from score -1.569527
[LightGBM] [Info] Start training from score -1.601703
[LightGBM] [Info] Start training from score -1.702465

Accuratezza (Val) = 0.8478

=== Report - Validation ===
              precision    recall  f1-score   support

           0     0.7767    0.7343    0.7549       862
           1     0.9113    0.8358    0.8719      1291
           2     0.8304    0.8636    0.8467      1276
           3     0.8508    0.9425    0.8943       956
           4     1.0000    1.0000    1.0000        1

In [None]:
# Plot the confusion matrices for the validation and test splits.
# The encoder passed here is the one returned by split_dataset for this experiment
# (`subclass_encoder_exp1`); the bare name `subclass_encoder` is never defined in this
# notebook and raised a NameError on a fresh kernel.
lightgbm_plot_confusion_matrices(
    model=lightgbm_model_exp1,
    X_val=X_val_imputed_exp1,
    y_val_encoded=y_val_encoded_exp1,
    X_test=X_test_imputed_exp1,
    y_test_encoded=y_test_encoded_exp1,
    subclass_encoder=subclass_encoder_exp1,
)