In [1]:
import pandas as pd

import oversampling_binary
import oversampling_multiclass

# CLASSIFICAZIONE MULTICLASSE

In [2]:
# Caricare il dataset e applicare lo split
csv_file_path = 'audio_features.csv'
X_train_imputed, X_val_imputed, X_test_imputed, y_train_encoded, y_val_encoded, y_test_encoded, subclass_encoder = oversampling_multiclass.split_dataset(csv_file_path)

In [3]:
# Stampare la distribuzione delle classi
print("Distribuzione delle classi nel set di addestramento:")
print(pd.Series(subclass_encoder.inverse_transform(y_train_encoded)).value_counts())

Distribuzione delle classi nel set di addestramento:
Tanker                              9181
Passengership                       8847
Tug                                 8539
Cargo                               7705
Vessel                               282
Seismic airguns surveys               18
Seal bomb                             17
Sonar                                 16
Echosounder                            7
Underwater Communication Signals       6
Scuba divers                           4
Seafloor fossil fuel processing        4
Acoustic release                       3
Acoustic Harassment Devices            2
Explosion                              2
Fishing pinger                         2
Name: count, dtype: int64


In [4]:
print("\nContenuto del set di addestramento prima di SMOTE:")
train_df_before_smote = pd.DataFrame(X_train_imputed, columns=[f'Feature_{i}' for i in range(X_train_imputed.shape[1])])
train_df_before_smote['Subclass'] = subclass_encoder.inverse_transform(y_train_encoded)
display(train_df_before_smote)


Contenuto del set di addestramento prima di SMOTE:


Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Subclass
0,4477.358083,4252.395919,0.052583,-0.065835,23.613643,3.186076,-371.23227,23776.0,0.698419,Acoustic Harassment Devices
1,2623.478284,2626.760521,0.041744,-0.095743,81.076911,2.165237,-475.12302,13548.0,0.815664,Acoustic Harassment Devices
2,3509.880580,5613.858656,0.060189,1.121935,145.730186,2.333120,-582.94653,1912.0,0.815846,Acoustic release
3,2521.859074,5201.373090,0.067112,0.740150,134.817244,2.341261,-581.94150,4383.0,0.757898,Acoustic release
4,1209.973836,2659.503763,0.043428,1.686207,264.717755,1.846858,-595.79376,1761.0,0.909339,Acoustic release
...,...,...,...,...,...,...,...,...,...,...
34630,456.747011,797.837269,0.194804,0.013841,0.040806,4.912163,-420.84280,3257.0,0.502427,Vessel
34631,452.009912,804.062723,0.187102,0.015867,0.072808,4.923598,-419.22150,3284.0,0.499966,Vessel
34632,446.388360,808.959976,0.179071,0.000758,0.094906,4.904681,-423.88705,3247.0,0.500477,Vessel
34633,451.316540,794.817133,0.188441,-0.002943,0.136695,4.846811,-406.76170,3283.0,0.495841,Vessel


In [5]:
print("\nDistribuzione delle classi nel set di validazione:")
print(pd.Series(subclass_encoder.inverse_transform(y_val_encoded)).value_counts())


Distribuzione delle classi nel set di validazione:
Passengership    1010
Cargo             923
Tanker            911
Tug               560
Vessel             20
Echosounder         3
Name: count, dtype: int64


In [6]:
print("\nDistribuzione delle classi nel set di test:")
print(pd.Series(subclass_encoder.inverse_transform(y_test_encoded)).value_counts())


Distribuzione delle classi nel set di test:
Passengership    1871
Tanker           1129
Tug              1063
Cargo            1043
Vessel              5
Echosounder         4
Sonar               4
Name: count, dtype: int64


In [7]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"  # Adatta questo valore al numero di core che desideri utilizzare

In [8]:
# Applicare SMOTE
X_train_resampled, y_train_resampled = oversampling_multiclass.apply_smote_multiclass(X_train_imputed, y_train_encoded, 4)

Attenzione: Il numero di vicini (4) è maggiore del numero di campioni nella classe meno rappresentata (2). Ridurre k_neighbors.
Distribuzione delle classi nel set di training dopo SMOTE:
0     9181
1     9181
2     9181
3     9181
4     9181
5     9181
6     9181
7     9181
8     9181
9     9181
10    9181
11    9181
12    9181
13    9181
14    9181
15    9181
Name: count, dtype: int64


In [9]:
# Stampare la distribuzione delle classi dopo SMOTE
print("\nDistribuzione delle classi nel set di training dopo SMOTE:")
print(pd.Series(subclass_encoder.inverse_transform(y_train_resampled)).value_counts())


Distribuzione delle classi nel set di training dopo SMOTE:
Acoustic Harassment Devices         9181
Acoustic release                    9181
Cargo                               9181
Echosounder                         9181
Explosion                           9181
Fishing pinger                      9181
Passengership                       9181
Scuba divers                        9181
Seafloor fossil fuel processing     9181
Seal bomb                           9181
Seismic airguns surveys             9181
Sonar                               9181
Tanker                              9181
Tug                                 9181
Underwater Communication Signals    9181
Vessel                              9181
Name: count, dtype: int64


In [10]:
# Stampare tutto il set di addestramento dopo SMOTE
print("\nContenuto del set di addestramento dopo SMOTE:")
train_df_after_smote = pd.DataFrame(X_train_resampled, columns=[f'Feature_{i}' for i in range(X_train_resampled.shape[1])])
train_df_after_smote['Subclass'] = subclass_encoder.inverse_transform(y_train_resampled)
display(train_df_after_smote)


Contenuto del set di addestramento dopo SMOTE:


Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Subclass
0,4477.358083,4252.395919,0.052583,-0.065835,23.613643,3.186076,-371.232270,23776.000000,0.698419,Acoustic Harassment Devices
1,2623.478284,2626.760521,0.041744,-0.095743,81.076911,2.165237,-475.123020,13548.000000,0.815664,Acoustic Harassment Devices
2,3509.880580,5613.858656,0.060189,1.121935,145.730186,2.333120,-582.946530,1912.000000,0.815846,Acoustic release
3,2521.859074,5201.373090,0.067112,0.740150,134.817244,2.341261,-581.941500,4383.000000,0.757898,Acoustic release
4,1209.973836,2659.503763,0.043428,1.686207,264.717755,1.846858,-595.793760,1761.000000,0.909339,Acoustic release
...,...,...,...,...,...,...,...,...,...,...
146891,4069.548725,4810.489456,0.055798,0.058212,1.015592,3.827063,-237.869733,15638.560686,0.543016,Vessel
146892,1805.872086,4061.731231,0.015967,-0.020603,-0.020222,4.821025,-308.643320,0.000000,1.000000,Vessel
146893,11820.275264,11540.016909,0.043590,0.018912,2.751272,3.600472,-160.830469,54021.148340,0.363999,Vessel
146894,2456.658173,4982.190082,0.075606,-0.012942,0.211998,4.769933,-192.790204,0.281760,0.999999,Vessel


In [11]:
# Stampare le forme dei dataset
print("\nForme dei dataset:")
print(f"X_train_imputed: {X_train_imputed.shape}")
print(f"X_val_imputed: {X_val_imputed.shape}")
print(f"X_test_imputed: {X_test_imputed.shape}")
print(f"X_train_resampled: {X_train_resampled.shape}")


Forme dei dataset:
X_train_imputed: (34635, 9)
X_val_imputed: (3427, 9)
X_test_imputed: (5119, 9)
X_train_resampled: (146896, 9)


In [None]:
random_forest_model = oversampling_multiclass.train_random_forest_multiclass(X_train_resampled, y_train_resampled, X_val_imputed, y_val_encoded, 10)

Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Fitting 2 folds for each of 24 candidates, totalling 48 fits


Training Progress:  10%|█         | 1/10 [00:48<07:16, 48.47s/it]

Fitting 2 folds for each of 24 candidates, totalling 48 fits


Training Progress:  20%|██        | 2/10 [02:53<12:26, 93.28s/it]

Fitting 2 folds for each of 24 candidates, totalling 48 fits


Training Progress:  30%|███       | 3/10 [06:16<16:44, 143.43s/it]

Fitting 2 folds for each of 24 candidates, totalling 48 fits
