In [8]:
import pandas as pd
import numpy as np
import sys
import os

In [9]:
# Make the project's helper modules, located in ../../scripts/, importable.
sys.path.append("../../scripts/")

from feature_selection import forwardFeatureSelection

from NonLinCFA import NonLinCFA
from aux_GenLinCFA import prepare_target_binary
# NOTE(review): prepare_target, aggregate_unfolded_data, FS_with_linearWrapper
# and compare_methods are called later in this notebook without an explicit
# import; presumably this star import provides them -- confirm, and consider
# importing them by name.
from aux_NonLinCFA import *

Funzioni per gestire l'esportazione delle aggregazioni spaziali.

In [10]:
import pickle
import geopandas as gpd
from shapely.geometry import Point

def parse_coordinates(coord_string):
    """Build a shapely ``Point`` from an underscore-separated coordinate string.

    The string is split on ``"_"``; the token at index 1 is read as the
    longitude and the token at index 2 as the latitude (the token at index 0
    is ignored).

    Returns:
        ``Point(longitude, latitude)``, or ``None`` (after printing a message)
        when a token is missing or cannot be converted to ``float``.
    """
    tokens = coord_string.split("_")

    try:
        lon = float(tokens[1])
        lat = float(tokens[2])
        point = Point(lon, lat)
    except (IndexError, ValueError):
        # Malformed input: report it and signal the failure with None.
        print("Error parsing coordinates.")
        point = None

    return point

Salva le informazioni di aggregazione spaziale:

- `outputs` è una lista dove ogni elemento è associato a un bacino
- ogni elemento `output` di `outputs` è una lista di variabili
- `output[i]` è una lista delle componenti della variabile `i`-esima
- `output[i][j]` è una lista di coordinate della `j`-esima componente della variabile `i`-esima

In [18]:
def export_spatial_components(
    basins_names: list[str],
    outputs: list[list[list[list[str]]]],
    variable_names: list[str],
    destination_file: str,
) -> None:
    """Pickle the spatial components of every variable of every basin.

    Args:
        basins_names: one name per basin, paired positionally with ``outputs``.
        outputs: one element per basin. Each element is a list of variables;
            ``output[i]`` is the list of components of the i-th variable and
            ``output[i][j]`` is the list of coordinate strings belonging to
            the j-th component of that variable.
        variable_names: one name per variable, paired positionally with the
            variables of each basin.
        destination_file: path of the pickle file to write. Missing parent
            directories are created.

    The pickled object is a dict ``{basin: {variable_name: GeoDataFrame}}``
    where each GeoDataFrame has one row per coordinate string, with columns
    ``geometry`` (the result of ``parse_coordinates``, which is ``None`` for
    strings that cannot be parsed) and ``component`` (the component index).
    """
    basins_variable_components = {}

    for basin, output in zip(basins_names, outputs):
        basin_variable_components = {}

        for variable_name, variable_components in zip(variable_names, output):
            # One record per coordinate, tagged with the index of its component.
            data = [
                {
                    "geometry": parse_coordinates(coord_string),
                    "component": component_index,
                }
                for component_index, coordinates in enumerate(variable_components)
                for coord_string in coordinates
            ]

            basin_variable_components[variable_name] = gpd.GeoDataFrame(
                data, geometry="geometry"
            )

        basins_variable_components[basin] = basin_variable_components

    # Create the parent directories of destination_file if they do not exist.
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one to create.
    parent_dir = os.path.dirname(destination_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(destination_file, "wb") as handle:
        pickle.dump(basins_variable_components, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Names of the variables to aggregate: for each of "tg" and "rr", the base
# cyclostationary mean followed by its 1/4/8/12/16/24-week variants.
variable_names = [
    f"cyclostationary_mean_{variable}{window}"
    for variable in ("tg", "rr")
    for window in ("", "_1w", "_4w", "_8w", "_12w", "_16w", "_24w")
]

In [15]:
# Folder for plot inputs; in the visible part of this notebook it is only
# referenced by commented-out code.
plots_folder = "./NonLinCFA/for_plots/internal_ordering/"
# Folder with the per-basin feature tables, read as "<basin>_aggreg.csv".
path_features = "../../final_features_allcoord/temp_prec/"
# Folder with the per-basin target tables, read as "<basin>.csv".
path_target = "../../VHI_target/"

# Esegui la NonLinCFA nel modo classico per salvare le componenti spaziali

Liste per memorizzare tutti i risultati relativi a ogni bacino.

In [16]:
# Per-basin accumulators: the loop over basins appends one entry to each list
# per basin, so position k in every list refers to the same basin.
outputs, basins, aggregate_trainvals, aggregate_tests = [], [], [], []

In [17]:
# Run the aggregation once per basin and collect the results in the
# per-basin lists (outputs, basins, aggregate_trainvals, aggregate_tests).
for basin in (
    "Adda",
    "Dora",
    "Emiliani1",
    "Emiliani2",
    "Garda_Mincio",
    "Lambro_Olona",
    "Oglio_Iseo",
    "Piemonte_Nord",
    "Piemonte_Sud",
    "Ticino",
):
    print(f"####################{basin}####################")

    # Load this basin's target; the helper returns the train, validation,
    # test and train+validation frames, in that order.
    (
        target_df_train,
        target_df_val,
        target_df_test,
        target_df_trainVal,
    ) = prepare_target(
        "",
        max_train="2010-01-01",
        max_val="2015-01-01",
        max_test="2020-01-01",
        path=f"{path_target}{basin}.csv",
        window_size=1,
    )

    # Aggregate every variable of this basin's feature table against the
    # train+validation target, using the same date limits as the target.
    eps = 0.1
    actual_path = f"{path_features}{basin}_aggreg.csv"
    output, aggregate_trainVal, aggregate_test = aggregate_unfolded_data(
        actual_path,
        variable_names,
        target_df_trainVal,
        eps=eps,
        max_train="2010-01-01",
        max_val="2015-01-01",
        max_test="2020-01-01",
    )

    # Keep the four lists aligned: one entry per basin in each.
    basins.append(basin)
    outputs.append(output)
    aggregate_trainvals.append(aggregate_trainVal)
    aggregate_tests.append(aggregate_test)

####################Adda####################
Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 3

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 5

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 4

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 1

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 5

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 1

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 2

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 1

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 2

Number of features: 92

eps value:  0.0010869565217391304
Number of aggregated features: 1

Number of features: 92

eps value: 

Stampa i bacini elaborati prima di salvarli

In [19]:
# Sanity check: list the basins collected by the loop before exporting them.
print(basins)

['Adda', 'Dora', 'Emiliani1', 'Emiliani2', 'Garda_Mincio', 'Lambro_Olona', 'Oglio_Iseo', 'Piemonte_Nord', 'Piemonte_Sud', 'Ticino']


In [22]:
# Pickle the spatial components of all collected basins into the current
# working directory.
export_spatial_components(basins, outputs, variable_names, "./variable_components_basins.pickle")

Salva le feature aggregate (quindi prima della feature selection basata su CMI).

In [24]:
destination_folder = "./NonLinCFA_noCMI/"
os.makedirs(destination_folder, exist_ok=True)

# Write one train/val/test CSV triple per basin with the aggregated features.
for basin, aggregate_trainval, aggregate_test in zip(basins, aggregate_trainvals, aggregate_tests):
    # Positional split of the train+validation frame: the first 411 rows are
    # the training set, the remaining rows the validation set.
    aggregate_train = aggregate_trainval.iloc[:411]
    aggregate_val = aggregate_trainval.iloc[411:]

    aggregate_train.to_csv(
        os.path.join(destination_folder, f"{basin}_train.csv"), index=False
    )
    aggregate_val.to_csv(
        os.path.join(destination_folder, f"{basin}_val.csv"), index=False
    )
    aggregate_test.to_csv(
        os.path.join(destination_folder, f"{basin}_test.csv"), index=False
    )

# Riesegui la NonLinCFA sui bacini Emiliani1, Emiliani2 e Garda_Mincio

Il codice qui sotto fa l'aggregazione con NonLinCFA dando in input `Emiliani1` + `Emiliani2` + `GardaMincio` e in output il target di:

- `Emiliani1`
- `Emiliani2`
- `GardaMincio`

Per un totale di 3 run.

In [29]:
# Output folder for the CSVs written by the pseudo-basin loop further down;
# the trailing slash matters because file names are appended with "+".
destination_folder = "./NonLinCFA_e12gm_to_single/"
os.makedirs(destination_folder, exist_ok=True)

Creo l'input `Emiliani1` + `Emiliani2` + `GardaMincio` caricando i dati e concatenandoli per righe, poi lo salvo come se fosse un nuovo "bacino" chiamato `e12gm`.

In [None]:
# Stack the feature tables of the three basins and save the result as the
# feature table of a new pseudo-basin called "e12gm".
# NOTE(review): axis=0 stacks rows. The run log further down reports 369
# features for e12gm against 92 for a single basin, which suggests the three
# tables may not share columns; in that case rows are multiplied and the
# missing cells become NaN -- confirm that this is the intended layout.
pd.concat(
    [
        pd.read_csv(f"{path_features}{source_basin}_aggreg.csv")
        for source_basin in ("Emiliani1", "Emiliani2", "Garda_Mincio")
    ],
    axis=0,
).to_csv(f"{path_features}e12gm_aggreg.csv", index=False)

Liste per memorizzare tutti i risultati relativi a ogni bacino.

In [30]:
# Reset the per-basin accumulators before the pseudo-basin run; the loop
# appends one entry to each list per pseudo-basin.
outputs, basins, aggregate_trainvals, aggregate_tests = [], [], [], []

Il loop è su ogni finto bacino `e12gm_to_e1`, `e12gm_to_e2`, `e12gm_to_gm`, ognuno dei quali ha in input `e12gm` e un target diverso.

In [31]:
# Each pseudo-basin pairs the shared "e12gm" feature table with the target of
# one real basin: basin is the label used in output file names, target_name
# selects the target CSV.
for basin, target_name in zip(
    ["e12gm_to_e1", "e12gm_to_e2", "e12gm_to_gm"],
    ["Emiliani1", "Emiliani2", "Garda_Mincio"],
):
    print("####################" + basin + "####################")

    target_df_train, target_df_val, target_df_test, target_df_trainVal = prepare_target(
        "",
        max_train="2010-01-01",
        max_val="2015-01-01",
        max_test="2020-01-01",
        path=path_target + target_name + ".csv",  # target of the basin named by target_name (differs per iteration)
        window_size=1,
    )

    eps = 0.1
    # The input features are the same "e12gm" table for every pseudo-basin.
    actual_path = path_features + "e12gm" + "_aggreg.csv"
    output, aggregate_trainVal, aggregate_test = aggregate_unfolded_data(
        actual_path,
        variable_names,
        target_df_trainVal,
        eps=eps,
        max_train="2010-01-01",
        max_val="2015-01-01",
        max_test="2020-01-01",
    )

    # Keep the four accumulator lists aligned: one entry per pseudo-basin.
    outputs.append(output)
    basins.append(basin)
    aggregate_trainvals.append(aggregate_trainVal)
    aggregate_tests.append(aggregate_test)

    # Disabled: export of the aggregated features to plots_folder.
    # agg_trainVal_string = plots_folder + basin + "_trainVal_aggreg"
    # agg_test_string = plots_folder + basin + "_test_aggreg"
    # aggregate_trainVal.to_csv(agg_trainVal_string, index = False)
    # aggregate_test.to_csv(agg_test_string, index = False)

    # Wrapper-based feature selection (see the labels printed below).
    # The fourth argument is capped at one less than the number of aggregated
    # columns; presumably it is the maximum number of features to select.
    # NOTE(review): the meaning of 228 is not visible here -- confirm against
    # FS_with_linearWrapper.
    selected_colnames = FS_with_linearWrapper(
        aggregate_trainVal,
        target_df_train,
        target_df_val,
        min(50, aggregate_trainVal.shape[1] - 1),
        228,
    )

    print("\nFull model and selected features with wrapper\n")
    compare_methods(
        aggregate_trainVal,
        aggregate_test,
        target_df_trainVal,
        target_df_test,
        selected_colnames,
    )

    print("\nFull model and best 5 selected features with wrapper\n")
    # Same comparison restricted to the first five selected columns.
    compare_methods(
        aggregate_trainVal,
        aggregate_test,
        target_df_trainVal,
        target_df_test,
        selected_colnames[0:5],
    )

    # Disabled: export of the best five wrapper-selected features.
    # train_string = destination_folder + basin + '_nonLinCFA_wrapper_best5_train.csv'
    # val_string = destination_folder + basin + '_nonLinCFA_wrapper_best5_val.csv'
    # test_string = destination_folder + basin + '_nonLinCFA_wrapper_best5_test.csv'
    # X_train_wrapper = aggregate_trainVal.loc[:410,selected_colnames[0:5]]
    # X_validation_wrapper = aggregate_trainVal.loc[411:,selected_colnames[0:5]]
    # X_train_validation_wrapper = pd.concat([X_train_wrapper, X_validation_wrapper])
    # X_test_wrapper = aggregate_test.loc[:,selected_colnames[0:5]]
    # X_train_wrapper.to_csv(train_string, index=False)
    # X_validation_wrapper.to_csv(val_string, index=False)
    # X_test_wrapper.to_csv(test_string, index=False)

    # Result container handed to forwardFeatureSelection; its return value
    # then replaces the "selectedFeatures" entry.
    res = {"delta": [], "numSelected": [], "selectedFeatures": []}

    # Second selection pass, labelled "CMI" in the printed output below, on the
    # train+validation features against the target's mean_std column.
    # NOTE(review): the meaning of the positional arguments 10, 10 and 1 is not
    # visible here -- confirm against forwardFeatureSelection.
    res["selectedFeatures"] = forwardFeatureSelection(
        10,
        np.array(aggregate_trainVal),
        np.array(target_df_trainVal.mean_std),
        res,
        10,
        1,
    )

    # The key is held in a variable so the f-string below needs no nested
    # double quotes.
    selectedFeatures = "selectedFeatures"
    print(f"\n{res[selectedFeatures]}\n")

    # res["selectedFeatures"] is used to index the columns, so it presumably
    # holds positional column indices -- TODO confirm.
    selected_colnames = aggregate_trainVal.columns[res["selectedFeatures"]]

    print("\nFull model and selected features with CMI\n")
    compare_methods(
        aggregate_trainVal,
        aggregate_test,
        target_df_trainVal,
        target_df_test,
        selected_colnames,
    )

    print("\nFull model and best 5 selected features with CMI\n")
    compare_methods(
        aggregate_trainVal,
        aggregate_test,
        target_df_trainVal,
        target_df_test,
        selected_colnames[0:5],
    )

    # Export 1: only the first five CMI-selected features.
    train_string = destination_folder + basin + "_nonLinCFA_best5_CMI_train.csv"
    val_string = destination_folder + basin + "_nonLinCFA_best5_CMI_val.csv"
    test_string = destination_folder + basin + "_nonLinCFA_best5_CMI_test.csv"

    # .loc slices by label and includes the end label: rows labelled up to 410
    # go to train, rows labelled 411 onwards to validation.
    # NOTE(review): this is the first 411 rows only if aggregate_trainVal has a
    # default 0..n-1 index -- confirm.
    X_train_CMI5 = aggregate_trainVal.loc[:410, selected_colnames[0:5]]
    X_validation_CMI5 = aggregate_trainVal.loc[411:, selected_colnames[0:5]]
    # Not used again in this cell.
    X_train_validation_CMI5 = pd.concat([X_train_CMI5, X_validation_CMI5])
    X_test_CMI5 = aggregate_test.loc[:, selected_colnames[0:5]]

    # Names of the five exported columns as an array; not used again in this cell.
    selected_colnames_CMI5 = aggregate_trainVal.loc[
        :, selected_colnames[0:5]
    ].columns.values

    X_train_CMI5.to_csv(train_string, index=False)
    X_validation_CMI5.to_csv(val_string, index=False)
    X_test_CMI5.to_csv(test_string, index=False)

    # Export 2: all CMI-selected features (the path variables are reused).
    train_string = destination_folder + basin + "_nonLinCFA_CMI_train.csv"
    val_string = destination_folder + basin + "_nonLinCFA_CMI_val.csv"
    test_string = destination_folder + basin + "_nonLinCFA_CMI_test.csv"

    X_train_CMI = aggregate_trainVal.loc[:410, selected_colnames]
    X_validation_CMI = aggregate_trainVal.loc[411:, selected_colnames]
    # Not used again in this cell.
    X_train_validation_CMI = pd.concat([X_train_CMI, X_validation_CMI])
    X_test_CMI = aggregate_test.loc[:, selected_colnames]

    X_train_CMI.to_csv(train_string, index=False)
    X_validation_CMI.to_csv(val_string, index=False)
    X_test_CMI.to_csv(test_string, index=False)

####################e12gm_to_e1####################
Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 8

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 11

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 7

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 7

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 6

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 3

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 3

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 17

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 20

Number of features: 369

eps value:  0.00027100271002710027
Number of aggregated features: 9

Numbe

In [35]:
# Sanity check: list the pseudo-basins collected by the loop before exporting them.
print(basins)

['e12gm_to_e1', 'e12gm_to_e2', 'e12gm_to_gm']


In [33]:
# Pickle the spatial components of the three pseudo-basins into the current
# working directory.
export_spatial_components(basins, outputs, variable_names, "./variable_components_e12gm_to_single.pickle")

Salva le feature aggregate (quindi prima della feature selection basata su CMI).

In [34]:
destination_folder = "./NonLinCFA_e12gm_to_single_noCMI/"
os.makedirs(destination_folder, exist_ok=True)

# Write one train/val/test CSV triple per pseudo-basin with the aggregated features.
for basin, aggregate_trainval, aggregate_test in zip(basins, aggregate_trainvals, aggregate_tests):
    # Positional split of the train+validation frame: the first 411 rows are
    # the training set, the remaining rows the validation set.
    aggregate_train = aggregate_trainval.iloc[:411]
    aggregate_val = aggregate_trainval.iloc[411:]

    aggregate_train.to_csv(
        os.path.join(destination_folder, f"{basin}_train.csv"), index=False
    )
    aggregate_val.to_csv(
        os.path.join(destination_folder, f"{basin}_val.csv"), index=False
    )
    aggregate_test.to_csv(
        os.path.join(destination_folder, f"{basin}_test.csv"), index=False
    )