In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.ensemble import RandomForestRegressor

from sklearn.svm import SVC, SVR
import tqdm

import matplotlib.pyplot as plt
import os
import datetime
import logging
import shutil
import json

# Root directory for the experiment's inputs and outputs (absolute, machine-specific path).
# NOTE(review): the previous note here listed "accuracy, precision, recall, f1-score";
# the code in this file computes regression metrics (R2, RMSE, MAE) instead.
GLOBAL_RESULTS_DIR = "D:/Semillero SOFA/gmm_32_definitivo"
# Directory containing the per-scenario model datasets read by extract_df().
DATASETS_DIR = f"{GLOBAL_RESULTS_DIR}/new_models"

# Load data
def extract_df(dis, power, gauss, cov):
    """Read the dataset CSV of one scenario into a DataFrame.

    The file is looked up at
    ``{DATASETS_DIR}/{dis}km{power}dBm/{gauss}_gaussians/models32_gmm_{cov}.csv``.

    Args:
        dis: Distance value used in the scenario folder name.
        power: Power value used in the scenario folder name.
        gauss: Number of gaussians used in the sub-folder name.
        cov: Covariance type used in the CSV file name.

    Returns:
        pd.DataFrame: The parsed CSV content.
    """
    scenario_dir = f"{dis}km{power}dBm/{gauss}_gaussians"
    csv_path = f"{DATASETS_DIR}/{scenario_dir}/models32_gmm_{cov}.csv"
    return pd.read_csv(csv_path)



# Compute metrics
def calculate_regression_metrics(y_true, y_pred):
    """Return ``(r2, rmse, mae)`` for the given true and predicted targets.

    RMSE is obtained as the square root of the mean squared error.
    """
    r2_value = r2_score(y_true, y_pred)
    mse_value = mean_squared_error(y_true, y_pred)
    mae_value = mean_absolute_error(y_true, y_pred)
    return r2_value, np.sqrt(mse_value), mae_value
# Accumulate results
def accumulate_regression_results(results, tipo = "train", r2=None, rmse=None, mae=None):
    """Append one value per metric to the ``tipo`` list of ``results``.

    Args:
        results (dict): Container with ``"r2"``, ``"rmse"`` and ``"mae"`` entries,
            each mapping a split name to a list (mutated in place).
        tipo (str): Split name whose lists receive the values (e.g. "train" or "test").
        r2, rmse, mae: Metric values to append; ``None`` is appended when omitted.
    """
    for metric_name, metric_value in (("r2", r2), ("rmse", rmse), ("mae", mae)):
        results[metric_name][tipo].append(metric_value)

def extract_X_y_regression(data, include_osnr=True):
    """
    Extract the features and the target variable from the dataset.

    Text values such as ``"18dB"`` in ``osnr`` and ``"29GHz"`` in ``spacing``
    are converted to floats; columns that are already numeric are left as-is.
    The input frame itself is never modified.

    Args:
        data (pd.DataFrame): The input dataset containing features and target.
        include_osnr (bool): Whether to include the 'osnr' feature in X.
    Returns:
        X (pd.DataFrame): Every column except 'spacing' (and 'osnr' when excluded).
        y (pd.Series): The target variable (spacing) as numbers.
    Raises:
        KeyError: If 'spacing' is missing, or if 'osnr' is missing while
            include_osnr is True.
    """
    def _strip_unit(series, unit):
        # Anything non-numeric is treated as "<number><unit>" text. Checking for
        # "not numeric" instead of dtype == 'object' also covers pandas' dedicated
        # string dtypes, which would otherwise slip through unconverted.
        if pd.api.types.is_numeric_dtype(series):
            return series
        return series.str.replace(unit, "", regex=False).astype(float)

    data = data.copy()  # work on a copy so the caller's frame keeps its raw values

    has_osnr = "osnr" in data.columns
    if include_osnr and not has_osnr:
        raise KeyError("osnr")
    if has_osnr:
        data["osnr"] = _strip_unit(data["osnr"], "dB")

    # 'osnr' is only dropped when it is both unwanted and actually present,
    # so datasets without that column can still be used with include_osnr=False.
    dropped_columns = ["spacing"]
    if not include_osnr and has_osnr:
        dropped_columns.append("osnr")
    X = data.drop(dropped_columns, axis=1)

    # spacing may be stored as text (e.g. "29GHz"); convert it to numbers
    y = _strip_unit(data["spacing"].copy(), "GHz")
    return X, y
def scale_features(X, type = "standard"):
    """
    Scales the features using the specified scaling method.

    The scaler is fitted on ``X`` itself, so this must not be applied to a
    dataset before it is split into train and test parts.

    Args:
        X (pd.DataFrame): The feature set to be scaled.
        type (str): The type of scaling to apply ('standard' or 'minmax').
            (The name shadows the builtin but is kept for keyword callers.)

    Returns:
        X_scaled (pd.DataFrame): The scaled feature set, with the same columns
            and the same index as ``X``.

    Raises:
        ValueError: If ``type`` is neither 'standard' nor 'minmax'.
    """
    if type == "standard":
        scaler = StandardScaler()
    elif type == "minmax":
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
    else:
        raise ValueError("Unsupported scaling type. Use 'standard' or 'minmax'.")

    scaled_values = scaler.fit_transform(X)
    # Keep X's index: rebuilding the frame with a fresh 0..n-1 index would
    # misalign it with y whenever X comes from a filtered or shuffled frame.
    return pd.DataFrame(scaled_values, columns=X.columns, index=X.index)
def initialize_regression_results():
    """Build a fresh, empty results container for one cross-validated run.

    Returns:
        dict: ``model_params`` (empty dict), ``y_test`` and ``y_pred_test``
        (empty lists), and one ``{"train": [], "test": []}`` entry for each of
        ``mae``, ``r2`` and ``rmse``.
    """
    return {
        # Model
        "model_params": {},
        "y_test": [],
        "y_pred_test": [],
        "mae": {"train": [], "test": []},
        "r2": {"train": [], "test": []},
        "rmse": {"train": [], "test": []},
    }

# TODO: Add more models if needed
def choose_model_regression(model_name):
    """
    Build the (unfitted) regression estimator registered under ``model_name``.

    Supported names are "DecisionTree", "SVM" (RBF-kernel SVR) and
    "RandomForest"; the tree-based models use ``random_state=42``.

    Note: For MLP models with Optuna optimization, use train_test_mlp_optuna()
    from train_test_optuna.py instead of this function.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "DecisionTree":
        return DecisionTreeRegressor(random_state=42)
    if model_name == "SVM":
        return SVR(kernel='rbf')
    if model_name == "RandomForest":
        return RandomForestRegressor(random_state=42)
    raise ValueError(f"Modelo {model_name} no soportado.")

def save_regression_results(results, path_file, gaussian, covariance, model, logger):
    """Append one summary row (mean and std of each metric) to a CSV file.

    For every key of ``results`` that is one of 'mae', 'r2' or 'rmse', the
    mean and the standard deviation of its 'train' and 'test' lists are
    stored. The row is appended to the rows already in ``path_file`` (if the
    file exists) and the whole table is written back.

    Args:
        results (dict): Results container with per-split metric lists.
        path_file (str): CSV file to create or extend.
        gaussian: Value stored in the 'gaussian' column.
        covariance: Value stored in the 'covariance' column.
        model: Value stored in the 'model_name' column.
        logger: Object whose ``info`` method receives a confirmation message.
    """
    tracked_metrics = ('mae', 'r2', 'rmse')
    summary_row = {}
    # Walk the keys of `results` (not the tuple) so column order follows `results`.
    for metric in (key for key in results if key in tracked_metrics):
        per_split = results[metric]
        summary_row[f"{metric}_train"] = np.mean(per_split['train'])
        summary_row[f"{metric}_test"] = np.mean(per_split['test'])
        summary_row[f"{metric}_std_train"] = np.std(per_split['train'])
        summary_row[f"{metric}_std_test"] = np.std(per_split['test'])
    summary_row.update(gaussian=gaussian, covariance=covariance, model_name=model)

    new_row = pd.DataFrame([summary_row])
    previous_rows = pd.read_csv(path_file) if os.path.exists(path_file) else pd.DataFrame()
    pd.concat([previous_rows, new_row], ignore_index=True).to_csv(path_file, index=False)

    logger.info(f"Saved regression {gaussian} gaussians, {covariance} covariance, model {model} results to {path_file}")

def save_regression_results_detailed(results, path_file, gaussian, covariance, model, logger):
    """
    Store the raw (non-averaged) per-fold results in a nested JSON file.

    The JSON layout is ``{gaussian: {covariance: {model: entry}}}`` where
    ``entry`` holds the 'mae', 'r2' and 'rmse' values under 'metrics' and the
    per-fold best parameters under 'model_params'. When the file already
    exists it is first copied to ``<path_file>.bak``, then loaded, updated
    (replacing any previous entry for the same gaussian/covariance/model) and
    rewritten.
    TODO: Save in a compressed format.

        Args:
            results (dict): The results dictionary containing metrics and model parameters.
            path_file (str): The file path to save the detailed results.
            gaussian (int): The number of gaussians used in the model (stored as a string key).
            covariance (str): The type of covariance used in the model.
            model (str): The name of the regression model used.
            logger: Object whose ``info`` method receives a confirmation message.
    """
    if os.path.exists(path_file):
        # Keep a copy of the previous file before touching it, then load it
        shutil.copy2(path_file, path_file + ".bak")
        with open(path_file, "r") as existing_file:
            stored = json.load(existing_file)
    else:
        stored = {}

    entry = {
        'metrics': {
            'mae': results['mae'],
            'r2': results['r2'],
            'rmse': results['rmse'],
        },
        'model_params': results['model_params'],
    }
    # Create the gaussian / covariance levels on demand, then set the model entry
    stored.setdefault(str(gaussian), {}).setdefault(covariance, {})[model] = entry

    with open(path_file, "w") as output_file:
        json.dump(stored, output_file, indent=4)
    logger.info(f"Saved detailed regression {gaussian} gaussians, {covariance} covariance, model {model} results to {path_file}")

def setup_logger(name: str) -> logging.Logger:
    """Configure root logging (console + ``<name>.log``) and return the logger ``name``.

    Any handlers already attached to the root logger are removed and closed
    first, so calling this again (e.g. re-running the notebook cell for a new
    run) really switches output to the new log file.

    Args:
        name: Used both as the logger name and as the log file path without
            the ``.log`` extension.

    Returns:
        The logger registered under ``name``; its records propagate to the
        root handlers configured here.
    """
    root_logger = logging.getLogger()
    # logging.basicConfig() silently does nothing when the root logger already
    # has handlers, which would keep every later run writing to the first file.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),  # Log to console
            logging.FileHandler(f"{name}.log")
        ]
    )
    return logging.getLogger(name)





In [7]:

# Hyper-parameter grids for GridSearchCV, keyed by model name
# (only used by train_test_regression_model).
PARAMS_GRID_REGRESSION = {
    'DecisionTree': {
        'max_depth': [5, 10, 15, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.1, 1],
        'kernel': ['rbf']
    },
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
}


def _stratification_labels(y, n_splits):
    """Return one stratum label per sample, suitable for StratifiedKFold."""
    class_sizes = y.value_counts()
    if class_sizes.min() >= n_splits:
        # Discrete target with enough repeats per value: every distinct
        # spacing value is its own stratum.
        return LabelEncoder().fit_transform(y)
    # Continuous or sparsely repeated target: label-encoding would create
    # (almost) one class per sample, so stratify on quantile bins instead.
    n_bins = max(2, min(10, len(y) // n_splits))
    return pd.qcut(y, q=n_bins, labels=False, duplicates="drop")


def train_test_regression_model(data, model_name, logger, include_osnr=True):
    """
    Cross-validate a regression model that predicts spacing.

    Runs a 5-fold stratified, shuffled split (random_state=42). In each fold
    the features are standardised with statistics from the training part
    only, the model's hyper-parameters are tuned with a 3-fold GridSearchCV
    (scored by negative MSE), and R2 / RMSE / MAE are computed on both the
    training and the test part.

    Args:
        data: DataFrame with the data (features, 'spacing' and usually 'osnr').
        model_name: Name of the model to use; must be a key of
            PARAMS_GRID_REGRESSION and be known to choose_model_regression.
        logger: Logger that receives the best parameters of every fold.
        include_osnr: Whether to include OSNR as a feature.

    Returns:
        dict: The container created by initialize_regression_results, filled
        with per-fold train/test metrics, the concatenated test targets and
        predictions, and the best parameters of each fold under
        ``model_params[fold_index]``. The fitted models are not returned.
    """
    results = initialize_regression_results()
    # 1. Prepare data
    X, y = extract_X_y_regression(data, include_osnr=include_osnr)
    n_splits = 5

    # Strata for the outer split (plain KFold would be the unstratified option)
    strata = _stratification_labels(y, n_splits)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Hyper-parameter grid for the requested model
    params = PARAMS_GRID_REGRESSION[model_name]
    for index, (train_index, test_index) in enumerate(skf.split(X, strata)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # 2. Scale features; the scaler is fitted on the training part only
        # so no test-set statistics leak into training.
        # NOTE(review): the inner grid-search folds still share this scaler.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # 3. Define the model wrapped in an inner 3-fold grid search
        search = GridSearchCV(estimator=choose_model_regression(model_name),
                              param_grid=params, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

        # 4. Train (GridSearchCV refits the best configuration on X_train)
        search.fit(X_train, y_train)
        logger.info(str(search.best_params_))

        # 5. Evaluate on both parts of the fold
        y_pred_train = search.predict(X_train)
        y_pred_test = search.predict(X_test)
        train_r2, train_rmse, train_mae = calculate_regression_metrics(y_train, y_pred_train)
        test_r2, test_rmse, test_mae = calculate_regression_metrics(y_test, y_pred_test)

        # 6. Accumulate results
        accumulate_regression_results(results, "train", train_r2, train_rmse, train_mae)
        accumulate_regression_results(results, "test", test_r2, test_rmse, test_mae)
        results["y_test"].extend(y_test)
        results["y_pred_test"].extend(y_pred_test)
        results["model_params"][index] = search.best_params_  # Save best_params in each fold

    return results


## Simple Execution (single scenario)

In [8]:
# #=====================================================
# # Cargar dataset
# #=====================================================
# distancias = [0, 270]
# power = [0, 0, 9]
# gaussians = [16,24,32]
# covs = ["diag", "spherical"]
# models = ["SVM", "DecisionTree"]

# dis = 0
# power = 0
# gauss = 16
# cov = "diag"

# database = extract_df(dis, power, gauss, cov)

# model_name = "DecisionTree"  # Cambiar a "DecisionTree" o "RandomForest" según se desee

# results= train_test_regression_model(database, model_name, include_osnr=True)

# save_regression_results(results)

# results_wo =  train_test_regression_model(database, model_name, include_osnr=False)

# save_regression_results(results_wo)


## Full Experiment Execution

In [None]:
#=====================================================
# Create logger
#=====================================================
timestamp = datetime.datetime.now().strftime("%m_%d_%H%M")
run_output_dir = os.path.join(GLOBAL_RESULTS_DIR, 'results', f"run_{timestamp}")
os.makedirs(run_output_dir, exist_ok=True)

# The run directory path doubles as the logger name, so the log file is
# written next to the directory as "<run_output_dir>.log".
logger = setup_logger(run_output_dir)

#=====================================================
# Parameters
#=====================================================

distancias = [0, 270]
powers = [0, 0, 9]
# (distance, power) scenarios that are actually iterated
dist_powers = [(0,0), (270,0), (270,9)]
gaussians = [16,24,32]
covs = ["diag", "spherical"]
# TODO: Add more models if needed
# - Add MLP -> multimodal model (see the Optuna notes).
# - Add background execution (tmux / nohup).
# - Check that the logger appends; make backups.
# - Add the hyper-parameters to the CSV that is saved
models = ["DecisionTree"]

# Each configuration is trained twice: with OSNR ("w") and without it ("wo").
osnr_variants = (("w", True), ("wo", False))

#=====================================================
# Iterate over all scenarios (ML models only)
#=====================================================
# Compute total number of ML runs and create progress bar
total_runs = len(dist_powers) * len(gaussians) * len(covs) * len(models)
ml_pbar = tqdm.tqdm(total=total_runs, desc="ML Model Training Progress")
try:
    for distancia, power in dist_powers:
        # All results of one (distance, power) scenario share a folder
        output_dir = os.path.join(run_output_dir, f"{distancia}_{power}")
        os.makedirs(output_dir, exist_ok=True)
        for gaussian in gaussians:
            for cov in covs:
                # Load once per scenario; every model reuses the same frame
                database = extract_df(distancia, power, gaussian, cov)
                for model_name in models:
                    for suffix, include_osnr in osnr_variants:
                        results = train_test_regression_model(
                            database, model_name, logger, include_osnr=include_osnr)

                        # Averaged metrics (one CSV row per configuration)
                        filename = os.path.join(output_dir, f'reg_results_{suffix}.csv')
                        save_regression_results(results, filename, gaussian, cov, model_name, logger)

                        # Raw per-fold metrics and best parameters
                        filename = os.path.join(output_dir, f'reg_results_{suffix}_detailed.json')
                        save_regression_results_detailed(results, filename, gaussian, cov, model_name, logger)

                    log_msg = f"Completed {model_name} model for {gaussian} gaussians, {cov} covariance, distance {distancia} km and power {power} dBm."
                    # Use the run logger (not the root `logging` module) so the
                    # record carries the same logger name as the other messages
                    logger.info(log_msg)
                    # Update progress bar after finishing this ML configuration
                    try:
                        ml_pbar.update(1)
                    except Exception:
                        # Progress reporting is best-effort: never abort the
                        # experiment because of it, but leave a trace.
                        logger.warning("Progress bar update failed", exc_info=True)
finally:
    # Close the progress bar when done, even if a training step raised
    try:
        ml_pbar.close()
    except Exception:
        logger.warning("Progress bar close failed", exc_info=True)