In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.svm import SVC, SVR

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime

#accuracy, precision, recall, f1-score
GLOBAL_RESULTS_DIR = "D:/Semillero SOFA/gmm_32_definitivo"
#Datasets Models
DATASETS_DIR = f"{GLOBAL_RESULTS_DIR}/new_models"

# Cargar datos
def extract_df(dis, power, gauss, cov):
    sub_dir = f"{dis}km{power}dBm/{gauss}_gaussians"
    df = pd.read_csv(f"{DATASETS_DIR}/{sub_dir}/models32_gmm_{cov}.csv")
    return df



# Calcular metricas
def calculate_metrics(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae
# Acumular resultados
def accumulate_results(results, tipo = "train", r2=None, rmse=None, mae=None):
    results["r2"][tipo].append(r2)
    results["rmse"][tipo].append(rmse)
    results["mae"][tipo].append(mae)

def extract_X_y(data, include_osnr=True):
    """
    Extracst features and target variable from the dataset.

    Args:
        data (pd.DataFrame): The input dataset containing features and target.
        include_osnr (bool): Whether to include the 'osnr' feature in X.
    Returns:
        X (pd.DataFrame): The feature set.
        y (pd.Series): The target variable (spacing).
    """
    # 1. Preparar datos
    data = data.copy()
    if data["osnr"].dtype == 'object':
        data["osnr"] = data["osnr"].str.replace('dB', '').astype(float)
    # Preparar datos
    if include_osnr:
        X = data.drop(["spacing"], axis=1)
    else:
        # Excluir tanto spacing como osnr
        X = data.drop(["spacing", "osnr"], axis=1)
    # Si spacing es categórico (ej: "29GHz"), convertir a numérico
    y = data["spacing"].copy()
    if y.dtype == 'object':
        # Extraer números de strings como "29GHz"
        y = y.str.replace('GHz', '').astype(float)
    return X, y
def scale_features(X, type = "standard"):
    """
    Scales the features using the specified scaling method.

    Args:
        X (pd.DataFrame): The feature set to be scaled.
        type (str): The type of scaling to apply ('standard' or 'minmax').

    Returns:
        X_scaled (pd.DataFrame): The scaled feature set.
    """
    if type == "standard":
        scaler = StandardScaler()
    elif type == "minmax":
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
    else:
        raise ValueError("Unsupported scaling type. Use 'standard' or 'minmax'.")
    
    X_scaled = scaler.fit_transform(X)
    return pd.DataFrame(X_scaled, columns=X.columns)
def initialize_regression_results():
    results = {}
    # Modelo
    results["model"] = None
    results["y_test"] = []
    results["y_pred_test"] = []
    results["mae"] = {"train": [], "test": []}
    results["r2"] = {"train": [], "test": []}
    results["rmse"] = {"train": [], "test": []}
    return results

# TODO: Agregar más modelos si es necesario
def choose_model(model_name):
    if model_name == "DecisionTree":
        model = DecisionTreeRegressor(random_state=42)
    elif model_name == "SVM":
        model = SVR(kernel='rbf')
    else:
        raise ValueError(f"Modelo {model_name} no soportado.")
    return model

def save_regression_results(results, path_file, gaussian, model):
   
    dict_results = {}
    for key, value in results.items():
        if key != 'model' and key != 'y_test' and key != 'y_pred_test':
            dict_results[f"{key}_train"] = np.mean(value['train'])
            dict_results[f"{key}_test"] = np.mean(value['test'])
            dict_results[f"{key}_std_train"] = np.std(value['train'])
            dict_results[f"{key}_std_test"] = np.std(value['test'])
    dict_results['gaussian'] = gaussian
    dict_results['model'] = model

    df_results = pd.DataFrame([dict_results])
    if os.path.exists(path_file):
        current_results = pd.read_csv(path_file)
    else:
        current_results = pd.DataFrame()
        
    current_results = pd.concat([current_results, df_results], ignore_index=True)
    current_results.to_csv(path_file, index=False)


def configurar_logs(output_dir, timestamp):
    os.makedirs(output_dir, exist_ok=True)
    log_file = os.path.join(output_dir, f'log_{timestamp}.txt')
    def log(msg):
        print(msg)
        with open(log_file, "a") as f:
            f.write(msg + "\n")
    return log, log_file





In [35]:


PARAMS_GRID = {
    'DecisionTree': {
        'max_depth': [5, 10, 15, 20, None],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 1, 0.1],# si se requiere ampliar rango, elimino el parametro que sobra
        
    }
}

def train_test_model(data, model_name, include_osnr=True):
    """
    Entrena un modelo de regresión Decision Tree para predecir spacing
    
    Args:
        data: DataFrame con los datos
        model_name: Nombre del modelo a utilizar
        include_osnr: Si incluir OSNR como feature o no
    
    Returns:
        dict: métricas del modelo y modelo entrenado
    """
    results = initialize_regression_results()
    # 1. Preparar datos
    X, y = extract_X_y(data, include_osnr=include_osnr)
    #X_scaled = scale_features(X, type="standard")
    n_splits = 5
    
    # Convertir y a bins para estratificación
    y_bins = LabelEncoder().fit_transform(y)
    # Crear objeto para estratificación
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    #sin estratificar:
    # kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    #for train_index, test_index in kf.split(X_scaled, y):

    # Extraer parametros para el modelo
    params = PARAMS_GRID[model_name]
    for train_index, test_index in skf.split(X, y_bins):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        scaler = StandardScaler().fit(X_train) # Ajustar scaler solo con datos de entrenamiento
        # Escalar features utilizando solo datos de train
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        # 3. Definir modelo

        # # TODO: Probar con SVM. Variar parámetros.
        # svm_model = GridSearchCV(estimator=SVC(random_state=42, kernel='rbf'),
        #                          param_grid=grid_svm, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
        
        # dt_model = GridSearchCV(estimator=DecisionTreeRegressor(random_state=42),
        #                          param_grid=grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
        model = choose_model(model_name)

        model = GridSearchCV(estimator=model,
                             param_grid=params, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
        
        #log(best_model.best_params_.values()) # Imprimir mejores parámetros
        # 2 crosvalidacion - 1.
        ###################### ttttttt
        ################ tttt
        # 4. Entrenar modelo
        model.fit(X_train, y_train)
        print(model.best_params_.values())
        # 5. Evaluar modelo
        # Predicciones
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        # Métricas
        train_r2, train_rmse, train_mae = calculate_metrics(y_train, y_pred_train)
        test_r2, test_rmse, test_mae = calculate_metrics(y_test, y_pred_test)
        # Resultados
        # Acumular resultados
        accumulate_results(results, "train", train_r2, train_rmse, train_mae)
        accumulate_results(results, "test", test_r2, test_rmse, test_mae)
        results["y_test"].extend(y_test)
        results["y_pred_test"].extend(y_pred_test)
        results["model"] = model  # Último modelo entrenado

    return results


## Execute

In [36]:
# #=====================================================
# # Cargar dataset
# #=====================================================
# distancias = [0, 270]
# power = [0, 0, 9]
# gaussians = [16,24,32]
# covs = ["diag", "spherical"]
# models = ["SVM", "DecisionTree"]

# dis = 0
# power = 0
# gauss = 16
# cov = "diag"

# database = extract_df(dis, power, gauss, cov)

# model_name = "DecisionTree"  # Cambiar a "DecisionTree" o "RandomForest" según se desee

# results= train_test_model(database, model_name, include_osnr=True)

# save_regression_results(results)

# results_wo =  train_test_model(database, model_name, include_osnr=False)

# save_regression_results(results_wo)


## Full Experiment Execute

In [None]:
#=====================================================
# Crear Logger
#=====================================================
timestamp = datetime.datetime.now().strftime("%d_%H%M")
run_output_dir = os.path.join(GLOBAL_RESULTS_DIR, 'results', f"run_{timestamp}")
os.makedirs(run_output_dir, exist_ok=True)

log, log_file = configurar_logs(run_output_dir, timestamp)

#=====================================================
# Parametros
#=====================================================

distancias = [0, 270]
powers = [0, 0, 9]
gaussians = [16,24,32]
covs = ["diag", "spherical"]
# TODO: Agregar más modelos si es necesario
# - Agregar MLP -> Modelo Multimodal. => informacion en Agregar optuna.
# - Agregar ejecucion segundo plano. tmux. No hup
# - Verificar que agrega sobre logger. Hacer copias de seguridad 
models = ["DecisionTree", "SVM"]

#=====================================================
# Iterar sobre todos los escenarios
#=====================================================
for distancia in distancias:
    for power in powers:
        for gaussian in gaussians:
            for cov in covs:
                database = extract_df(distancia, power, gaussian, cov)
                for model_name in models:
                    #database = extract_df(distancia, power, gaussian, cov)
                    #model_name = model  # Cambiar a "DecisionTree" o "RandomForest" según se desee
                    output_dir = os.path.join(GLOBAL_RESULTS_DIR, 'results', f"{distancia}_{power}",f"run_{timestamp}")
                    os.makedirs(output_dir, exist_ok=True)
                    results= train_test_model(database, model_name, include_osnr=True)
                    filename = os.path.join(output_dir, f'reg_results_w.csv')
                    save_regression_results(results, filename, gaussian, model_name)

                    results_wo =  train_test_model(database, model_name, include_osnr=False)
                    filename = os.path.join(output_dir, f'reg_results_wo.csv')
                    save_regression_results(results_wo, filename, gaussian, model_name)

dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([0.1, 'scale'])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
dict_values([5])
d

KeyboardInterrupt: 