# Notebook de Selección de Características (Feature Selection)

Este notebook implementa una variedad de técnicas de selección de características solicitadas, aplicadas al fichero `cleaned_main_financial_metrics.csv`.

**Aviso Importante:** La lista proporcionada es extremadamente extensa y académica.
* **Implementados:** Se implementarán los métodos más comunes y accesibles que se encuentran en las librerías estándar de Python (`scikit-learn`, `scipy`).
* **No Implementados:** Muchos métodos (ej. Bi-normal separation, TNoM, GRASP, VNS, Scatter Search, ACO, PSO, Algoritmos de Estimación de Distribución, etc.) son muy especializados, no están en `scikit-learn` y requerirían implementaciones personalizadas complejas o librerías de nicho. Se dejará constancia de ellos.


## 0. Importar Librerías

In [9]:
import pandas as pd
import numpy as np
import time
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

## 1. Carga y Preparación de Datos

In [10]:
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder

# Label column, plus the identifier/leaky columns that never enter the feature matrix.
TARGET_VARIABLE = 'recommendationClass'
COLUMNS_TO_DROP = [
    'recommendationKey',
    'recommendationMean',
    'state',
    'shortName',
    'symbol',
    'Ticker',
    'fullExchangeName',
    'twoHundredDayAverage',
]

df = pd.read_csv('../cleaned_main_financial_metrics.csv')

# Optional filter, currently disabled:
# df=df[df['numberOfAnalystOpinions']>4].copy()
# Keep only the rows with a strictly positive enterprise value.
df = df.loc[df['enterpriseValue'] > 0].copy()
print(df.shape)

# Feature columns = every column except the dropped ones and the target (original order kept).
ALL_COLS = list(df.columns)
FEATURE_COLS = [
    name for name in ALL_COLS
    if name != TARGET_VARIABLE and name not in COLUMNS_TO_DROP
]

# dtype-based typing: float64 columns are treated as numeric, everything
# object/bool/integer is treated as categorical.
NUMERIC_FEATURES = list(df[FEATURE_COLS].select_dtypes(include=['float64']).columns)
CATEGORICAL_FEATURES = list(
    df[FEATURE_COLS].select_dtypes(include=['object', 'bool', 'int32', 'int64']).columns
)

# Drop only the configured columns that are actually present in the file.
existing_cols_to_drop = [name for name in COLUMNS_TO_DROP if name in ALL_COLS]
df_processed = df.drop(columns=existing_cols_to_drop)

# Separate the target from the feature matrix.
y = df_processed[TARGET_VARIABLE].copy()
X = df_processed.drop(columns=TARGET_VARIABLE).copy()

# Encode the target classes as consecutive integers.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"Clases del objetivo: {list(le.classes_)}")

numeric_features = list(X[NUMERIC_FEATURES].columns)
categorical_features = list(X[CATEGORICAL_FEATURES].columns)

print(f"Características numéricas ({len(numeric_features)}): {numeric_features}")
print(f"Características categóricas ({len(categorical_features)}): {categorical_features}")

# Integer-encode every categorical feature in place (one throwaway encoder per column).
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])


(1498, 56)
Clases del objetivo: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
Características numéricas (43): ['numberOfAnalystOpinions', 'currentPrice', 'allTimeHigh', 'allTimeLow', 'beta', 'averageVolume', 'marketCap', 'enterpriseValue', 'priceToBook', 'enterpriseToRevenue', 'profitMargins', 'grossMargins', 'ebitdaMargins', 'operatingMargins', 'returnOnAssets', 'returnOnEquity', 'totalRevenue', 'revenuePerShare', 'grossProfits', 'ebitda', 'netIncomeToCommon', 'trailingEps', 'totalCash', 'totalCashPerShare', 'totalDebt', 'quickRatio', 'currentRatio', 'bookValue', 'operatingCashflow', 'freeCashflow', 'trailingAnnualDividendYield', 'payoutRatio', 'sharesOutstanding', 'floatShares', 'sharesShort', 'sharesPercentSharesOut', 'shortRatio', 'shortPercentOfFloat', 'heldPercentInsiders', 'heldPercentInstitutions', 'fullTimeEmployees', '_debtToEquity', '_PER']
Características categóricas (5): ['sector', 'In_SP500', 'In_NASDAQ', 'is_Insolvent', 'has_benefits']


In [11]:
# --- Stratified train/test split (80/20, fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Disabled alternative: use the whole dataset as both train and test.
# X_train =  X
# y_train = y_encoded

# X_test=X
# y_test=y_encoded

# Feature frames with the encoded label appended as a 'target' column.
train = X_train.assign(target=y_train)
test = X_test.assign(target=y_test)

# Train rows followed by test rows, stacked vertically.
full_dataset = pd.concat([train, test])

# Disabled CSV exports:
# train.to_csv(r'WEKA_datasets/train_main_financial_metrics.csv', index=False)
# test.to_csv(r'WEKA_datasets/test_main_financial_metrics.csv', index=False)
# full_dataset.to_csv(r'WEKA_datasets/full_main_financial_metrics.csv', index=False)

In [12]:
# # --- Seleccionar columnas categóricas para One-Hot ---
# encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

# cat_cols = [
#     col for col in df[FEATURE_COLS].select_dtypes(include=['object', 'int32', 'int64']).columns
#     if col in X_train.columns and (
#         X_train[col].nunique() > 2 or
#         (X_train[col].nunique() == 2 and set(X_train[col].dropna().unique()) != {0, 1})
#     )
# ]

# binary_cat_cols = [col for col in X_train.columns if col not in cat_cols and X_train[col].nunique() == 2]

# # --- One-Hot encode -----------------------------------------------------
# X_train_encoded_cat = pd.DataFrame(
#     encoder.fit_transform(X_train[cat_cols]),
#     columns=encoder.get_feature_names_out(cat_cols),
#     index=X_train.index
# )

# X_test_encoded_cat = pd.DataFrame(
#     encoder.transform(X_test[cat_cols]),
#     columns=encoder.get_feature_names_out(cat_cols),
#     index=X_test.index
# )

# processed_cat_columns = X_train_encoded_cat.columns.tolist() + binary_cat_cols

# X_train_categorical = pd.concat([X_train[binary_cat_cols], X_train_encoded_cat], axis=1)
# X_test_categorical = pd.concat([X_test[binary_cat_cols], X_test_encoded_cat], axis=1)

# processed_cat_columns = CATEGORICAL_FEATURES.copy()

# X_train_categorical = X_train[processed_cat_columns].copy()
# X_test_categorical = X_test[processed_cat_columns].copy()



In [None]:
# --- STANDARD SCALER -----------------------------------------------------
# Fit on the training rows only, then apply the same statistics to train and test.
scaler_std = StandardScaler().fit(X_train[NUMERIC_FEATURES])

X_train_std = X_train.copy()
X_train_std[NUMERIC_FEATURES] = scaler_std.transform(X_train[NUMERIC_FEATURES])

X_test_std = X_test.copy()
X_test_std[NUMERIC_FEATURES] = scaler_std.transform(X_test[NUMERIC_FEATURES])

# Standardized frames with the label appended as 'target'.
train_std = X_train_std.assign(target=y_train)
test_std = X_test_std.assign(target=y_test)

# Disabled CSV export (StandardScaler):
# train_std.to_csv(r"WEKA_datasets/train_std_main_financial_metrics.csv", index=False)
# test_std.to_csv(r"WEKA_datasets/test_std_main_financial_metrics.csv", index=False)

# print("✅ Archivos guardados con StandardScaler")
# print("train_std_main_financial_metrics.csv")
# print("test_std_main_financial_metrics.csv")

# --- MIN-MAX SCALER -----------------------------------------------------
# Same procedure with min-max scaling: fit on train, transform both splits.
scaler_minmax = MinMaxScaler().fit(X_train[NUMERIC_FEATURES])

X_train_minmax = X_train.copy()
X_train_minmax[NUMERIC_FEATURES] = scaler_minmax.transform(X_train[NUMERIC_FEATURES])

X_test_minmax = X_test.copy()
X_test_minmax[NUMERIC_FEATURES] = scaler_minmax.transform(X_test[NUMERIC_FEATURES])

# Min-max scaled frames with the label appended as 'target'.
train_minmax = X_train_minmax.assign(target=y_train)
test_minmax = X_test_minmax.assign(target=y_test)

# Disabled CSV export (MinMaxScaler):
# train_minmax.to_csv(r"WEKA_datasets/train_minmax_main_financial_metrics.csv", index=False)
# test_minmax.to_csv(r"WEKA_datasets/test_minmax_main_financial_metrics.csv", index=False)

# print("✅ Archivos guardados con MinMaxScaler")
# print(r"WEKA_datasets/train_minmax_main_financial_metrics.csv")
# print(r"WEKA_datasets/test_minmax_main_financial_metrics.csv")

✅ Archivos guardados con StandardScaler
train_std_main_financial_metrics.csv
test_std_main_financial_metrics.csv


In [14]:
from imblearn.over_sampling import SMOTE

# --- SMOTE on the training split only: first the unscaled frame, then the standardized one ---
# The same sampler object (random_state=42) is reused for both resampling calls.
# NOTE(review): every feature column is passed to SMOTE, including the label-encoded
# categorical ones (sector and the binary flags), so they are resampled as if they were
# numeric. imbalanced-learn's SMOTENC may be the better fit here — confirm.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(
    train.drop(columns=["target"]),
    train["target"]
)

X_train_std_balanced, y_train_std_balanced = smote.fit_resample(
    train_std.drop(columns=["target"]),
    train_std["target"]
)

# Re-attach the resampled labels as the 'target' column of each balanced frame.
train_balanced_df = X_train_balanced.copy()
train_balanced_df["target"] = y_train_balanced

train_std_balanced_df = X_train_std_balanced.copy()
train_std_balanced_df["target"] = y_train_std_balanced

# Show the class counts after balancing and write both balanced training sets to CSV.
# NOTE(review): these file names spell "finantial", while the processed datasets written
# later in the notebook use "financial" — confirm which spelling downstream tools expect.
# The WEKA_datasets directory is not created here, so it has to exist already.
display(train_balanced_df["target"].value_counts())
train_balanced_df.to_csv(r"WEKA_datasets/train_bal_main_finantial_metrics.csv", index=False)
train_std_balanced_df.to_csv(r"WEKA_datasets/train_std_bal_main_finantial_metrics.csv", index=False)

# Only the standardized file is mentioned in the message, although two files are written above.
print("✅ Train reequilibrado con SMOTE guardado: train_std_bal_main_finantial_metrics.csv")

target
2    365
3    365
1    365
4    365
0    365
Name: count, dtype: int64

✅ Train reequilibrado con SMOTE guardado: train_std_bal_main_finantial_metrics.csv


In [15]:
# Define the final modelling sets; the StandardScaler versions are used from here on.
# NOTE(review): this rebinds X_train/X_test to the standardized frames, so the raw split
# is no longer reachable under these names, and re-running the scaling cell afterwards
# would fit the scalers on already-standardized data.
X_train = X_train_std.copy()
X_test = X_test_std.copy()
# The labels are taken from the 'target' column of the standardized frames.
y_train = train_std['target'].copy()
y_test = test_std['target'].copy()


# Remove the intermediate frames from the namespace.
# This makes the cell non-idempotent: running it a second time fails with NameError
# because the names referenced above no longer exist.
del X_train_std, X_test_std
del X_train_minmax, X_test_minmax
del train_std, test_std
del train_minmax, test_minmax

# Force a garbage-collection pass; the value returned by gc.collect() is displayed
# as the cell output.
import gc
gc.collect()


939

In [16]:
import numpy as np
import pandas as pd

def apply_best_transformations(df, y, transformations, lower_quantile=0.005, upper_quantile=0.995):
    """
    Apply per-column transformations to a feature frame and drop outlier rows.

    Supported method names (the values of ``transformations``):

    * ``'LOG'``: log-transform the column. ``np.log(x)`` when the column minimum
      is > 0, ``np.log1p(x)`` when the minimum is exactly 0, and
      ``np.log(x + |min| + 1)`` when the column contains negative values.
    * ``'CLIPPED'``: despite the name, nothing is clipped. Rows whose value lies
      strictly outside ``[q_low, q_high]`` (the ``lower_quantile`` and
      ``upper_quantile`` quantiles of the column) are marked and removed from
      both ``df`` and ``y`` once all columns have been processed. Quantiles are
      always computed on the full input, before any row is removed.
    * ``'ORIGINAL'``: the column is left untouched.

    Any other method name is reported with a warning and the column is left
    unchanged. Columns named in ``transformations`` but absent from ``df`` are
    skipped with a warning. An exception raised while processing one column is
    printed and that column is skipped (best effort).

    Args:
        df (pd.DataFrame): Feature frame. It is not modified.
        y (pd.Series | pd.DataFrame): Target sharing ``df``'s index; the rows
            removed from ``df`` are removed from ``y`` by index label.
        transformations (dict): Maps column name -> method name (see above).
        lower_quantile (float): Lower quantile used by ``'CLIPPED'``.
        upper_quantile (float): Upper quantile used by ``'CLIPPED'``.

    Returns:
        tuple: ``(processed_df, y)``: the transformed copy of ``df`` and the
        target restricted to the surviving rows.

    Raises:
        ValueError: If the quantiles do not satisfy
            ``0 <= lower_quantile <= upper_quantile <= 1``.
    """
    if not 0.0 <= lower_quantile <= upper_quantile <= 1.0:
        raise ValueError(
            "lower_quantile and upper_quantile must satisfy "
            f"0 <= lower_quantile <= upper_quantile <= 1 (got {lower_quantile}, {upper_quantile})"
        )

    # Work on a copy so the caller's DataFrame is never modified.
    processed_df = df.copy()

    # Index labels of every row flagged as an outlier by any 'CLIPPED' column.
    indices_to_drop = set()

    print(f"Aplicando transformaciones a {len(transformations)} features...")

    for col, method in transformations.items():
        # Skip plan entries that do not correspond to a column of the frame.
        if col not in processed_df.columns:
            print(f"Warning: La columna '{col}' del diccionario no está en el DataFrame. Omitiendo.")
            continue

        try:
            if method == 'LOG':
                min_val = processed_df[col].min()

                # min() is NaN only when the column has no valid value at all.
                if pd.isna(min_val):
                    print(f" - {col}: Omitido (todos NaN).")
                    continue

                if min_val > 0:
                    processed_df[col] = np.log(processed_df[col])
                    print(f" - {col}: Aplicado np.log(x)")
                elif min_val >= 0:
                    # Minimum is exactly zero: log1p keeps the zeros finite.
                    processed_df[col] = np.log1p(processed_df[col])
                    print(f" - {col}: Aplicado np.log1p(x)")
                else:
                    # Shift so the smallest value maps to 1 before taking the log.
                    shift = abs(min_val) + 1
                    processed_df[col] = np.log(processed_df[col] + shift)
                    print(f" - {col}: Aplicado np.log(x + {shift:.1f})")

            elif method == 'CLIPPED':
                # Rows are not clipped; they are only marked here and removed at the end.
                q_low = processed_df[col].quantile(lower_quantile)
                q_high = processed_df[col].quantile(upper_quantile)

                if pd.isna(q_low) or pd.isna(q_high):
                    print(f" - {col}: Omitido (no se pudieron calcular cuantiles, ¿NaNs?).")
                    continue

                # NaN values compare False on both sides, so they are never flagged.
                outside = (processed_df[col] < q_low) | (processed_df[col] > q_high)
                outlier_indices = processed_df.index[outside.to_numpy()]
                indices_to_drop.update(outlier_indices)

                if len(outlier_indices) > 0:
                    print(f" - {col}: Marcadas {len(outlier_indices)} filas como outliers (fuera de [{q_low:.2f}, {q_high:.2f}])")
                else:
                    print(f" - {col}: No se encontraron outliers (rango [{q_low:.2f}, {q_high:.2f}])")

            elif method == 'ORIGINAL':
                # No transformation required.
                print(f" - {col}: Mantenido como 'ORIGINAL'.")

            else:
                # Previously an unknown method was ignored without any feedback.
                print(f"Warning: Método '{method}' desconocido para la columna '{col}'. Omitiendo.")

        except Exception as e:
            # Best effort: report the failure and continue with the next column.
            print(f"Error transformando la columna '{col}' con método '{method}': {e}. Omitiendo.")

    # --- Final step: remove every flagged row from both the features and the target ---
    if indices_to_drop:
        print(f"\nEliminando un total de {len(indices_to_drop)} filas únicas marcadas como outliers...")
        rows_to_remove = list(indices_to_drop)
        processed_df = processed_df.drop(index=rows_to_remove)
        y = y.drop(index=rows_to_remove)
        print(f"Tamaño final del DataFrame: {len(processed_df)} filas.")
    else:
        print("\nNo se eliminaron filas por outliers.")

    print("Transformaciones completadas.")
    return processed_df, y


In [17]:
# Transformation chosen for each numeric feature, consumed by apply_best_transformations.
# Insertion order is the processing (and logging) order.
best_transformations = {
    'numberOfAnalystOpinions': 'ORIGINAL',
    'currentPrice': 'LOG',
    'allTimeHigh': 'LOG',
    'allTimeLow': 'LOG',
    'beta': 'CLIPPED',
    'averageVolume': 'LOG',
    'marketCap': 'LOG',
    'enterpriseValue': 'LOG',
    'priceToBook': 'LOG',
    'enterpriseToRevenue': 'LOG',
    'profitMargins': 'LOG',
    'grossMargins': 'CLIPPED',
    'ebitdaMargins': 'CLIPPED',
    'operatingMargins': 'CLIPPED',
    'returnOnAssets': 'ORIGINAL',
    'returnOnEquity': 'CLIPPED',
    'totalRevenue': 'LOG',
    'revenuePerShare': 'LOG',
    'grossProfits': 'ORIGINAL',
    'ebitda': 'ORIGINAL',
    'netIncomeToCommon': 'ORIGINAL',
    'trailingEps': 'CLIPPED',
    'totalCash': 'LOG',
    'totalCashPerShare': 'LOG',
    'totalDebt': 'LOG',
    'quickRatio': 'LOG',
    'currentRatio': 'LOG',
    'bookValue': 'LOG',
    'operatingCashflow': 'CLIPPED',
    'freeCashflow': 'CLIPPED',
    'trailingAnnualDividendYield': 'CLIPPED',
    'payoutRatio': 'LOG',
    'sharesOutstanding': 'LOG',
    'floatShares': 'LOG',
    'sharesShort': 'LOG',
    'sharesPercentSharesOut': 'LOG',
    'shortRatio': 'LOG',
    'shortPercentOfFloat': 'LOG',
    'heldPercentInsiders': 'CLIPPED',
    'heldPercentInstitutions': 'CLIPPED',
    'fullTimeEmployees': 'LOG',
    '_debtToEquity': 'LOG',
    '_PER': 'LOG',
}


In [18]:

# Apply the per-feature transformation plan, then rebuild split, scaling and SMOTE
# on the transformed data.
# NOTE(review): the transformations run on the full X / y before the split below, so the
# quantile bounds, log shifts and removed rows are derived from data that also ends up in
# the test set. Confirm this is intended, or fit them on the training rows only.
X_processed, y_processed = apply_best_transformations(X, y, best_transformations)

# Rebinds `le`: from here on it is fitted on the outlier-filtered target.
le = LabelEncoder()
y_proc_encoded = le.fit_transform(y_processed)

# --- Stratified train/test split ---
X_train_proc, X_test_proc, y_proc_train, y_proc_test = train_test_split(X_processed, y_proc_encoded, test_size=0.2, random_state=42, stratify=y_proc_encoded)

# Disabled alternative: use the whole processed dataset as both train and test.
# X_train_proc = X_processed
# y_proc_train = y_proc_encoded

# X_test_proc = X_processed
# y_proc_test = y_proc_encoded

train_proc = X_train_proc.copy()
train_proc['target'] = y_proc_train

test_proc = X_test_proc.copy()
test_proc['target'] = y_proc_test

# --- STANDARD SCALER -----------------------------------------------------
# Rebinds `scaler_std`: it is now fitted on the processed training rows.
scaler_std = StandardScaler()
scaler_std.fit(X_train_proc[NUMERIC_FEATURES])

X_train_proc_std = X_train_proc.copy()
X_train_proc_std[NUMERIC_FEATURES] = scaler_std.transform(X_train_proc[NUMERIC_FEATURES]).copy()

X_test_proc_std = X_test_proc.copy()
X_test_proc_std[NUMERIC_FEATURES] = scaler_std.transform(X_test_proc[NUMERIC_FEATURES]).copy()

# Append the target column.
train_proc_std = X_train_proc_std.copy()
test_proc_std = X_test_proc_std.copy()
train_proc_std["target"] = y_proc_train
test_proc_std["target"] = y_proc_test

# Disabled export of the standardized training set without SMOTE:
# train_proc_std.to_csv(r"WEKA_datasets/train_proc_std_main_financial_metrics.csv", index=False)

# --- SMOTE on the standardized training split ---
smote = SMOTE(random_state=42)
X_train_proc_std_balanced, y_proc_train_balanced = smote.fit_resample(
    train_proc_std.drop(columns=["target"]),
    train_proc_std["target"]
)

train_proc_std_balanced = X_train_proc_std_balanced.copy()
train_proc_std_balanced["target"] = y_proc_train_balanced

# Show the balanced class counts and write the balanced train set and the test set.
display(train_proc_std_balanced["target"].value_counts())
train_proc_std_balanced.to_csv(r"WEKA_datasets/train_processed_std_bal_main_financial_metrics.csv", index=False)

test_proc_std.to_csv(r"WEKA_datasets/test_processed_std_main_financial_metrics.csv", index=False)

# Report the files that were actually written. The previous messages were printed before
# anything was saved and named train_std_/test_std_ files that this cell never writes.
print("✅ Train procesado (StandardScaler + SMOTE) guardado: WEKA_datasets/train_processed_std_bal_main_financial_metrics.csv")
print("✅ Test procesado (StandardScaler) guardado: WEKA_datasets/test_processed_std_main_financial_metrics.csv")

Aplicando transformaciones a 43 features...
 - numberOfAnalystOpinions: Mantenido como 'ORIGINAL'.
 - currentPrice: Aplicado np.log(x)
 - allTimeHigh: Aplicado np.log(x)
 - allTimeLow: Aplicado np.log1p(x)
 - beta: Marcadas 16 filas como outliers (fuera de [-0.08, 4.01])
 - averageVolume: Aplicado np.log(x)
 - marketCap: Aplicado np.log(x)
 - enterpriseValue: Aplicado np.log(x)
 - priceToBook: Aplicado np.log(x)
 - enterpriseToRevenue: Aplicado np.log(x)
 - profitMargins: Aplicado np.log(x + 3.9)
 - grossMargins: Marcadas 8 filas como outliers (fuera de [-1.71, 1.00])
 - ebitdaMargins: Marcadas 16 filas como outliers (fuera de [-1.52, 0.90])
 - operatingMargins: Marcadas 16 filas como outliers (fuera de [-139.10, 0.82])
 - returnOnAssets: Mantenido como 'ORIGINAL'.
 - returnOnEquity: Marcadas 16 filas como outliers (fuera de [-1.90, 1.90])
 - totalRevenue: Aplicado np.log(x)
 - revenuePerShare: Aplicado np.log1p(x)
 - grossProfits: Mantenido como 'ORIGINAL'.
 - ebitda: Mantenido como '

target
4    337
1    337
2    337
3    337
0    337
Name: count, dtype: int64