# Notebook de Selección de Características (Feature Selection)

Este notebook implementa una variedad de técnicas de selección de características solicitadas, aplicadas al fichero `cleaned_main_financial_metrics.csv`.

**Aviso Importante:** La lista proporcionada es extremadamente extensa y académica.
* **Implementados:** Se implementarán los métodos más comunes y accesibles, disponibles en las librerías habituales del ecosistema científico de Python (`scikit-learn`, `scipy`).
* **No Implementados:** Muchos métodos (ej. Bi-normal separation, TNoM, GRASP, VNS, Scatter Search, ACO, PSO, Algoritmos de Estimación de Distribución, etc.) son muy especializados, no están en `scikit-learn` y requerirían implementaciones personalizadas complejas o librerías de nicho. Se dejará constancia de ellos.


## 0. Importar Librerías

In [26]:
import pandas as pd
import numpy as np
import time
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')


# Filter Methods
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from scipy.stats import mannwhitneyu, kruskal

# Wrapper Methods
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, make_scorer

# Models (Non-probabilistic)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## 1. Carga y Preparación de Datos

In [27]:
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder


# --- Column roles -----------------------------------------------------------
# Label column, plus the columns that are removed before building X.
TARGET_VARIABLE = 'recommendationClass'
COLUMNS_TO_DROP = [
    'recommendationKey',
    'recommendationMean',
    'shortName',
    'symbol',
    'Ticker',
    'fullExchangeName',
    'twoHundredDayAverage',
]

# --- Load the cleaned metrics table -----------------------------------------
df = pd.read_csv('../cleaned_main_financial_metrics.csv')

# Candidate features: every column that is neither dropped nor the target.
ALL_COLS = df.columns.tolist()
FEATURE_COLS = [
    col for col in ALL_COLS
    if col not in (*COLUMNS_TO_DROP, TARGET_VARIABLE)
]

# Split features by dtype: float64 columns are treated as numeric; object,
# bool and integer columns are treated as categorical.
NUMERIC_FEATURES = list(
    df[FEATURE_COLS].select_dtypes(include=['float64']).columns
)
CATEGORICAL_FEATURES = list(
    df[FEATURE_COLS].select_dtypes(include=['object', 'bool', 'int32', 'int64']).columns
)

# Only drop the columns that are actually present in the file.
existing_cols_to_drop = [col for col in COLUMNS_TO_DROP if col in ALL_COLS]
df_processed = df.drop(columns=existing_cols_to_drop)

# --- Separate target and features -------------------------------------------
X = df_processed.drop(columns=[TARGET_VARIABLE])
y = df_processed[TARGET_VARIABLE]

# Encode the target as consecutive integer labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"Clases del objetivo: {list(le.classes_)}")

# Lower-case aliases used by the following cells; indexing X with the lists
# also fails fast if any listed column is missing from X.
numeric_features = X.loc[:, NUMERIC_FEATURES].columns.tolist()
categorical_features = X.loc[:, CATEGORICAL_FEATURES].columns.tolist()

print(f"Características numéricas ({len(numeric_features)}): {numeric_features}")
print(f"Características categóricas ({len(categorical_features)}): {categorical_features}")

# Integer-encode each categorical column in place, with an independent
# encoder per column. The encoders are fitted on the full X, i.e. before the
# train/test split performed later in this file.
for col in categorical_features:
    encoded_values = LabelEncoder().fit_transform(X[col])
    X[col] = encoded_values


Clases del objetivo: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
Características numéricas (43): ['numberOfAnalystOpinions', 'currentPrice', 'allTimeHigh', 'allTimeLow', 'beta', 'averageVolume', 'marketCap', 'enterpriseValue', 'priceToBook', 'enterpriseToRevenue', 'profitMargins', 'grossMargins', 'ebitdaMargins', 'operatingMargins', 'returnOnAssets', 'returnOnEquity', 'totalRevenue', 'revenuePerShare', 'grossProfits', 'ebitda', 'netIncomeToCommon', 'trailingEps', 'totalCash', 'totalCashPerShare', 'totalDebt', 'quickRatio', 'currentRatio', 'bookValue', 'operatingCashflow', 'freeCashflow', 'trailingAnnualDividendYield', 'payoutRatio', 'sharesOutstanding', 'floatShares', 'sharesShort', 'sharesPercentSharesOut', 'shortRatio', 'shortPercentOfFloat', 'heldPercentInsiders', 'heldPercentInstitutions', 'fullTimeEmployees', '_debtToEquity', '_PER']
Características categóricas (6): ['sector', 'state', 'In_SP500', 'In_NASDAQ', 'is_Insolvent', 'has_benefits']


In [34]:
# --- Stratified train/test split (70/30, fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded,
)

# Build the raw (unscaled) export frames: features plus a 'target' column.
# NOTE(review): 'numberOfAnalystOpinions' is removed only from these exported
# frames. X_train / X_test keep the column, and the scaled exports written
# later in this file are built from numeric_features, which still lists it.
# Confirm whether that difference is intended.
train = X_train.assign(target=y_train).drop(columns=['numberOfAnalystOpinions'])
test = X_test.assign(target=y_test).drop(columns=['numberOfAnalystOpinions'])

# Train rows first, then test rows, keeping the original row labels.
full_dataset = pd.concat([train, test], axis=0)

# Persist the three frames next to the notebook (row index not written).
for _frame, _csv_name in (
    (train, 'train_main_financial_metrics.csv'),
    (test, 'test_main_financial_metrics.csv'),
    (full_dataset, 'full_main_financial_metrics.csv'),
):
    _frame.to_csv(_csv_name, index=False)

In [29]:
# # --- Seleccionar columnas categóricas para One-Hot ---
# encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

# cat_cols = [
#     col for col in df[FEATURE_COLS].select_dtypes(include=['object', 'int32', 'int64']).columns
#     if col in X_train.columns and (
#         X_train[col].nunique() > 2 or
#         (X_train[col].nunique() == 2 and set(X_train[col].dropna().unique()) != {0, 1})
#     )
# ]

# binary_cat_cols = [col for col in X_train.columns if col not in cat_cols and X_train[col].nunique() == 2]

# # --- One-Hot encode -----------------------------------------------------
# X_train_encoded_cat = pd.DataFrame(
#     encoder.fit_transform(X_train[cat_cols]),
#     columns=encoder.get_feature_names_out(cat_cols),
#     index=X_train.index
# )

# X_test_encoded_cat = pd.DataFrame(
#     encoder.transform(X_test[cat_cols]),
#     columns=encoder.get_feature_names_out(cat_cols),
#     index=X_test.index
# )

# processed_cat_columns = X_train_encoded_cat.columns.tolist() + binary_cat_cols

# X_train_categorical = pd.concat([X_train[binary_cat_cols], X_train_encoded_cat], axis=1)
# X_test_categorical = pd.concat([X_test[binary_cat_cols], X_test_encoded_cat], axis=1)

# The one-hot encoding step above is commented out, so the categorical
# columns are passed through as they are in X_train / X_test.
processed_cat_columns = list(CATEGORICAL_FEATURES)

# Independent copies so later edits do not touch X_train / X_test.
X_train_categorical = X_train.loc[:, processed_cat_columns].copy()
X_test_categorical = X_test.loc[:, processed_cat_columns].copy()



In [30]:
def _scale_numeric(scaler, fit_frame, apply_frame, columns):
    """Fit ``scaler`` on ``fit_frame[columns]`` and scale both frames.

    Parameters
    ----------
    scaler : object exposing ``fit_transform`` and ``transform``
        The scaler instance; it is fitted in place on ``fit_frame`` only.
    fit_frame, apply_frame : pandas.DataFrame
        Frame used to fit the scaler, and frame that is only transformed.
    columns : list of str
        Columns to scale; they must exist in both frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fit_scaled, apply_scaled)``, each holding only ``columns`` and
        keeping the row index of its source frame.
    """
    fit_scaled = pd.DataFrame(
        scaler.fit_transform(fit_frame[columns]),
        columns=columns,
        index=fit_frame.index,
    )
    apply_scaled = pd.DataFrame(
        scaler.transform(apply_frame[columns]),
        columns=columns,
        index=apply_frame.index,
    )
    return fit_scaled, apply_scaled


def _attach_target(features, target):
    """Return a copy of ``features`` with ``target`` added as column 'target'."""
    labelled = features.copy()
    labelled["target"] = target
    return labelled


def _save_split(train_frame, test_frame, tag, scaler_label):
    """Write the train/test frames to CSV (no index) and report the file names.

    Files are named ``train_<tag>_main_financial_metrics.csv`` and
    ``test_<tag>_main_financial_metrics.csv`` in the working directory.
    """
    train_path = f"train_{tag}_main_financial_metrics.csv"
    test_path = f"test_{tag}_main_financial_metrics.csv"
    train_frame.to_csv(train_path, index=False)
    test_frame.to_csv(test_path, index=False)
    print(f"✅ Archivos guardados con {scaler_label}")
    print(train_path)
    print(test_path)


# --- STANDARD SCALER -----------------------------------------------------
# The scaler is fitted on the training rows only and then applied to test.
scaler_std = StandardScaler()
X_train_scaled_num, X_test_scaled_num = _scale_numeric(
    scaler_std, X_train, X_test, numeric_features
)

# Scaled numeric columns followed by the (unscaled) categorical columns.
X_train_std_final = pd.concat([X_train_scaled_num, X_train_categorical], axis=1)
X_test_std_final = pd.concat([X_test_scaled_num, X_test_categorical], axis=1)

# Labelled copies used for the CSV export.
train_std_final = _attach_target(X_train_std_final, y_train)
test_std_final = _attach_target(X_test_std_final, y_test)

_save_split(train_std_final, test_std_final, "std", "StandardScaler")

# --- MIN-MAX SCALER ------------------------------------------------------
# Same pipeline with a MinMaxScaler, again fitted on the training rows only.
scaler_minmax = MinMaxScaler()
X_train_minmax_num, X_test_minmax_num = _scale_numeric(
    scaler_minmax, X_train, X_test, numeric_features
)

X_train_minmax_final = pd.concat([X_train_minmax_num, X_train_categorical], axis=1)
X_test_minmax_final = pd.concat([X_test_minmax_num, X_test_categorical], axis=1)

train_minmax_final = _attach_target(X_train_minmax_final, y_train)
test_minmax_final = _attach_target(X_test_minmax_final, y_test)

_save_split(train_minmax_final, test_minmax_final, "minmax", "MinMaxScaler")

✅ Archivos guardados con StandardScaler
train_std_main_financial_metrics.csv
test_std_main_financial_metrics.csv
✅ Archivos guardados con MinMaxScaler
train_minmax_main_financial_metrics.csv
test_minmax_main_financial_metrics.csv


In [31]:
# from imblearn.over_sampling import SMOTE

# # --- SMOTE sobre train (solo StandardScaler como ejemplo) ---
# smote = SMOTE(random_state=42)
# X_train_balanced, y_train_balanced = smote.fit_resample(
#     train_std_final.drop(columns=["target"]),
#     train_std_final["target"]
# )

# train_balanced_df = X_train_balanced.copy()
# train_balanced_df["target"] = y_train_balanced

# # Guardar CSV reequilibrado
# display(train_balanced_df["target"].value_counts())
# train_balanced_df.to_csv("train_std_balanced.csv", index=False)
# print("✅ Train reequilibrado con SMOTE guardado: train_std_balanced.csv")

In [32]:
import gc

# Final modelling sets: the StandardScaler variant is the one used from here on.
# NOTE: this rebinds X_train / X_test / y_train / y_test and then deletes the
# intermediate frames, so the cell cannot be re-run on its own; the cells that
# build those frames must be executed again first.
X_train, X_test = X_train_std_final.copy(), X_test_std_final.copy()

# Labels are read back from the 'target' column of the exported frames, so
# they carry the same row index as the feature frames.
y_train, y_test = train_std_final['target'].copy(), test_std_final['target'].copy()

# Remove the intermediate frames from the namespace.
del (
    X_train_std_final, X_test_std_final,
    X_train_minmax_final, X_test_minmax_final,
    train_std_final, test_std_final,
    train_minmax_final, test_minmax_final,
)

# Force a garbage-collection pass; the return value is shown as the cell output.
gc.collect()


783