In [2]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# define dataset
X, y = make_classification(
    n_samples=1000,
    n_features=3,
    n_informative=2,
    n_redundant=1,
    random_state=1
)

# define model
model = DecisionTreeClassifier()

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean Accuracy: 0.808 (0.031)


In [5]:
from itertools import product
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Definir el dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=2,
                           n_redundant=3, random_state=1)

# Determinar el número de columnas (features)
n_cols = X.shape[1]
best_subset, best_score = None, 0.0

# Enumerar todas las combinaciones posibles de features
for subset in product([True, False], repeat=n_cols):
    # Convertir la combinación booleana a índices de columnas
    ix = [i for i, x in enumerate(subset) if x]

    # Omitir combinaciones sin columnas seleccionadas
    if len(ix) == 0:
        continue

    # Seleccionar columnas
    X_new = X[:, ix]

    # Definir el modelo
    model = DecisionTreeClassifier()

    # Definir el procedimiento de evaluación
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    # Evaluar el modelo
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=cv, n_jobs=-1)

    # Calcular el promedio de los scores
    result = mean(scores)

    # Mostrar progreso
    print('>f(%s) = %f' % (ix, result))

    # Verificar si es el mejor resultado hasta ahora
    if best_score is None or result >= best_score:
        best_subset, best_score = ix, result

# Reportar el mejor subconjunto
print('Done!')
print('f(%s) = %f' % (best_subset, best_score))

[0, 1, 2, 3, 4]
False

>f([0, 1, 2, 3, 4]) = 0.819667
[0, 1, 2, 3]
False

>f([0, 1, 2, 3]) = 0.820667
[0, 1, 2, 4]
False

>f([0, 1, 2, 4]) = 0.813000
[0, 1, 2]
False

>f([0, 1, 2]) = 0.824333
[0, 1, 3, 4]
False

>f([0, 1, 3, 4]) = 0.821000
[0, 1, 3]
False

>f([0, 1, 3]) = 0.824667
[0, 1, 4]
False

>f([0, 1, 4]) = 0.806333
[0, 1]
False

>f([0, 1]) = 0.821667
[0, 2, 3, 4]
False

>f([0, 2, 3, 4]) = 0.826667
[0, 2, 3]
False

>f([0, 2, 3]) = 0.822667
[0, 2, 4]
False

>f([0, 2, 4]) = 0.825333
[0, 2]
False

>f([0, 2]) = 0.813667
[0, 3, 4]
False

>f([0, 3, 4]) = 0.828000
[0, 3]
False

>f([0, 3]) = 0.817333
[0, 4]
False

>f([0, 4]) = 0.813333
[0]
False

>f([0]) = 0.639333
[1, 2, 3, 4]
False

>f([1, 2, 3, 4]) = 0.825333
[1, 2, 3]
False

>f([1, 2, 3]) = 0.821000
[1, 2, 4]
False

>f([1, 2, 4]) = 0.826667
[1, 2]
False

>f([1, 2]) = 0.821667
[1, 3, 4]
False

>f([1, 3, 4]) = 0.820000
[1, 3]
False

>f([1, 3]) = 0.822667
[1, 4]
False

>f([1, 4]) = 0.808000
[1]
False

>f([1]) = 0.797000
[2, 3, 4]
False


In [8]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Definir dataset con muchas características (500)
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10,
                           n_redundant=490, random_state=1)

# Definir modelo
model = DecisionTreeClassifier()

# Definir procedimiento de evaluación (validación cruzada estratificada)
cv = StratifiedKFold(n_splits=3)

# Evaluar el modelo
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Reportar resultados
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Mean Accuracy: 0.913 (0.001)


In [14]:
from numpy import mean
from numpy.random import rand, choice
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Función objetivo que evalúa un subconjunto de características
def objective(X, y, subset):
    # Convertir el subset booleano en índices
    ix = [i for i, x in enumerate(subset) if x]

    # Si no hay columnas seleccionadas, retornar score 0.0
    if len(ix) == 0:
        return 0.0, []

    # Seleccionar las columnas activas
    X_new = X[:, ix]

    # Definir el modelo
    model = DecisionTreeClassifier()

    # Evaluar el modelo con validación cruzada
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=3, n_jobs=-1)

    # Calcular media del score
    result = mean(scores)

    return result, ix

# Operador de mutación
def mutate(solution, p_mutate):
    # Copiar la solución original
    child = solution.copy()

    # Aplicar mutación con probabilidad p_mutate a cada bit
    for i in range(len(child)):
        if rand() < p_mutate:
            child[i] = not child[i]
    
    return child

# Algoritmo de búsqueda local tipo Hill Climbing
def hillclimbing(X, y, objective, n_iter, p_mutate):
    # Generar punto inicial aleatorio
    solution = choice([True, False], size=X.shape[1])

    # Evaluar punto inicial
    solution_eval, ix = objective(X, y, solution)

    # Iterar
    for i in range(n_iter):
        # Generar vecino mutado
        candidate = mutate(solution, p_mutate)

        # Evaluar vecino
        candidate_eval, ix = objective(X, y, candidate)

        # Reemplazar si es mejor o igual
        if candidate_eval >= solution_eval:
            solution, solution_eval = candidate, candidate_eval

        # Mostrar progreso
        print('>%d f(%d) = %f' % (i+1, len(ix), solution_eval))

    return solution, solution_eval

# Definir dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10,
                           n_redundant=490, random_state=1)

# Definir número de iteraciones
n_iter = 100

# Probabilidad de mutación por columna (10 de 500)
p_mut = 10.0 / 500.0

# Ejecutar búsqueda Hill Climbing
subset, score = hillclimbing(X, y, objective, n_iter, p_mut)

# Obtener índices seleccionados
ix = [i for i, x in enumerate(subset) if x]

# Mostrar resultado final
print('Done!')
print('Best: f(%d) = %f' % (len(ix), score))

>1 f(276) = 0.902100
>2 f(276) = 0.902100
>3 f(277) = 0.902100
>4 f(274) = 0.902100
>5 f(275) = 0.902100
>6 f(276) = 0.902100
>7 f(272) = 0.902100
>8 f(274) = 0.902100
>9 f(275) = 0.902100
>10 f(273) = 0.902100
>11 f(272) = 0.902100
>12 f(277) = 0.902100
>13 f(274) = 0.902100
>14 f(276) = 0.902100
>15 f(273) = 0.902500
>16 f(273) = 0.902500
>17 f(272) = 0.902500
>18 f(274) = 0.904201
>19 f(267) = 0.904700
>20 f(266) = 0.906300
>21 f(264) = 0.911300
>22 f(263) = 0.911300
>23 f(260) = 0.911300
>24 f(264) = 0.911400
>25 f(260) = 0.911400
>26 f(270) = 0.911400
>27 f(265) = 0.911400
>28 f(265) = 0.911400
>29 f(260) = 0.911400
>30 f(266) = 0.911400
>31 f(266) = 0.911400
>32 f(263) = 0.911400
>33 f(269) = 0.911400
>34 f(264) = 0.911400
>35 f(266) = 0.911400
>36 f(262) = 0.911400
>37 f(267) = 0.911400
>38 f(268) = 0.911400
>39 f(262) = 0.911400
>40 f(262) = 0.911400
>41 f(262) = 0.911400
>42 f(263) = 0.911400
>43 f(263) = 0.911400
>44 f(265) = 0.911400
>45 f(259) = 0.911400
>46 f(257) = 0.9114