In [279]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [280]:
# Load the iris data set and wrap the feature matrix and the labels in
# DataFrames so columns can be selected by feature name later on.
iris = load_iris()
df_x = pd.DataFrame(iris.data, columns=iris.feature_names)
df_y = pd.DataFrame(iris.target, columns=['target'])

In [281]:
# Show the feature names that are used as the column labels of df_x.
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [282]:
# Preview the first rows of the feature frame.
df_x.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [283]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=42,
)
y_train

Unnamed: 0,target
22,0
15,0
65,1
11,0
42,0
...,...
71,1
106,2
14,0
92,1


In [284]:
# Sanity-check the sizes of the split (y_train's shape is not printed here).
print(X_train.shape,X_test.shape,y_test.shape)


(120, 4) (30, 4) (30, 1)


# ***Branch and Bound:***

To implement this algorithm we're using a simple random forest model on the iris data set.

The goal is to find the subset of features that gives the best accuracy on the test set.

In [285]:
def objective_function(selected_features):
    """Score a feature subset by the test-set accuracy of a random forest.

    Parameters
    ----------
    selected_features : list
        Column labels used to index both ``X_train`` and ``X_test``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for a forest fitted
    on the selected training columns.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``.
    """
    train_subset = X_train[selected_features]
    test_subset = X_test[selected_features]

    # The seed is fixed so the forest is built the same way on every call.
    model = RandomForestClassifier(random_state=42)
    model.fit(train_subset, y_train.values.ravel())

    return accuracy_score(y_test, model.predict(test_subset))

In [286]:
# Best subset found so far and its accuracy; nothing has been evaluated yet.
best_solution, best_performance = None, 0


In [287]:
# Seed the search with a single one-feature subset: the first feature name.
# NOTE(review): the search loop below only ever extends subsets taken from
# active_set, so subsets that do not contain this first feature are never
# evaluated - confirm this is intended.
active_set = [[iris['feature_names'][0]]]

In [288]:
# Number of subsets currently waiting to be expanded.
len(active_set)

1

In [289]:
# Peek at the training columns selected by the seed subset.
X_train[ active_set[0]]

Unnamed: 0,sepal length (cm)
22,4.6
15,5.7
65,6.7
11,4.8
42,4.4
...,...
71,6.1
106,4.9
14,5.8
92,5.8


In [290]:
# Branch and Bound Algorithm
# Pop a subset from active_set, extend it with every feature it does not
# contain yet, and score each extension with objective_function.
#
# NOTE(review): an extension that beats best_performance is recorded but is
# not pushed back on active_set, so it is never expanded itself; only the
# extensions that do NOT improve on the best are expanded further. The same
# set of features can also be scored several times in different column
# orders. Confirm that this is the intended search strategy.
while active_set:
    # Choose a solution from the active set (last in, first out)
    current_solution = active_set.pop()

    candidates = [
        feature for feature in iris['feature_names']
        if feature not in current_solution
    ]
    if not candidates:
        # Nothing left to add, so there is nothing to score for this subset.
        continue

    # The parent's score does not depend on the candidate feature: train that
    # model once per popped subset instead of once per candidate.
    current_performance = objective_function(current_solution)

    # Branching
    for next_feature in candidates:
        new_subset = current_solution + [next_feature]
        # Bounding:
        new_performance = objective_function(new_subset)
        print(f'current set of features: {current_solution}\t current performance: {current_performance}\nnew set of features: {new_subset}\t new performance: {new_performance} \n\n---------------------------------------------------------------------------\n')
        if new_performance > best_performance:
            # Update the best solution (strict ">" keeps the first subset
            # that reached a given score)
            best_solution = new_subset
            best_performance = new_performance
        else:
            active_set.append(new_subset)


current set of features: ['sepal length (cm)']	 current performance: 0.7666666666666667
new set of features: ['sepal length (cm)', 'sepal width (cm)']	 new performance: 0.7666666666666667 

---------------------------------------------------------------------------

current set of features: ['sepal length (cm)']	 current performance: 0.7666666666666667
new set of features: ['sepal length (cm)', 'petal length (cm)']	 new performance: 1.0 

---------------------------------------------------------------------------

current set of features: ['sepal length (cm)']	 current performance: 0.7666666666666667
new set of features: ['sepal length (cm)', 'petal width (cm)']	 new performance: 0.9666666666666667 

---------------------------------------------------------------------------

current set of features: ['sepal length (cm)', 'petal width (cm)']	 current performance: 0.9666666666666667
new set of features: ['sepal length (cm)', 'petal width (cm)', 'sepal width (cm)']	 new performance: 0.96

In [291]:
print("Best Feature Subset:", best_solution)
print("Best Performance:", best_performance)
# The search only replaces the stored best on a strictly higher score, so
# this reports the first subset that reached the top score.
# NOTE(review): best_performance is also assigned by the greedy section
# further down; re-running this cell after that section mixes the two results.

Best Feature Subset: ['sepal length (cm)', 'petal length (cm)']
Best Performance: 1.0


# ***Greedy Forward Selection (heuristic algorithm):***

In [306]:
def objective_function(selected_features):
    """Return the test-set accuracy of a random forest using ``selected_features``.

    Behaves the same as the earlier definition of the same name in this
    notebook; this later definition is the one in effect once the cell runs.

    Parameters
    ----------
    selected_features : list
        Column labels used to index both ``X_train`` and ``X_test``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predicted)``.
    """
    # Fit on the chosen training columns (fixed seed for a repeatable forest).
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train[selected_features], y_train.values.ravel())

    # Predict on the same columns of the held-out rows and score them.
    predicted = forest.predict(X_test[selected_features])
    return accuracy_score(y_test, predicted)

In [311]:
# Greedy selection starts from the empty subset and may grow until it holds
# one entry per column of the training frame.
selected_features = []
num_features = len(X_train.columns)

In [312]:
# Greedy forward selection: every round scores each unused feature added to
# the current subset and keeps the one with the highest accuracy.
# NOTE(review): the loop runs until every feature has been added, so the
# result is an ordering of all features rather than a reduced subset.
# NOTE(review): best_performance is the same notebook-level name the Branch
# and Bound section uses for its result, and it is overwritten here.
while len(selected_features) < num_features:
    best_feature = None
    best_performance = 0  # Initialize with a small value

    # Consider adding each feature not in the current set
    for feature in iris['feature_names']:
        if feature in selected_features:
            continue

        current_subset = selected_features + [feature]
        current_performance = objective_function(current_subset)
        print(f'Selected features: {selected_features}\nBest feature: {best_feature} \t Best performance: {best_performance}\nfeature being tested: {feature}\ncurrent set of features: {current_subset} \t current performance: {current_performance}\n\n---------------------------------------------------------------------------\n')

        if current_performance > best_performance:
            best_feature = feature
            best_performance = current_performance

    if best_feature is None:
        # No candidate scored above 0 (or none was left to try). Appending
        # None would break the next column lookup, so stop instead.
        break

    # Add the best feature to the selected set
    selected_features.append(best_feature)



Selected features: []
Best feature: None 	 Best performance: 0
feature being tested: sepal length (cm)
current set of features: ['sepal length (cm)'] 	 current performance: 0.7666666666666667

---------------------------------------------------------------------------

Selected features: []
Best feature: sepal length (cm) 	 Best performance: 0.7666666666666667
feature being tested: sepal width (cm)
current set of features: ['sepal width (cm)'] 	 current performance: 0.5333333333333333

---------------------------------------------------------------------------

Selected features: []
Best feature: sepal length (cm) 	 Best performance: 0.7666666666666667
feature being tested: petal length (cm)
current set of features: ['petal length (cm)'] 	 current performance: 0.9333333333333333

---------------------------------------------------------------------------

Selected features: []
Best feature: petal length (cm) 	 Best performance: 0.9333333333333333
feature being tested: petal width (cm)


In [313]:
# Report the features in the order they were added, then score the whole list.
print("Selected Features:", selected_features)
print("Performance:", objective_function(selected_features))

Selected Features: ['petal width (cm)', 'petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']
Performance: 1.0


# ***Ant Colony Optimisation:***

TODO: try a different data set; iris is too simple to show how the algorithm works.

In [347]:
# Colony size and number of search rounds.
num_ants = 10
num_iterations = 10  # 50

# Pheromone dynamics: the share of pheromone removed by evaporation and the
# exponent applied to the pheromone values when drawing features.
evaporation_rate = 0.5
alpha = 1.0
beta = 2.0  # NOTE(review): not referenced by any cell visible in this notebook

# Best solution seen so far; nothing has been evaluated yet.
best_features, best_accuracy = None, 0.0

In [348]:
# One pheromone value per feature column, all starting at 1.
num_features = len(X_train.columns)
pheromone_levels = np.ones(num_features, dtype=float)

In [349]:
def feature_slection(alpha, pheromone_levels):
    """Build one ant's path: a pheromone-weighted random ordering of all features.

    Features are drawn one at a time without replacement. At each draw the
    probability of a remaining feature is proportional to
    ``pheromone_levels[feature] ** alpha``.

    Parameters
    ----------
    alpha : float
        Exponent applied to the pheromone values before normalizing.
    pheromone_levels : array-like of numbers
        One pheromone value per feature. The input is not modified.

    Returns
    -------
    list of int
        Every index in ``range(len(pheromone_levels))`` exactly once, in the
        order drawn. Empty when ``pheromone_levels`` is empty.

    Notes
    -----
    Draws from NumPy's global random state (``np.random.choice``).
    """
    # Work on a float copy so integer input cannot break the normalization.
    pheromone = np.asarray(pheromone_levels, dtype=float)
    remaining = list(range(len(pheromone)))
    features_selected = []

    while remaining:
        weights = pheromone[remaining] ** alpha
        total = weights.sum()
        if total > 0 and np.isfinite(total):
            probabilities = weights / total
        else:
            # No usable pheromone is left on the remaining features (for
            # example all zeros); dividing would give NaN probabilities and
            # make np.random.choice raise, so draw uniformly instead.
            probabilities = np.full(len(remaining), 1.0 / len(remaining))

        # Convert to a plain int so paths print and index like ordinary lists.
        selected_feature = int(np.random.choice(remaining, p=probabilities))
        features_selected.append(selected_feature)
        remaining.remove(selected_feature)

    return features_selected

In [350]:
#simulate ant movement
# Start from a clean state so this cell gives the same kind of run every time
# it is executed, even without re-running the parameter cell first.
num_features = X_train.shape[1]
pheromone_levels = np.ones(num_features)
best_features = None
best_accuracy = 0.0

# NOTE(review): feature_slection returns every feature index, so each ant
# evaluates the full feature set and only the column order differs.
for iteration in range(num_iterations):
    ant_paths = []
    print(f'Iteration number {iteration}')

    # Evaporate once per iteration. Doing it once per ant would shrink the
    # deposits of the earlier ants again and again within the same iteration.
    pheromone_levels *= (1 - evaporation_rate)
    # Deposits are collected here and applied after all ants have finished,
    # so every ant of this iteration draws from the same pheromone levels.
    deposits = np.zeros(num_features)

    for ant in range(num_ants):
        features_selected = feature_slection(alpha, pheromone_levels)
        ant_paths.append(features_selected)

        features_names = [iris['feature_names'][i] for i in features_selected]

        # Evaluate solution with the shared scoring helper (same model,
        # same seed, same column order as selecting by position).
        accuracy = objective_function(features_names)
        print(f'\t\tPaths:{ant_paths} \t Selected features to be tested: {features_names} \t accuracy: {accuracy} \n\t\t---------------------------------------')

        # Record this ant's pheromone deposit
        deposits[features_selected] += accuracy

        # Update global best solution
        if accuracy > best_accuracy:
            best_features = features_selected
            best_accuracy = accuracy
            print(f'\t\t!!!!!!best features:{best_features} \t accuracy: {best_accuracy} \n---------------------------------------')

    # Update pheromone levels with everything deposited in this iteration
    pheromone_levels += deposits



Iteration number 0
		Paths:[[0, 3, 1, 2]] 	 Selected features to be tested: ['sepal length (cm)', 'petal width (cm)', 'sepal width (cm)', 'petal length (cm)'] 	 accuracy: 1.0 
		---------------------------------------
		!!!!!!best features:[0, 3, 1, 2] 	 accuracy: 1.0 
---------------------------------------
		Paths:[[0, 3, 1, 2], [0, 3, 1, 2]] 	 Selected features to be tested: ['sepal length (cm)', 'petal width (cm)', 'sepal width (cm)', 'petal length (cm)'] 	 accuracy: 1.0 
		---------------------------------------
		Paths:[[0, 3, 1, 2], [0, 3, 1, 2], [0, 3, 1, 2]] 	 Selected features to be tested: ['sepal length (cm)', 'petal width (cm)', 'sepal width (cm)', 'petal length (cm)'] 	 accuracy: 1.0 
		---------------------------------------
		Paths:[[0, 3, 1, 2], [0, 3, 1, 2], [0, 3, 1, 2], [3, 0, 1, 2]] 	 Selected features to be tested: ['petal width (cm)', 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)'] 	 accuracy: 1.0 
		---------------------------------------
		Paths:[

In [346]:
# Translate the winning column indices back to feature names for display.
# NOTE(review): the saved execution count of this cell (346) is lower than
# those of the cells that set best_features (347-350), so the stored output
# may come from an earlier run; re-run the notebook top to bottom to refresh.
best_features_names=[(iris['feature_names'])[i] for i in best_features]
print("Best Feature Subset:", best_features_names)
print("Best Accuracy:", best_accuracy)

Best Feature Subset: ['petal width (cm)', 'petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']
Best Accuracy: 1.0
