In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

data = pd.read_csv('Datasets/Iris.csv')
data = data.drop('Id', axis=1)
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
import time

# Forward Selection Algorithm from scratch (misclassification error as metric for selection)
def forward_selction(data, target, model):
    features = data.drop(target, axis=1)
    features = features.columns
    selected_features = []
    while len(features) > 0:
        remaining_features = list(set(features) - set(selected_features))
        best_feature = None
        best_accuracy = 0
        if(len(remaining_features) > 0):
            print(f"Round {len(selected_features)}")
        for feature in remaining_features:
            X = data[selected_features + [feature]]
            y = data[target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            if score > best_accuracy:
                best_feature = feature
                best_accuracy = score

        if best_accuracy < 0.9:
            break

        selected_features.append(best_feature)
        print("Best Accuracy : ", best_accuracy)
        print("Selected Feature : ", selected_features)
        print()
    return selected_features

model = KNeighborsClassifier(n_neighbors=3)

start_time = time.time()
selected_features = forward_selction(data, "Species", model)
end_time = time.time()

print("Selected features:", selected_features)
print("Time taken for forward selection: {:.2f} seconds".format(end_time - start_time))

Round 0
Best Accuracy :  1.0
Selected Feature :  ['PetalWidthCm']

Round 1
Best Accuracy :  1.0
Selected Feature :  ['PetalWidthCm', 'SepalWidthCm']

Round 2
Best Accuracy :  1.0
Selected Feature :  ['PetalWidthCm', 'SepalWidthCm', 'SepalLengthCm']

Round 3
Best Accuracy :  1.0
Selected Feature :  ['PetalWidthCm', 'SepalWidthCm', 'SepalLengthCm', 'PetalLengthCm']

Selected features: ['PetalWidthCm', 'SepalWidthCm', 'SepalLengthCm', 'PetalLengthCm']
Time taken for forward selection: 0.05 seconds


In [23]:
def backward_selection(data , target , model):
    features = list(data.columns)
    features.remove(target)
    selected_features = features
    while selected_features and len(selected_features) > 0:
        best_accuracy = 0
        worst_feature = None
        for feature in selected_features:
            temp_features = selected_features.copy()
            temp_features.remove(feature)
            X = data[temp_features]
            y = data[target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            if score > best_accuracy:
                best_accuracy = score
                worst_feature = feature

        if best_accuracy > 0.9:
            break

        selected_features.remove(worst_feature)
        print("Best Accuracy : ", best_accuracy)
        print("Removed Feature : ", worst_feature)
        print()
    return selected_features

model = KNeighborsClassifier(n_neighbors=3)

start_time = time.time()
selected_features = backward_selection(data, "Species", model)
end_time = time.time()

print("Selected features:", selected_features)
print("Time taken for backward selection: {:.2f} seconds".format(end_time - start_time))

Selected features: ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
Time taken for backward selection: 0.03 seconds
