# Forward selection using Logistic regression

## Representation

In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from mlxtend.feature_selection import SequentialFeatureSelector

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
class LogisticRegression:
    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y, single_feature= False):
        n_samples, n_features = X.shape 

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0

        # gradient descent
        for _ in range(self.n_iters):
            # approximate y with linear combination of weights and x, plus bias
            linear_model = np.dot(X, self.weights) + self.bias
            # apply sigmoid function
            y_predicted = self._sigmoid(linear_model)

            # compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)
            # update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))



In [3]:
def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

In [4]:
# Load csv file 
df = pd.read_csv('../datasets/mushrooms.csv', index_col=False)
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
df.shape

(8124, 23)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [7]:
from sklearn.preprocessing import LabelEncoder
def label_encoded(feat):
    le = LabelEncoder()
    le.fit(feat)
    print(feat.name,le.classes_)
#     print(le.classes_)
    return le.transform(feat)

In [8]:
for col in df.columns:
    df[str(col)] = label_encoded(df[str(col)])

class ['e' 'p']
cap-shape ['b' 'c' 'f' 'k' 's' 'x']
cap-surface ['f' 'g' 's' 'y']
cap-color ['b' 'c' 'e' 'g' 'n' 'p' 'r' 'u' 'w' 'y']
bruises ['f' 't']
odor ['a' 'c' 'f' 'l' 'm' 'n' 'p' 's' 'y']
gill-attachment ['a' 'f']
gill-spacing ['c' 'w']
gill-size ['b' 'n']
gill-color ['b' 'e' 'g' 'h' 'k' 'n' 'o' 'p' 'r' 'u' 'w' 'y']
stalk-shape ['e' 't']
stalk-root ['?' 'b' 'c' 'e' 'r']
stalk-surface-above-ring ['f' 'k' 's' 'y']
stalk-surface-below-ring ['f' 'k' 's' 'y']
stalk-color-above-ring ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
stalk-color-below-ring ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
veil-type ['p']
veil-color ['n' 'o' 'w' 'y']
ring-number ['n' 'o' 't']
ring-type ['e' 'f' 'l' 'n' 'p']
spore-print-color ['b' 'h' 'k' 'n' 'o' 'r' 'u' 'w' 'y']
population ['a' 'c' 'n' 's' 'v' 'y']
habitat ['d' 'g' 'l' 'm' 'p' 'u' 'w']


In [9]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [10]:
X = df.drop(['class'],axis=1)
y = df['class']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=3, stratify=y_train)

# Optimisation

## Hyperparameter tuning

In [12]:
learning_rates=[0.1,0.01,0.001,0.0001]

In [13]:
def hypTune(X_train, y_train, X_val, y_val):
    scores = []
    for lr in learning_rates:
        logReg = LogisticRegression(lr)
        logReg.fit(X_train, y_train)
        predict = logReg.predict(X_val)
        acc = accuracy(y_val, predict)
        scores.append(acc)
    return scores

In [14]:
scores = hypTune(X_train, y_train, X_val, y_val)
scores

[0.9261862917398945,
 0.8708260105448155,
 0.8005272407732865,
 0.7513181019332161]

## Forward selection

### Psuedocode

1. Start with empty SF set $A = \emptyset$
2. Initialise candidate set to be all original attributes $A_c = A$
3. Find attribute $a_i$ with highest filter score
4. Remove feature $a_i$ from candidate set: $A_c \gets A \cap a_i$
5. Add feature $a_i$ to SF set:$A_s \gets A_s \cup a_i$
6. Repeat steps 3-5 until convergence

Filter types:
- Correlation
- Mutual information
- Entropy
- Classification rate
- Regression score

In [43]:
def plot_feature_importance_logistic_regression(X, y, feature_names=None):
    """
    Compute and plot feature importance using Logistic Regression coefficients.
    
    Parameters:
    - X: Input feature matrix (numpy array or pandas DataFrame).
    - y: Target variable (numpy array or pandas Series).
    - feature_names: List of feature names (optional).
    
    Returns:
    - feature_importance: Array of feature importances (absolute coefficients).
    """
    model = LogisticRegression()
    model.fit(X, y)
    
    feature_importance = np.abs(model.coef_[0])  # Take the absolute values of coefficients
    
    if feature_names is None:
        feature_names = [f"Feature {i}" for i in range(X.shape[1])]
    
    # Sort feature importance in descending order
    sorted_idx = np.argsort(feature_importance)[::-1]
    feature_importance = feature_importance[sorted_idx]
    feature_names = [feature_names[i] for i in sorted_idx]
    
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(feature_importance)), feature_importance, align='center')
    plt.xticks(range(len(feature_importance)), feature_names, rotation=90)
    plt.xlabel('Feature')
    plt.ylabel('Feature Importance (Absolute Coefficient)')
    plt.title('Feature Importance (Logistic Regression)')
    plt.show()
    
    return feature_importance

# Example usage:
# Assuming you have X (feature matrix) and y (target variable)
# feature_importance = plot_feature_importance_logistic_regression(X, y, feature_names=X.columns)


In [44]:
col_names = X_train.columns
col_names

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [45]:
important_features = plot_feature_importance_logistic_regression(X_train, y_train,col_names )

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [41]:
def backward_selection(X, y, model, stopping_criterion=0.01):
    """
    Perform backward feature selection and return selected features.
    
    Parameters:
    - X: Input feature matrix (numpy array or pandas DataFrame).
    - y: Target variable (numpy array or pandas Series).
    - model: Machine learning model with a fit and predict method (e.g., sklearn classifier).
    - stopping_criterion: The tolerance for feature removal. The process stops when accuracy drops below this threshold.
    
    Returns:
    - selected_features: List of selected feature indices.
    """
    num_features = X.shape[1]
    selected_features = list(range(num_features))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    base_accuracy = accuracy_score(y_test, model.predict(X_test))
    
    while len(selected_features) > 1:
        max_accuracy = -1
        feature_to_remove = None

        for feature_index in selected_features:
            reduced_features = selected_features.copy()
            reduced_features.remove(feature_index)
            X_train_reduced = X_train.iloc[:, reduced_features]
            X_test_reduced = X_test.iloc[:, reduced_features]

            model.fit(X_train_reduced, y_train)
            y_pred = model.predict(X_test_reduced)
            accuracy = accuracy_score(y_test, y_pred)
            print(accuracy)

            if accuracy > max_accuracy:
                max_accuracy = accuracy
                feature_to_remove = feature_index

        if (base_accuracy - max_accuracy) < stopping_criterion:
            break

        selected_features.remove(feature_to_remove)
        base_accuracy = max_accuracy

    return selected_features

# Example usage:
# Assuming you have X (feature matrix) and y (target variable)
# selected_features = backward_selection(X, y, YourClassifier())


In [42]:
LG_model = LogisticRegression(0.1)
selected_fetaures = backward_selection(X_train,y_train, LG_model, 0.9)
selected_fetaures

0.9285714285714286
0.9285714285714286
0.9296703296703297
0.9263736263736264
0.9318681318681319
0.9296703296703297
0.921978021978022
0.8879120879120879
0.9296703296703297
0.9252747252747253
0.9230769230769231
0.9208791208791208
0.9296703296703297
0.9296703296703297
0.9296703296703297
0.9296703296703297
0.9241758241758242
0.9296703296703297
0.9285714285714286
0.9263736263736264
0.9351648351648352
0.9263736263736264


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [31]:
def forward_selection(X, y, model, test_size=0.2, min_features=2):
    num_features = X.shape[1]
    selected_features = []
    best_accuracy = 0.0
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    while len(selected_features) < num_features:
        best_feature = None
        best_feature_index = None
        for feature in range(num_features):
            if feature in selected_features:
                continue
                
            candidate_features = selected_features + [feature]
            X_train_subset = X_train.iloc[candidate_features]
            X_test_subset = X_test.iloc[candidate_features]
            
            model.fit(X_train_subset, y_train)
            y_pred = model.predict(X_test_subset)
            accuracy = accuracy_score(y_test, y_pred)
            
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature
                best_feature_index = candidate_features
        
        if best_feature is not None:
            selected_features.append(best_feature)
        else:
            break
    
    if len(selected_features) < min_features:
        print("Warning: Minimum number of features not met. Returning top features by importance.")
        selected_features = np.argsort(-model.coef_[0])[:min_features]
    
    selected_features = np.array(selected_features)  # Convert to NumPy array
    
    return selected_features

In [32]:
LG_model = LogisticRegression(0.1)
selected_fetaures = forward_selection(X_train,y_train, LG_model, 0.2)
selected_fetaures

ValueError: shapes (22,1) and (3638,) not aligned: 1 (dim 1) != 3638 (dim 0)

# Evaluation

In [None]:
accuracy(y_test, predicted)

In [None]:
# We use confusion matrix (TP, TN, FP, FN) to visualise the performance

from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, predicted)

In [None]:
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(3,3))
ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i,j], va='center', ha='center')
classes=["Edible", "poisoness"]
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
plt.xlabel('Predicted Values',)
plt.ylabel('actual Values',);
print(classification_report(y_test, predicted))