# Lab 5
Prepared by:
 - **Szymon Budziak**
 - **Krzysztof Gwiazda**

In [1]:
import numpy as np
import scipy
import scipy.io

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

## 1. Load dataset

In [2]:
# Read the MATLAB file; loadmat returns a dict keyed by variable name.
data = scipy.io.loadmat("leukemia.mat")

# 'X' is used as the feature matrix, 'Y' as the labels (flattened to 1-D).
X, y = data['X'], data['Y'].ravel()

## 2. Remove features characterized by low variance

In [3]:
# Variance cutoff handed to VarianceThreshold.
threshold = 0.1

# Learn the per-feature variances on X, then keep only the columns that pass.
variance_filter = VarianceThreshold(threshold=threshold)
variance_filter.fit(X)
X_filtered = variance_filter.transform(X)

In [4]:
# Mean 5-fold cross-validated accuracy of a default logistic regression,
# once on all features and once on the variance-filtered features.
original_accuracy = cross_val_score(LogisticRegression(), X, y, cv=5).mean()
filtered_accuracy = cross_val_score(LogisticRegression(), X_filtered, y, cv=5).mean()

print("Accuracy - Original:", original_accuracy)
print("Accuracy - Filtered:", filtered_accuracy)

Accuracy - Original: 0.9285714285714286
Accuracy - Filtered: 0.9285714285714286


## 3. Select the top 'm' features

In [5]:
# Number of features to keep: the row count of X divided by four (floored).
m = X.shape[0] // 4

# Recursive feature elimination around a default logistic regression.
estimator = LogisticRegression()
rfe_selector = RFE(estimator=estimator, n_features_to_select=m)
rfe_selector.fit(X, y)
X_rfe = rfe_selector.transform(X)

## 4. Examine how the choice of scoring metric (accuracy vs. ROC AUC) affects the measured classification performance

In [6]:
# Compare two classifiers on the RFE-selected features under two scoring
# metrics, reporting the mean cross-validated score for each combination.
scoring_metrics = ['accuracy', 'roc_auc']
cv = 6
# Fixed seed: without it the random forest is re-randomised on every run and
# the reported scores change between executions of the notebook.
cv_random_state = 42
scores = {}

# NOTE(review): X_rfe was obtained by fitting RFE on *all* samples and labels
# (rfe_selector.fit_transform(X, y) in the previous section), so the held-out
# folds below have already influenced which features were kept. The scores are
# therefore optimistically biased; for an unbiased estimate the selector would
# have to be refit inside each fold (e.g. RFE + classifier in one Pipeline).

# cross_val_score clones the estimator for every fold, so one instance per
# model can be shared across metrics.
logistic_reg = LogisticRegression()
random_forest = RandomForestClassifier(random_state=cv_random_state)

for metric in scoring_metrics:
    logistic_scores = cross_val_score(logistic_reg, X_rfe, y, cv=cv, scoring=metric)
    random_forest_scores = cross_val_score(random_forest, X_rfe, y, cv=cv, scoring=metric)

    # Keep only the mean over the folds for each model.
    scores[metric] = {'Logistic Regression': np.mean(logistic_scores),
                      'Random Forest': np.mean(random_forest_scores)}

# Report: one block per metric, one line per model.
for metric, values in scores.items():
    print(f"{metric.upper()}:")
    for model, score in values.items():
        print(f"{model}: {score:.4f}")
    print()

ACCURACY:
Logistic Regression: 1.0000
Random Forest: 1.0000

ROC_AUC:
Logistic Regression: 1.0000
Random Forest: 1.0000



## 5. Compare the effectiveness of the applied approach with embedded (model-based) feature selection methods

In [7]:
# Model-based feature selection with SelectFromModel, using two different
# underlying estimators, and a report of how many features each one keeps.

# Fixed seed: both the liblinear solver and the random forest are randomised,
# so without it the selected feature counts can differ from run to run.
selector_random_state = 42

# Selector 1: L1-penalised logistic regression (liblinear supports the L1 penalty).
l1_logistic = SelectFromModel(
    LogisticRegression(penalty='l1', solver='liblinear',
                       random_state=selector_random_state)
)
l1_logistic.fit(X, y)
X_l1_logistic = l1_logistic.transform(X)

# Selector 2: random forest, selecting on its feature importances.
feature_importances_rf = SelectFromModel(
    RandomForestClassifier(random_state=selector_random_state)
)
feature_importances_rf.fit(X, y)
X_rf = feature_importances_rf.transform(X)

print("Number of features selected using L1 regularization:", X_l1_logistic.shape[1])
print("Number of features selected using feature importances:", X_rf.shape[1])

Number of features selected using L1 regularization: 32
Number of features selected using feature importances: 476
