In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination with cross-validation (RFECV) on the
# scikit-learn breast cancer dataset, using a random forest as the estimator
# whose feature importances drive the elimination.

# Fixed seed: a random forest is stochastic, so without a seed its feature
# importances -- and therefore the features RFECV keeps -- change between
# runs. Outputs recorded from an unseeded run may differ from a seeded one.
SEED = 42

# Load the breast cancer dataset
cancer = load_breast_cancer()

# Build the feature DataFrame and attach the label column 'target' in a
# single expression, so re-running the cell never mutates an existing frame.
df = pd.DataFrame(
    data=cancer['data'], columns=cancer['feature_names']
).assign(target=cancer['target'])

# Separate the features and the target variable
X = df.drop('target', axis=1)
y = df['target']

# Random forest estimator, seeded so the selection is reproducible
rf_classifier = RandomForestClassifier(random_state=SEED)

# 5-fold stratified cross-validation. No shuffling is requested, so the
# splitter itself needs no seed.
cv = StratifiedKFold(n_splits=5)

# RFECV scores each candidate feature count by cross-validated accuracy
rfecv = RFECV(estimator=rf_classifier, cv=cv, scoring='accuracy')

# Fit RFECV on the data
rfecv.fit(X, y)

# Print the optimal number of features
print("Optimal number of features: ", rfecv.n_features_)

# Print the selected features (support_ is a boolean mask over X's columns)
selected_features = X.columns[rfecv.support_]
print("Selected features: ", selected_features)


Optimal number of features:  14
Selected features:  Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'area error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst compactness',
       'worst concavity', 'worst concave points'],
      dtype='object')
