In [29]:
# feature selection examples 
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import (ensemble, model_selection, preprocessing, tree)
from sklearn.metrics import (confusion_matrix, accuracy_score, classification_report)
from sklearn.model_selection import (train_test_split)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline

In [None]:
## Load the breast cancer dataset and wrap its feature matrix in a DataFrame
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
df = pd.DataFrame(data=data["data"], columns=data["feature_names"])

In [4]:
# Class labels from the loaded dataset bundle; used as the prediction target.
y = data.target

In [7]:
# Every column of df is used as a feature. The target is kept separately:
# df is built from data.data only, so it has no 'target' column.
# Note: this is an alias, not a copy — X and df refer to the same DataFrame.
X = df

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Exploratory PCA to decide how many components to feed the classifier.
# The scaler and the PCA are fitted on the training split only, so the
# held-out test rows cannot influence the choice of component count
# (fitting them on all of X would leak test information into model selection).
pca_scaler = StandardScaler().fit(X_train)
pca = PCA(random_state=42).fit(pca_scaler.transform(X_train))

# Project every row with the train-fitted transforms: X_pca still has one row
# per sample in X, in the same order.
X_pca = pca.transform(pca_scaler.transform(X))

In [None]:
# Share of the total variance attributed to each principal component
# (shown as the cell output).
pca.explained_variance_ratio_

In [None]:
# Scree plot: variance explained by each principal component.
# The x-axis is 1-based, so x = 6 is the sixth component (plotting the bare
# array would label it 5). The elbow indicates 6 components may explain the
# majority of the variance.
explained = pca.explained_variance_ratio_
component_numbers = np.arange(1, len(explained) + 1)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_numbers, explained)
ax.set(
    xlabel="Component",
    # The plotted values are ratios in [0, 1], not percentages.
    ylabel="Fraction of explained variance",
    title="Scree plot",
    ylim=(0, 0.5),
);  # trailing ';' hides the list-of-artists repr in the cell output

In [None]:
# Cumulative explained variance.
# The x-axis is 1-based, so the value at x = k is the variance captured by the
# first k components (plotting the bare array would shift this by one).
# 6 or 7 components may explain 90% of the variance.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_counts, cumulative_variance)
ax.set(
    xlabel="Component",
    # The plotted values are cumulative ratios in [0, 1], not percentages.
    ylabel="Fraction of explained variance",
    title="Cumulative Variance",
    ylim=(0, 1),
);  # trailing ';' hides the list-of-artists repr in the cell output

In [None]:
# Heatmap of the PCA loadings: one row per feature, one column per principal
# component, colour = loading on a fixed [-1, 1] colour scale.
# Note: with many feature rows and only a few component columns the
# equal-aspect image is a narrow strip, which can make it look a little odd.
n_components_shown = 8

fig, ax = plt.subplots(figsize=(6, 4))
image = ax.imshow(
    pca.components_[:n_components_shown].T,
    cmap="Spectral",
    vmin=-1,
    vmax=1,
)

# Rows are labelled with the feature names; columns with 1-based component numbers.
ax.set_yticks(range(len(X.columns)))
ax.set_yticklabels(X.columns)
ax.set_xticks(range(n_components_shown))
ax.set_xticklabels(range(1, n_components_shown + 1))

ax.set(
    xlabel="Principal Component",
    # The y-axis lists features; the contribution itself is encoded by colour.
    ylabel="Feature",
    title="Contribution of Features to Components",
)
fig.colorbar(image, ax=ax, label="Contribution");

In [None]:
# Grouped bar chart of the first two principal components: one group per
# component, one bar per feature, with the legend anchored at the axes'
# upper-right corner.
fig, ax = plt.subplots(figsize=(8, 4))
first_two_loadings = pd.DataFrame(pca.components_[:2], columns=X.columns)
first_two_loadings.plot.bar(ax=ax)
ax.legend(bbox_to_anchor=(1, 1))

In [None]:
# Pipe the scaler, PCA, and logistic regression together.
# From the PCA above, 6 or 7 components are probably enough to prepare the input for classification.

In [32]:
# Classification pipeline: standardise the features, keep 6 principal
# components, then fit a logistic regression on them.
clf6 = Pipeline(
    steps=[
        ("Scale", StandardScaler()),
        ("pca", PCA(n_components=6)),
        ("classifier", LogisticRegression()),
    ]
)

In [None]:
# Fit every pipeline step (scaling, PCA, classifier) on the training split only.
clf6.fit(X_train, y_train)

In [None]:
# Predict on the held-out test set and print the per-class metrics.
# Original note: the f1-score is slightly better (the run it is compared
# against is not shown here — TODO confirm which baseline).
y_pred = clf6.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)