In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.feature_selection import VarianceThreshold, f_regression, SelectKBest, RFE
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Boston housing data into `boston` (arrays) and `df` (DataFrame
# with the target appended as a 'price' column).
#
# `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where accessing it raises ImportError. When the loader is missing,
# rebuild the same arrays from the original source, following the recipe
# given in scikit-learn's removal notice. The fallback needs network access.
BOSTON_FEATURE_NAMES = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
BOSTON_URL = 'http://lib.stat.cmu.edu/datasets/boston'

try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    # Each record spans two physical lines in the raw file: 11 values on
    # the first line, then 2 more features and the target on the second.
    boston_raw = pd.read_csv(BOSTON_URL, sep=r'\s+', skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]]),
        target=boston_raw.values[1::2, 2],
        feature_names=np.array(BOSTON_FEATURE_NAMES),
    )

df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['price'] = boston.target
df.head()

## Variance Threshold
$$Var(X) = E[(X - \mu)^2]$$

In [None]:
# Two zero-mean normal samples with different spreads, overlaid as histograms.
# np.random.normal takes the standard deviation (scale), not the variance, so
# pass sqrt(variance) to make the 'Var(x) = ...' legend labels true.
x1 = np.random.normal(0, np.sqrt(0.1), 1000)
x2 = np.random.normal(0, np.sqrt(0.5), 1000)
plt.hist(x1, alpha=0.5, label='Var(x) = 0.1')
_ = plt.hist(x2, alpha=0.5, label='Var(x) = 0.5')
plt.legend()

In [None]:
# Bar chart of every column's variance with a horizontal line at 2000,
# the cut-off value used for thresholding.
variances = df.var(axis=0)
ax = variances.plot.bar(label='Variance')
ax.hlines(y=2000, xmin=0, xmax=13, label='Threshold')
ax.legend()

In [None]:
# Keep only the columns of `df` whose variance exceeds 2000, then list
# the names of the surviving columns.
vt = VarianceThreshold(threshold=2000)
vt.fit(df)
x_reduced = vt.transform(df)
df.columns[vt.get_support()]

In [None]:
# Show the array produced by the variance threshold above
# (only the selected columns remain).
x_reduced

In [None]:
# Per-column variances of `df` (axis 0), for comparison with the threshold.
df.var(0)

In [None]:
# Sweep several variance thresholds: for each one, drop the low-variance
# columns, fit a linear regression on what is left, and record the R2 score.
# The model is scored on the same data it was fitted on.
X = boston.data
y = boston.target

thresholds = [0.01, 0.1, 10, 100, 1000]
r2 = []      # R2 obtained at each threshold
n_cols = []  # number of columns that survive each threshold

for t in thresholds:
    vt = VarianceThreshold(threshold=t)
    x_red = vt.fit_transform(X)
    n_cols.append(x_red.shape[1])
    lr = LinearRegression().fit(x_red, y)
    p = lr.predict(x_red)
    r2.append(r2_score(y, p))

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(thresholds, r2, 'ro--', label='R2')
# Label each point with the number of columns kept, so the plot shows the
# trade-off between model size and fit (n_cols was otherwise never shown).
for t, score, n in zip(thresholds, r2, n_cols):
    ax.annotate('{} cols'.format(n), (t, score),
                textcoords='offset points', xytext=(0, 8), ha='center')
ax.grid()
ax.set_xscale('log')
ax.set_xlabel('Threshold')
ax.set_ylabel('R2 (training data)')
_ = ax.legend()

### Exercise: Perform variance thresholding on the Diabetes dataset
### *Hint*: Look at the variances beforehand and decide the set of thresholds

In [None]:
# Load scikit-learn's bundled diabetes dataset for the exercise below.
diabetes = datasets.load_diabetes()

In [None]:
# enter code here

## Selecting the $k$ best features

In [None]:
# Keep the k columns of X that score highest under the univariate
# f_regression test against y, and check the shape of the result.
k = 3
selector = SelectKBest(score_func=f_regression, k=k)
x_red = selector.fit(X, y).transform(X)
x_red.shape

In [None]:
# Fit a fresh linear model on the selected columns and report its R2.
# The estimator is created here so the cell does not rely on an `lr`
# object left over from a loop in an earlier cell.
lr = LinearRegression().fit(x_red, y)
lr.score(x_red, y)

In [None]:
# Candidate values of k: 1 up to the number of columns in X, inclusive.
K = [n + 1 for n in range(X.shape[1])]
K

In [None]:
# For every candidate k, keep the k best columns (f_regression), fit a
# linear regression on them and collect its R2 on the same data.
r2 = []

for k in K:
    selector = SelectKBest(score_func=f_regression, k=k)
    x_red = selector.fit(X, y).transform(X)
    lr = LinearRegression()
    lr.fit(x_red, y)
    r2.append(lr.score(x_red, y))

# R2 as a function of how many features were kept.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(K, r2, 'ro--', label='R2')
ax.set_xlabel('Number of features')
ax.grid()
_ = ax.legend()

### Exercise: Find how many "best" features are optimal for the Diabetes dataset

In [None]:
# enter code here

## Recursive Feature Elimination

In [None]:
# Feature names only: every column of df except the 'price' target.
columns = df.columns.drop('price')

# Recursive feature elimination around a linear model, removing one feature
# per step; n_features_to_select is left at its default.
lr = LinearRegression()
rfe = RFE(estimator=lr, step=1, verbose=1)
rfe.fit(X, y)
x_red = rfe.transform(X)

# Refit the linear model on the reduced matrix and print its R2.
lr.fit(x_red, y)
print(lr.score(x_red, y))

In [None]:
# Which columns are kept?
# rfe.get_support() is used as a mask over the feature names in `columns`.
columns[rfe.get_support()]

In [None]:
# Same elimination procedure, but this time stop at exactly three features.
rfe = RFE(estimator=lr, n_features_to_select=3, step=1, verbose=1)
x_red = rfe.fit(X, y).transform(X)

# Refit on the three remaining columns and print the resulting R2.
lr.fit(x_red, y)
print(lr.score(x_red, y))

### Exercise: Find the R2 score with the 3 best columns in the diabetes dataset.
### Which are these three columns?

In [None]:
# enter code here