In [1]:
# Data manipulation 
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Data Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Feature Selector
from sklearn.feature_selection import RFE

# ML models for voting
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# Final model for regression
from sklearn.linear_model import LinearRegression

In [2]:
# Load the diabetes dataset from the local data-snippets checkout.
# The path is home-relative ('~'); pandas expands it when opening the file.
DIABETES_CSV = '~/DevSpace/Data-Snippets/MachineLearning/datasets/diabetes.csv'
diabetes = pd.read_csv(DIABETES_CSV)

# Preview the first five rows as the cell's rich output
diabetes.head(n=5)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,family,age,diagnostic
0,1,89,66,23,94,28.1,0.167,21,negative
1,0,137,40,35,168,43.1,2.288,33,positive
2,3,78,50,32,88,31.0,0.248,26,positive
3,2,197,70,45,543,30.5,0.158,53,positive
4,1,189,60,23,846,30.1,0.398,59,positive


In [3]:
# Target: the 'bmi' column. Features: every column except the target and
# the 'diagnostic' label column.
X = diabetes.drop(columns=['diagnostic', 'bmi'])
y = diabetes['bmi']

# Hold out 30% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=12
)

# Report how many feature columns go into the selectors below
print(f'Number of dimensions (cols): {X.shape[1]}')

Number of dimensions (cols): 7


In [4]:
# Voter 1: cross-validated Lasso fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lcv = LassoCV().fit(X_train, y_train)
print('Optimal alpha = {0:.3f}'.format(lcv.alpha_))

# R squared of the fitted Lasso on the held-out test split
r_squared = lcv.score(X_test, y_test)
print(f'The model explains {r_squared:.1%} of the test set variance')

# A feature gets this model's vote when its coefficient was not shrunk to zero
lcv_mask = lcv.coef_ != 0
print(f'{lcv_mask.sum()} features out of {lcv_mask.size} selected')

Optimal alpha = 1.517
The model explains 39.9% of the test set variance
5 features out of 7 selected


In [5]:
# Voter 2: select 5 features with RFE on a GradientBoostingRegressor.
# random_state pins the boosting estimator so the selected mask and the score
# are reproducible on Restart & Run All (the original estimator was unseeded).
# step=3 lets RFE drop up to three features per elimination round.
rfe_gb = RFE(estimator=GradientBoostingRegressor(random_state=12),
             n_features_to_select=5, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

# Assign the support array (True = feature kept by RFE) to gb_mask
gb_mask = rfe_gb.support_
print(f'{sum(gb_mask)} features out of {len(gb_mask)} selected')

Fitting estimator with 7 features.
The model can explain 33.2% of the variance in the test set
5 features out of 7 selected


In [6]:
# Voter 3: select 5 features with RFE on a RandomForestRegressor.
# random_state pins the forest so the selected mask and the score are
# reproducible on Restart & Run All (the original estimator was unseeded).
rfe_rf = RFE(estimator=RandomForestRegressor(random_state=12),
             n_features_to_select=5)
rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
# f-string prefix is required here; without it the placeholder is printed literally
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

# Assign the support array (True = feature kept by RFE) to rf_mask
rf_mask = rfe_rf.support_
print(f'{sum(rf_mask)} features out of {len(rf_mask)} selected')

The model can explain {r_squared:.1%} of the variance in the test set
5 features out of 7 selected


In [7]:
# Stack the three boolean selection masks row-wise and count, per feature,
# how many of the models kept it
votes = np.stack([lcv_mask, rf_mask, gb_mask]).sum(axis=0)

# Majority vote: keep a feature when at least 2 of the 3 models selected it
meta_mask = votes >= 2

# Show which feature columns survived the vote
print(dict(zip(X.columns, meta_mask)))

# Keep only the winning columns of the full feature frame
X_reduced = X.loc[:, meta_mask]


{'pregnant': False, 'glucose': True, 'diastolic': True, 'triceps': True, 'insulin': True, 'family': False, 'age': False}


In [8]:
# Re-split using only the voted-in features.
# NOTE(review): this split uses random_state=0 while the feature selectors were
# fitted on the random_state=12 split, so some rows in this test set may have
# been seen during feature selection — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.3, random_state=0
)

# Learn the scaling parameters from the training rows only
scaler = StandardScaler().fit(X_train)

# Fit the final linear regression on the standardized training features
lm = LinearRegression().fit(scaler.transform(X_train), y_train)

# Measure accuracy (R^2) on the test rows, scaled with the training statistics
r_squared = lm.score(scaler.transform(X_test), y_test)

# Result
print(f'The model can explain {r_squared:.1%} of the variance in the test set using {lm.coef_.size} features.')

The model can explain 47.0% of the variance in the test set using 4 features.
