# Feature Selection

In [2]:
from sklearn.model_selection import train_test_split

In [1]:
def data_split(df, target_var, size=0.3, state=0):
    """
    Split a dataset into train and test sets.
        Parameters:
            df (DataFrame): Any DataFrame that contains the target column.
            target_var (str): Name of the labeled (target) column used for predicting/classifying.
            size (float): Test size. Default value is 0.3.
            state (int): Random state passed to train_test_split for reproducibility. Default value is 0.
        Returns:
            X_train, X_test, y_train, y_test: Feature frames and target series for the train/test sets.
    """
    # Separate the predictors from the target before splitting
    features = df.drop(labels=[target_var], axis=1)
    target = df[target_var]

    # `state` is forwarded so callers can actually control the shuffle
    # (it was previously accepted but ignored in favour of a hard-coded 0).
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=size, random_state=state)

    # Data Shape
    print(f'X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test, y_train, y_test

In [None]:
# Split data into train and test set (30% held out for testing, random state 0).
# NOTE(review): `df` and `target_var` are not defined in the cells shown here —
# presumably they are set earlier in the notebook; confirm before running.
X_train, X_test, y_train, y_test = data_split(df, target_var, size=0.3, state=0)

## 1. Filter Methods

### 1.a. Basic Filtering

#### 1.a.1 Remove Constant Features

 Constant features are those that show the same value, just one value, for all the observations of the dataset. This is the same value for all the rows of the dataset. These features provide no information that allows a machine learning model to discriminate or predict a target.
 
To identify constant features, we can use the VarianceThreshold class from sklearn or write a short code snippet to find them. VarianceThreshold is a simple baseline approach to feature selection: it removes all features whose variance doesn't meet some threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold

In [5]:
def remove_constants(X_train, X_test):
    """
    Removes constant (zero-variance) features using sklearn's VarianceThreshold.
    The selector is fitted on the train set only and the same columns are then
    removed from both sets.
        Parameters:
            X_train (DataFrame): Original Train Set (numerical features)
            X_test (DataFrame) : Original Test Set with the same columns as X_train
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing constants.
                                         Column names and row indexes are preserved.
    """
    print(f'Before removing constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # fit finds the features with zero variance
    sel = VarianceThreshold(threshold=0)
    sel.fit(X_train)

    # get_support is a boolean vector that indicates which features are retained,
    # so its negation marks the constant features.
    keep_mask = sel.get_support()
    kept_features = X_train.columns[keep_mask]
    constant_features = list(X_train.columns[~keep_mask])
    constant_sum = len(constant_features)

    print(f'Number of constant features in the data is {constant_sum}')
    print(f'Constant Features : {constant_features}')

    # transform returns plain arrays; wrap them back into DataFrames so that the
    # column names and the row index (needed to stay aligned with y) are not lost.
    X_train = pd.DataFrame(sel.transform(X_train), columns=kept_features, index=X_train.index)
    X_test = pd.DataFrame(sel.transform(X_test), columns=kept_features, index=X_test.index)

    print(f'After removing constants: X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test

In [6]:
# Alternative way for removing constant features
def remove_constants(X_train, X_test):
    """
    Removes constant numerical features (standard deviation equal to zero on the train set).
    Non-numeric columns are left untouched. The input frames are not modified;
    new frames are returned.
        Parameters:
            X_train (DataFrame): Original Train Set
            X_test (DataFrame) : Original Test Set
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing constants.
    """
    print(f'Before removing constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # numeric_only=True skips text/categorical columns, for which a standard
    # deviation is undefined (and raises on recent pandas versions).
    stds = X_train.std(numeric_only=True)
    constant_features = list(stds.index[stds == 0])
    constant_sum = len(constant_features)

    print(f'Number of constant features in the data is {constant_sum}')
    print(f'Constant Features : {constant_features}')

    # Drop these columns from the train and test sets without mutating the caller's frames
    X_train = X_train.drop(columns=constant_features)
    X_test = X_test.drop(columns=constant_features)

    print(f'After removing constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test

Both VarianceThreshold and the code snippet provided above work only for numerical variables. What can we do to find constant categorical variables?

One alternative is to encode the categories as numbers and then use the code above. But then you will put effort into pre-processing variables that are not informative.

Alternatively, you can use the code below.

In [None]:
# Removing constant features for categorical variables
def remove_constants(X_train, X_test):
    """
    Removes constant features (a single distinct label on the train set). Works for
    categorical as well as numerical columns. Missing values count as a label of
    their own. The input frames are not modified and column dtypes are preserved.
        Parameters:
            X_train (DataFrame): Original Train Set
            X_test (DataFrame) : Original Test Set
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing constants.
    """
    print(f'Before removing constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Count distinct labels per column. dropna=False keeps NaN as a label, which
    # matches Series.unique(); no dtype cast is needed, so the returned frame keeps
    # its original dtypes instead of being converted to object.
    label_counts = X_train.nunique(dropna=False)

    # Find those columns that contain only 1 label
    constant_features = list(label_counts.index[label_counts == 1])
    constant_sum = len(constant_features)

    print(f'Number of constant features in the data is {constant_sum}')
    print(f'Constant Features : {constant_features}')

    # Drop these columns from the train and test sets without mutating the caller's frames
    X_train = X_train.drop(columns=constant_features)
    X_test = X_test.drop(columns=constant_features)

    print(f'After removing constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test

In [None]:
# Remove constant features.
# The returned frames are rebound to X_train / X_test so that the following
# steps operate on the reduced feature set.
X_train, X_test = remove_constants(X_train, X_test)

#### 1.a.2 Remove quasi-constant features

Quasi-constant features are those that show the same value for the great majority of the observations of the dataset. In general, these features provide little if any information that allows a machine learning model to discriminate or predict a target. But there can be exceptions. So you should be careful when removing these types of features. (where a single value occupies more than 99.98% population)

First, remove constant features. This will allow a better visualization of the quasi-constant ones. To identify quasi-constant features, we can use the VarianceThreshold class from sklearn, or we can code it ourselves.

In [9]:
def remove_quasi_constants(X_train, X_test, level=0.01):
    """
    Removes quasi-constant features (those that show the same value
    for the great majority of the observations of the dataset) using
    sklearn's VarianceThreshold. The selector is fitted on the train set only.
        Parameters:
            X_train (DataFrame): Original/Constants Removed Train Set (numerical features)
            X_test (DataFrame) : Original/Constants Removed Test Set with the same columns
            level (float): variance threshold. Default value is 0.01, which indicates 99% of observations approximately
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing quasi-constant features.
                                         Column names and row indexes are preserved.
    """
    print(f'Before removing quasi-constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Fit finds the features with low variance
    sel = VarianceThreshold(threshold=level)
    sel.fit(X_train)

    # get_support flags the retained columns; its negation flags the quasi-constants
    keep_mask = sel.get_support()
    kept_features = X_train.columns[keep_mask]
    quasi_constant_features = list(X_train.columns[~keep_mask])
    quasi_constant_sum = len(quasi_constant_features)

    print(f'Number of quasi-constant features in the data is {quasi_constant_sum}')
    print(f'Quasi-Constant Features : {quasi_constant_features}')

    # Remove quasi-constant features. transform returns plain arrays, so wrap them
    # back into DataFrames to keep the column names and the row index.
    X_train = pd.DataFrame(sel.transform(X_train), columns=kept_features, index=X_train.index)
    X_test = pd.DataFrame(sel.transform(X_test), columns=kept_features, index=X_test.index)

    print(f'After removing quasi-constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test

In [10]:
# Alternative way
def remove_quasi_constants(X_train, X_test, threshold=0.998):
    """
    Removes quasi-constant features (those that show the same value
    for the great majority of the observations of the dataset) in the dataframe.
    The input frames are not modified; new frames are returned.
        Parameters:
            X_train (DataFrame): Original/Constants Removed Train Set
            X_test (DataFrame) : Original/Constants Removed Test Set
            threshold (float): Share of train rows taken by the most frequent value above which
                               a feature is considered quasi-constant. Default value is 0.998.
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing quasi-constant features
    """
    print(f'Before removing quasi-constant features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    n_rows = len(X_train)

    # Collect the quasi-constant features
    quasi_constant_feat = []
    for feature in X_train.columns:
        counts = X_train[feature].value_counts()

        # value_counts ignores missing values, so an all-missing column (or an
        # empty frame) has no predominant value to evaluate.
        if counts.empty:
            continue

        # Share of all train rows (missing ones included) taken by the most frequent value
        predominant = counts.max() / n_rows

        # evaluate predominant feature
        if predominant > threshold:
            quasi_constant_feat.append(feature)
    quasiconstant_sum = len(quasi_constant_feat)

    print(f'Number of quasi-constant features in the data is {quasiconstant_sum}')
    print(f'Quasi-Constant Features : {quasi_constant_feat}')

    # Drop these columns from the train and test sets without mutating the caller's frames
    X_train = X_train.drop(columns=quasi_constant_feat)
    X_test = X_test.drop(columns=quasi_constant_feat)

    print(f'After removing quasi-constants: X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test

In [None]:
# Remove Quasi-constants features
# The cut-off is left at its default on purpose: the two remove_quasi_constants
# variants defined above name that parameter differently (`level` vs `threshold`),
# so passing `level=0.01` raises a TypeError with the variant that was defined last.
# For the VarianceThreshold variant the default is the same 0.01.
X_train, X_test = remove_quasi_constants(X_train, X_test)

#### 1.a.3 Remove duplicated features

Often datasets contain one or more features that show the same values across all the observations. This means that both features are in essence identical. In addition, it is not unusual to introduce duplicated features after performing one hot encoding of categorical variables, particularly when using several highly cardinal variables.

 *** Duplicated features may arise after one hot encoding of categorical variables.
 
Note: Finding duplicated features is a computationally costly operation in Python, therefore depending on the size of your dataset, you might not always be able to perform it.

In [11]:
# Use this function only for small datasets since transposing data is computationally expensive.

def remove_duplicated(X_train, X_test):
    """
    Removes duplicated features (columns that show exactly the same values
    as an earlier column for all the observations of the train set).
    The first occurrence of each set of identical columns is kept.
        Parameters:
            X_train (DataFrame): Original/Quasi-Constants Removed Train Set
            X_test (DataFrame) : Original/Quasi-Constants Removed Test Set
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing duplicate features
    """
    print(f'Before removing duplicated features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Transpose the dataframe, so that the columns are the rows of the new dataframe,
    # and flag the rows that repeat an earlier one.
    # This is a computationally expensive operation, so it might take a while.
    dup_mask = X_train.T.duplicated()
    duplicated_sum = dup_mask.sum()

    print(f'Number of duplicated features in the data is {duplicated_sum}')

    duplicated_features = list(dup_mask.index[dup_mask])

    print(f'Duplicated Features are {duplicated_features}')

    # Select the surviving columns on the original frame instead of transposing
    # back: a double transpose of mixed-dtype data silently changes column dtypes.
    X_train = X_train.loc[:, ~dup_mask.to_numpy()]
    new_columns = list(X_train.columns.values)
    X_test = X_test[new_columns]

    print(f'After removing duplicated features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test

In [12]:
# Use this function only for BIG datasets.

def remove_duplicated(X_train, X_test):
    """
    Removes duplicated features (columns that show exactly the same values
    as an earlier column for all the observations of the train set) by
    comparing columns pairwise. The first occurrence of each set of identical
    columns is kept. The input frames are not modified; new frames are returned.
        Parameters:
            X_train (DataFrame): Original/Quasi-Constants Removed Train Set
            X_test (DataFrame) : Original/Quasi-Constants Removed Test Set
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing duplicate features
    """
    print(f'Before removing duplicated features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Check for duplicated features in the training set
    columns = list(X_train.columns)
    duplicated_feat = []
    flagged = set()
    for i, col_1 in enumerate(columns):
        if i % 10 == 0:  # progress indicator for long runs
            print(i)

        # A column already flagged equals an earlier column, so everything equal
        # to it has been flagged by that earlier column as well.
        if col_1 in flagged:
            continue

        for col_2 in columns[i + 1:]:
            if col_2 not in flagged and X_train[col_1].equals(X_train[col_2]):
                flagged.add(col_2)
                duplicated_feat.append(col_2)

    # Each duplicated feature is listed exactly once
    duplicated_sum = len(duplicated_feat)

    print(f'Number of duplicated features in the data is {duplicated_sum}')

    print(f'Duplicated Features are {duplicated_feat}')

    # Drop duplicates without mutating the caller's frames
    X_train = X_train.drop(columns=duplicated_feat)
    X_test = X_test.drop(columns=duplicated_feat)

    print(f'After removing duplicated features: X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test

In [14]:
# This function should be run before removing duplicates.

def identify_duplicatedpairs(X_train):
    """
    Report which features of the dataset are identical to an earlier feature.
    Every matching pair is printed (first column, then its duplicate, then a blank line).
        Parameters:
            X_train (DataFrame): X Training Dataset
        Returns:
            duplicated_feat (list): Name of the later column of every identical pair found.
                                    A name appears once per pair it takes part in.
    """
    all_columns = X_train.columns
    matches = []

    for position, left in enumerate(all_columns):
        # Only compare against the columns to the right, so each pair is visited once
        for right in all_columns[position + 1:]:
            if not X_train[left].equals(X_train[right]):
                continue

            # Show the pair
            print(left)
            print(right)
            print()

            # Remember the later column of the pair
            matches.append(right)

    return matches

**Putting All Basic Filtering Together**

In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [18]:
def basic_filter(df, target_var, size=0.3, level=0.01, state=0):
    """
    Splits the data and removes constant, quasi-constant and duplicated features.
    All decisions are taken on the train set and then applied to the test set.
        Parameters:
            df (DataFrame): Any DataFrame (numerical features plus the target column)
            target_var (str): Target variable feature name
            size (float): test size in train-test split. Default value is 0.3
            level (float): variance threshold. Default value is 0.01, which indicates 99% of observations approximately
            state (int): Random state used for the train-test split. Default value is 0
        Returns:
            X_train, X_test, y_train, y_test (DataFrame): Splitted X and y DataFrames
                                                          after applying basic filtering.
                                                          X frames keep the row index of the split,
                                                          so they stay aligned with y_train / y_test.
    """
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(labels=[target_var], axis=1),
        df[target_var], test_size=size, random_state=state)

    print(f'Before applying basic filtering: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Remove constant features (zero standard deviation on the train set)
    stds = X_train.std(numeric_only=True)
    constant_features = list(stds.index[stds == 0])
    X_train = X_train.drop(columns=constant_features)
    X_test = X_test.drop(columns=constant_features)

    print(f'After removing constant features : X_train : {X_train.shape}, X_test: {X_test.shape}')

    # remove quasi-constant features
    sel = VarianceThreshold(threshold=level)
    sel.fit(X_train)

    features_to_keep = X_train.columns[sel.get_support()]

    # transform returns plain arrays; rebuild the DataFrames with the original row
    # index, otherwise X would get a fresh 0..n-1 index and no longer line up with y.
    X_train = pd.DataFrame(sel.transform(X_train), columns=features_to_keep, index=X_train.index)
    X_test = pd.DataFrame(sel.transform(X_test), columns=features_to_keep, index=X_test.index)

    print(f'After removing quasi-constant features : X_train : {X_train.shape}, X_test: {X_test.shape}')

    # check for duplicated features in the training set
    columns = list(X_train.columns)
    duplicated_feat = []
    flagged = set()
    for i, col_1 in enumerate(columns):
        if i % 10 == 0:  # progress indicator for long runs
            print(i)

        # A flagged column equals an earlier one, whose pass already flagged its duplicates
        if col_1 in flagged:
            continue

        for col_2 in columns[i + 1:]:
            if col_2 not in flagged and X_train[col_1].equals(X_train[col_2]):
                flagged.add(col_2)
                duplicated_feat.append(col_2)

    X_train = X_train.drop(columns=duplicated_feat)
    X_test = X_test.drop(columns=duplicated_feat)

    print(f'After removing duplicated features : X_train : {X_train.shape}, X_test: {X_test.shape}')

    return X_train, X_test, y_train, y_test

### 1.b. Correlation

• Correlation is a measure of the linear relationship of 2 or more variables.

• Through correlation, we can predict one variable from the other.

• Good variables are highly correlated with the target.

• Correlated predictor variables provide redundant information.

• Variables should be correlated with the target but uncorrelated among themselves.

• The central hypothesis is that good feature sets contain features that are highly correlated with the class, yet uncorrelated with each other.

• Correlated features do not necessarily affect model accuracy per se. High dimensionality does.

• If 2 features are highly correlated, the second one will add little information over the previous one: removing it helps reduce dimension. 

• Correlation affects model interpretability: linear models.

• Different classifiers show different sensitivity to correlation.

• Pearson’s coefficient values vary between -1 and 1:

       1 is highly correlated: the more of variable x1, the more of x2
   
      -1 is highly anti-correlated: the more of variable x1, the less of x2

In [20]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

In [21]:
def visualize_correlation(X_train):
    """
    Draw the correlation matrix of the train set as a seaborn heatmap.
    The matrix holds the correlation of every feature with every other feature.
    Categorical variables are expected to be already encoded as numbers.
        Parameters:
            X_train (DataFrame): X train set
        Returns:
            plot (plot): heatmap of the correlation matrix, as returned by seaborn.
            The diagonal represents the correlation of a feature with itself,
            therefore the value is 1.
    """
    correlations = X_train.corr()

    # An 11x11 inch canvas keeps the cells readable for wide feature sets
    figure, axes = plt.subplots()
    figure.set_size_inches(11, 11)

    # Draw on the axes that were just created
    return sns.heatmap(correlations, ax=axes)

Correlation Feature Selection evaluates subsets of features on the basis of the following hypothesis: "Good feature subsets contain features highly correlated with the target, yet uncorrelated to each other".

You may demonstrate how to select features based on correlation using 2 procedures. 

The first one is a brute force function that finds correlated features without any further insight. The second procedure finds groups of correlated features. Often, more than 2 features are correlated with each other. We can find groups of 3, 4 or more features that are correlated. By identifying these groups, we can then select from each group, which feature we want to keep, and which ones we want to remove.

The second approach looks to identify groups of highly correlated features. And then, we can make further investigation within these groups to decide which feature we keep and which one we remove.  When investigating groups, we can check the missingness in the correlated group and select the correlated feature which has less missing value to keep in the data, and drop the rest of the correlated features. 

The other approach is to build a machine learning algorithm using all the features from the correlated list, and select the most predictive one.

#### 1.b.1 Brute Force Approach

In [23]:
def correlated_cols(X_train, threshold=0.8):
    """
    Select highly correlated features without any other insight: a feature is
    flagged when its absolute correlation with any feature that comes before it
    in the frame exceeds the threshold (so the earlier feature of a pair is kept).
        Parameters:
            X_train (DataFrame): X_train of Any DataFrame
            threshold (float): Absolute correlation coefficient cut-off. The default value is 0.8.
        Returns:
            col_corr (list): The names of correlated columns, in the column order of X_train.
    """
    # Only the magnitude of the coefficient matters, not its sign
    corr_matrix = X_train.corr().abs()

    col_corr = []
    for i, colname in enumerate(corr_matrix.columns):
        # Row i restricted to the columns before the diagonal: every pair is checked once
        if (corr_matrix.iloc[i, :i] > threshold).any():
            col_corr.append(colname)

    corr_colnum = len(col_corr)

    print(f'There are {corr_colnum} features which are highly correlated with others in the dataset.')

    return col_corr


def remove_correlatedcols(X_train, X_test, col_corr):
    """
    Remove correlated features from the train and test sets.
    The input frames are not modified; new frames are returned.
        Parameters:
            X_train (DataFrame): X_train of Any DataFrame
            X_test (DataFrame): X_test of Any DataFrame
            col_corr (list): The list of correlated columns, e.g. the output of correlated_cols.
        Returns:
            X_train, X_test (DataFrame): Train/Test Sets after removing correlated features.
    """
    # Before dropping correlated features
    print(f'Before dropping correlated features: X_train : {X_train.shape} and X_test : {X_test.shape}.')

    # Drop correlated features without mutating the caller's frames
    X_train = X_train.drop(labels=col_corr, axis=1)
    X_test = X_test.drop(labels=col_corr, axis=1)

    # Shape of data after dropping correlated features
    print(f'After dropping correlated features: X_train : {X_train.shape} and X_test shape : {X_test.shape}.')

    return X_train, X_test

#### 1.b.2 Second Approach

Identify groups of highly correlated features. And then, make further investigation within these groups to decide which feature to keep and which one to remove.

In [24]:
def build_corrdf(X_train, threshold=0.8):
    """
    Builds a dataframe with the highly correlated feature pairs. Remember that the absolute
    value of the correlation coefficient is important and not the sign.
    Each pair appears twice, once in each direction (a, b) and (b, a).
        Parameters:
            X_train (DataFrame): X_train of Any DataFrame
            threshold (float): Minimum absolute correlation for a pair to be listed. Default value is 0.8.
        Returns:
            corrmat (DataFrame): Columns 'feature1', 'feature2' and 'corr', sorted by
                                 decreasing absolute correlation.
    """
    # Build a dataframe with the correlation between features
    corrmat = X_train.corr()
    corrmat = corrmat.abs().unstack()  # absolute value of corr coef
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']

    # Discard the correlation of a feature with itself by comparing names rather
    # than testing `corr < 1`, which would also discard perfectly correlated
    # pairs of different features.
    is_self_pair = corrmat['feature1'] == corrmat['feature2']
    corrmat = corrmat[~is_self_pair].reset_index(drop=True)

    return corrmat

In [25]:
def find_corrgroups(corrmat, total_features=None):
    """
    Find groups of correlated features.
        Parameters:
            corrmat (DataFrame): Correlated pairs with columns 'feature1', 'feature2' and 'corr'
                                 (the output of build_corrdf).
            total_features (int): Optional total number of features in the dataset; only used
                                  in the printed summary. Default value is None.
        Returns:
            correlated_groups (list): One DataFrame per group, holding the rows of corrmat
                                      whose 'feature1' is the feature that starts the group.
    """
    # find groups of correlated features
    grouped_features = set()   # features already assigned to some group
    correlated_groups = []

    for feature in corrmat.feature1.unique():
        if feature in grouped_features:
            continue

        # find all features correlated to a single feature
        correlated_block = corrmat[corrmat.feature1 == feature]
        grouped_features.update(correlated_block.feature2.unique())
        grouped_features.add(feature)

        # append the block of features to the list
        correlated_groups.append(correlated_block)

    if total_features is None:
        print(f'There are {len(correlated_groups)} correlated groups')
    else:
        print(f'There are {len(correlated_groups)} correlated groups out of {total_features} total features')

    return correlated_groups

In [26]:
def visualize_correlated_features(correlated_groups):
    """
    Print every correlated group, each followed by a blank line. Some groups contain
    only 2 correlated features, others hold several features correlated among themselves.
        Parameters:
            correlated_groups (List): List of correlated feature groups (output of find_corrgroups).
        Returns:
            None --- > print correlated feature groups
    """
    for block in correlated_groups:
        # The extra newline separates consecutive groups
        print(block, end='\n\n')

    return None

In [27]:
def explore_correlatedfeat(correlated_groups, ind_num, X_train, correlated_feature1):
    """
    Print the number of missing values of every feature in one correlated group.
    Among correlated features, the one with less missing data may be the one to keep.
        Parameters:
            correlated_groups (List): List of correlated feature groups (output of find_corrgroups).
            ind_num (int): index number of the group inside correlated_groups
            X_train (DataFrame): X_train of Any DataFrame
            correlated_feature1 (str): The feature the group is built around; it is checked
                                       after the features listed in the group's 'feature2' column.
        Returns:
            None ---> prints missing data for correlated features within one specific group.
    """
    # Pick the group to investigate
    block = correlated_groups[ind_num]

    # Features of the group, followed by the reference feature itself
    candidates = list(block.feature2.unique())
    candidates.append(correlated_feature1)

    for name in candidates:
        missing_count = X_train[name].isnull().sum()
        print(missing_count)

    return None

In [30]:
# Alternative method
from sklearn.ensemble import RandomForestClassifier

def select_predictivecorr(correlated_groups, ind_num, X_train, correlated_feature1, y_train=None):
    """
    Build a random forest using all the features of one correlated group and rank them by
    feature importance. The most predictive feature can then be kept and the remaining
    features of the group removed from the dataset.
        Parameters:
            correlated_groups (List): List of correlated feature groups (output of find_corrgroups).
            ind_num (int): index number of the group inside correlated_groups
            X_train (DataFrame): X_train of Any DataFrame
            correlated_feature1 (str): The feature the group is built around
            y_train (Series): Target values for X_train. Default value is None, in which case
                              a module-level `y_train` is used (previous behaviour).
        Returns:
            importance (DataFrame): Columns 'feature' and 'importance', sorted by decreasing importance.
    """
    if y_train is None:
        # Backward compatibility: this function used to read `y_train` from the
        # notebook's global scope instead of taking it as an argument.
        try:
            y_train = globals()['y_train']
        except KeyError:
            raise NameError("name 'y_train' is not defined; pass it as an argument") from None

    # Investigate further features within one group.
    group = correlated_groups[ind_num]

    features = list(group.feature2.unique()) + [correlated_feature1]
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    # Missing values are replaced by 0 because the forest cannot be fitted on NaN
    rf.fit(X_train[features].fillna(0), y_train)

    # Get the feature importance attributed by the Random Forest model
    importance = pd.concat([pd.Series(features),
                            pd.Series(rf.feature_importances_)], axis=1)

    importance.columns = ['feature', 'importance']

    # sort_values returns a new frame, so the result has to be assigned
    importance = importance.sort_values(by='importance', ascending=False)

    return importance

**Note:**

Neither of the two procedures for removing correlated features is perfect, and some correlated features may escape the loops of code. So it might be worthwhile to check that, after removing the correlated features, there are no correlated features left in the dataset. If there are, repeat the procedure to remove the remaining ones.

### 1.c Statistical - Ranking Methods

These methods still evaluate each feature individually, in the light of the target, we intend them to predict.  Statistical – ranking methods:

    -	Information Gain
    -	Fisher Score
    -	Univariate Tests
    -	Univariate ROC-AUC / RMSE
    
**Two Steps:**

    1-Rank features based on certain criteria / metric
    2-Select features with highest rankings

**Pros and Cons:**

    -	Fast and not computationally expensive
    -	Does not contemplate feature redundancy.  You would have to screen for duplicated and correlated features in previous or posterior steps. Also, these selection procedures do not contemplate feature interaction. 


#### 1.c.1 Information Gain (Mutual Information)

Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y.

Mutual information measures the information that X and Y share: It measures how much knowing one of these variables reduces uncertainty about the other. For example, if X and Y are independent, then knowing X does not give any information about Y and vice versa, so their mutual information is zero. At the other extreme, if X is a deterministic function of Y and Y is a deterministic function of X then all information conveyed by X is shared with Y: knowing X determines the value of Y and vice versa. As a result, in this case the mutual information is the same as the uncertainty contained in Y (or X) alone, namely the entropy of Y (or X). Moreover, this mutual information is the same as the entropy of X and as the entropy of Y. (A very special case of this is when X and Y are the same random variable.)

Measures the mutual dependence of 2 variables

Determines how similar the joint distribution p(X,Y) is to the products of individual distributions p(X)p(Y)
    - If X and Y are independent, their MI is zero
    - If X is deterministic of Y, the MI is the uncertainty in X.

Using sklearn on a regression and classification problem, mutual information can be used to select features.

In [31]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

**Classification**

In [36]:
def visualize_mi(X_train, y_train):
    """
    Draw a bar plot of the mutual information between each feature and the target,
    ordered from the highest to the lowest value.
    This version scores a classification target with mutual_info_classif. For a
    regression target, swap the scoring call for
    mutual_info_regression(X_train.fillna(0), y_train).
    Missing values in X_train are replaced by 0 before scoring.
        Parameters:
            X_train (DataFrame): Training features.
            y_train (DataFrame): Training target.
        Returns:
            None ---> The bar plot is drawn as a side effect.
    """
    # Score every column and label each score with its column name
    scores = pd.Series(
        mutual_info_classif(X_train.fillna(0), y_train),
        index=X_train.columns,
    )

    # Highest MI first
    scores.sort_values(ascending=False).plot.bar(figsize=(20, 8))

    return None

The cut-off threshold for selecting features is arbitrary. One could choose a certain value of MI after studying the plot above. An alternative, and the most frequent, way of selecting features is to select the top 10 or top 20 features, or the features in the top 10th percentile of the MI value distribution.

In [34]:
def mutualinfo_classification(X_train, y_train, n):
    """
    This function applies only on classification models.
    Select the n features with the highest mutual information with the target.
    The smaller the mutual information, the less information the feature has
    about the target.
    Missing values in X_train are replaced by 0 before fitting; X_train itself
    is not modified.
        Parameters:
            X_train (DataFrame): Training features.
            y_train (DataFrame): Training target.
            n (int): Number of top-ranked features to keep.
                     Generally top 10 or 20 features are selected. 
        Returns:
            selected_features (list): Column names of the selected features,
                                      in their original column order.
    """
    # Fit the selector on the training data only
    sel_ = SelectKBest(mutual_info_classif, k=n).fit(X_train.fillna(0), y_train)

    # Read the names from the boolean support mask rather than from the
    # transformed data, which no longer carries the DataFrame column labels.
    selected_features = list(X_train.columns[sel_.get_support()])

    return selected_features

**Regression**

In [37]:
def mutualinfo_regression(X_train, y_train, percent):
    """
    This function applies only on regression models.
    Select the features in the top `percent` percentile of mutual information
    with the target.
    The smaller the mutual information, the less information the feature has
    about the target.
    Missing values in X_train are replaced by 0 before fitting; X_train itself
    is not modified.
        Parameters:
            X_train (DataFrame): Training features.
            y_train (DataFrame): Training target.
            percent (int): Percentile of features to keep, passed to
                           SelectPercentile as `percentile`.
        Returns:
            selected_features (list): Column names of the selected features,
                                      in their original column order.
    """
    # A percentile cut-off needs SelectPercentile; SelectKBest only takes `k`
    sel_ = SelectPercentile(mutual_info_regression, percentile=percent).fit(
        X_train.fillna(0), y_train)

    # Read the names from the boolean support mask rather than from the
    # transformed data, which no longer carries the DataFrame column labels.
    selected_features = list(X_train.columns[sel_.get_support()])

    return selected_features

#### 1.c.2 Fisher Score - Chi-square

Compute chi-squared stats between each non-negative feature and class. 

- This score should be used to evaluate categorical variables in a classification task.

It compares the observed distribution of the different classes of target Y among the different categories of the feature, against the expected distribution of the target classes, regardless of the feature categories.

In [38]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile

In [39]:
def fisher_score(X_train, y_train):
    """
    Rank features by the p-value of a chi-squared test against the target.
    Intended for categorical variables in a classification task.
    Missing values in X_train are replaced by 0 before the test is computed.
        Parameters:
            X_train (DataFrame): Training features.
            y_train (DataFrame): Training target.
        Returns:
            pvalues (Series): One p-value per feature, indexed by column name
                              and sorted from the largest to the smallest.
                              The smaller the p-value, the more significant
                              the feature is to predict the target variable.
    """
    # chi2 returns two arrays; only the second one (the p-values) is used here
    _, raw_pvalues = chi2(X_train.fillna(0), y_train)

    # Attach the column names, then order for clearer visualisation
    labelled = pd.Series(raw_pvalues, index=X_train.columns)

    return labelled.sort_values(ascending=False)

Keep in mind that, contrary to MI, where we were interested in the higher MI values, for the Fisher score the smaller the p_value, the more significant the feature is for predicting the target in the dataset. 

**Note**

One thing to keep in mind when using Fisher score or univariate selection methods, is that in very big datasets, most of the features will show a small p_value, and therefore look like they are highly predictive. This is in fact an effect of the sample size. So care should be taken when selecting features using these procedures. An ultra tiny p_value does not highlight an ultra-important feature, it rather indicates that the dataset contains too many samples. 

If the dataset contained several categorical variables, we could then combine this procedure with SelectKBest or SelectPercentile.

#### 1.c.3 Univariate Tests - ANOVA

Univariate feature selection works by selecting the best features based on univariate statistical tests (ANOVA). The methods based on F-test estimate the degree of linear dependency between two random variables. They assume a linear relationship between the feature and the target. These methods also assume that the variables follow a Gaussian distribution.

These assumptions may not always hold for the variables in your dataset, so if you are looking to implement these procedures, you will need to verify them first.

In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import f_classif, f_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

**Classification**

In [42]:
# For classification models
def univariate_anova(X_train, y_train, n):
    """
    Classification variant: plot the ANOVA F-test p-value of every feature and
    select the n best features with SelectKBest(f_classif).
    Missing values in X_train are replaced by 0 before the tests; X_train
    itself is not modified.
        Parameters:
            X_train (DataFrame): Training features.
            y_train (DataFrame): Training target.
            n (int): Number of top-ranked features to keep.
                     Generally top 10 or 20 features are selected. 
        Returns:
            univariate_plot (plot): Bar plot of the p-value of each feature,
                                    ordered from the largest to the smallest.
            selected_features (list): Column names of the selected features,
                                      in their original column order.
    """    
    X_filled = X_train.fillna(0)

    # f_classif returns (F-scores, p-values); keep the p-values, labelled by column
    pvalues = pd.Series(f_classif(X_filled, y_train)[1], index=X_train.columns)

    # Plot the p values
    univariate_plot = pvalues.sort_values(ascending=False).plot.bar(figsize=(20, 8))

    # Select the top n features
    sel_ = SelectKBest(f_classif, k=n).fit(X_filled, y_train)

    # Read the names from the boolean support mask rather than from the
    # transformed data, which no longer carries the DataFrame column labels.
    selected_features = list(X_train.columns[sel_.get_support()])

    return univariate_plot, selected_features

Remember that the lower the p_value, the more predictive the feature is in principle. There are a few features that do not seem to have predictive power according to these tests, which are those on the left with pvalues above 0.05. Given that in statistics one typically uses a pvalue of 0.05 as a cut-off, it is generally believed that features with pvalue > 0.05 are indeed not important. However, keep in mind that this test assumes a linear relationship, so it might also be the case that the feature is related to the target but not in a linear manner.

Further investigation is needed if we want to know the true nature of the relationship between feature and target.

In big datasets, it is not unusual for the p-values of the different features to be really small. This does not say much about the relevance of the features. Mostly it indicates that the dataset is big.

Once again, where we put the cut-off to select features is a bit arbitrary. One way is to select the top 10, 20 features. Alternatively, the top 10th percentile. For this, you can use anova in combination with SelectKBest or SelectPercentile from sklearn. 

**Regression**

In [44]:
# For regression models
def univariate_anova(X_train, y_train, n):
    """
    Regression variant: plot the univariate F-test p-value of every feature and
    select the features in the top percentile with SelectPercentile(f_regression).
    Missing values in X_train are replaced by 0 before the tests; X_train
    itself is not modified.
    NOTE: this function has the same name as the classification variant, so
    whichever cell is executed last replaces the other.
        Parameters:
            X_train (DataFrame): Training features.
            y_train (DataFrame): Training target.
            n (int): Percentile of features to keep, passed to
                     SelectPercentile as `percentile`.
        Returns:
            univariate_plot (plot): Bar plot of the p-value of each feature,
                                    ordered from the largest to the smallest.
            selected_features (list): Column names of the selected features,
                                      in their original column order.
    """    
    X_filled = X_train.fillna(0)

    # f_regression returns (F-scores, p-values); keep the p-values, labelled by column
    pvalues = pd.Series(f_regression(X_filled, y_train)[1], index=X_train.columns)

    # Plot the p values
    univariate_plot = pvalues.sort_values(ascending=False).plot.bar(figsize=(20, 8))

    # Select the top n percentile (the third argument is the percentile)
    sel_ = SelectPercentile(f_regression, percentile=n).fit(X_filled, y_train)

    # Read the names from the boolean support mask rather than from the
    # transformed data, which no longer carries the DataFrame column labels.
    selected_features = list(X_train.columns[sel_.get_support()])

    return univariate_plot, selected_features

#### 1.c.4 Univariate - ROC AUC / RMSE

This procedure works as follows:

- First, it builds one decision tree per feature, to predict the target
- Second, it makes predictions using the decision tree and the mentioned feature
- Third, it ranks the features according to the machine learning metric (roc-auc or mse)
- It selects the highest ranked features

In [45]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

**Classification**

In [46]:
def univariate_rocauc(X_train, X_test, y_train, y_test, cut_off=0.5):
    """
    Score every feature on its own with a single-feature decision tree and keep
    the features whose test-set ROC AUC is above cut_off.
    Suited for all types of variables. 
    Makes no assumption on the distribution of the variables.
    The higher the ROC AUC, the better the variable predicts the target.
    Missing values are replaced by 0 before fitting and predicting.
        Parameters:
            X_train, X_test, y_train, y_test (DataFrame): Splitted X and y DataFrames for training/test sets. 
            cut_off (float): Threshold for the ROC AUC score. The default value is 0.5;
                             a score of 0.5 indicates a random decision.
        Returns:
            roc_plot (plot): ROC AUC values plot in descending order. 
            selected_featureslist (list): Names of the features whose ROC AUC is above
                                          cut_off, ordered from the highest score down.
    """
    # loop to build a tree, make predictions and get the roc-auc for each feature of the train set
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].fillna(0).to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].fillna(0).to_frame())
        # column 1 of predict_proba is used as the score
        # NOTE(review): this presumes a binary target - confirm against callers
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    
    # Add the variable names and order it for clearer visualisation
    roc_values = pd.Series(roc_values, index=X_train.columns).sort_values(ascending=False)
    
    # Plot
    roc_plot = roc_values.plot.bar(figsize=(20, 8))
    
    # Check how many features show a roc-auc value higher than the cut-off
    selected_features = roc_values[roc_values > cut_off]
    print(f'{len(selected_features)} features show a roc-auc value higher than {cut_off}.')
    
    # Return the feature names (the index), not the scores themselves
    selected_featureslist = selected_features.index.tolist()
    
    return roc_plot, selected_featureslist

You may tune the parameters of the Decision Tree and get better predictions. It is up to you. But remember that the key here is not to make ultra predictive Decision Trees, rather to use them to screen quickly for important features. So I would recommend you don't spend too much time tuning. Doing cross validation with sklearn should be very straight forward to get a more accurate measure of the roc-auc per feature.

Once again, where we put the cut-off to select features is a bit arbitrary, other than > 0.5. It will be up to you.

**Regression**

In [48]:
def univariate_rmse(X_train, X_test, y_train, y_test, cut_off=0.5):
    """
    Score every feature on its own with a single-feature decision tree regressor
    and keep the features whose test-set error is below cut_off.
    Suited for all types of variables. 
    Makes no assumption on the distribution of the variables.
    The lower the error, the better the variable predicts the target.
    NOTE: despite the function name and the printed message, the metric computed
    is the mean squared error (no square root is taken), so cut_off is compared
    against MSE values.
    Missing values are replaced by 0 before fitting and predicting.
        Parameters:
            X_train, X_test, y_train, y_test (DataFrame): Splitted X and y DataFrames for training/test sets. 
            cut_off (float): Threshold for the error. Where to put the 
                             cut-off is arbitrary. It depends on how many features 
                             you would like to end up with.
        Returns:
            mse_plot (plot): mse values plot in descending order. 
            selected_featureslist (list): Names of the features whose error is below
                                          cut_off, ordered from the highest error down.
    """ 
    # loop to build a tree, make predictions and get the mse for each feature of the train set
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].fillna(0).to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].fillna(0).to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    
    # Add the variable names and order it for clearer visualisation
    mse_values = pd.Series(mse_values, index=X_train.columns).sort_values(ascending=False)
    
    # Plot
    mse_plot = mse_values.plot.bar(figsize=(20, 8))
    
    # Check how many features show an error lower than the cut-off value
    selected_features = mse_values[mse_values < cut_off]
    print(f'{len(selected_features)} features show a rmse value lower than {cut_off}.')
    
    # Return the feature names (the index), not the error values themselves
    selected_featureslist = selected_features.index.tolist()
    
    return mse_plot, selected_featureslist

Remember that for regression, the smaller the mse, the better the model performance is. So in this case, we need to select from the right to the left.

For the mse, where to put the cut-off is arbitrary as well. It depends on how many features you would like to end up with.

It is good practice to use this method when you have an enormous amount of features and need to start reducing the feature space quickly.

#### 1.c.5 Select Features by mean encoding (Alternative Filter Methods: Alternative non-mainstream method)

I will describe the feature selection approach undertaken by data scientists at the University of Melbourne for the [KDD 2009](http://www.kdd.org/kdd-cup/view/kdd-cup-2009) data science competition. The task consisted in predicting churn based on a dataset with a huge number of features.

The authors describe this procedure as an aggressive non-parametric feature selection procedure, that is based in contemplating the relationship between the feature and the target. Therefore, this method should be classified as a filter method.

**The procedure consists in the following steps**:

For each categorical variable:

    1) Separate into train and test

    2) Determine the mean value of the target within each label of the categorical variable using the train set

    3) Use that mean target value per label as the prediction in the test set and calculate the roc-auc.

For each numerical variable:

    1) Separate into train and test
    
    2) Divide the variable into 100 quantiles

    3) Calculate the mean target within each quantile using the training set 

    4) Use that mean target value / bin as the prediction on the test set and calculate the roc-auc


The authors quote the following advantages of the method:

- Speed: computing mean and quantiles is direct and efficient
- Stability respect to scale: extreme values for continuous variables do not skew the predictions
- Comparable between categorical and numerical variables
- Accommodation of non-linearities

In [47]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

**For categorical features**

The examples below are based on the Titanic dataset to make the method easier to understand. 

First, feature selection procedure over categorical variables will be demonstrated. The Titanic dataset contains 4 categorical variables, which are Sex, Pclass, Cabin and Embarked, which is defined as `categorical_list` in the function.

In the next cell I create a function that calculates the mean of Survival (and this is equivalent to the probability of survival) of the passenger, within each label of a categorical variable. It creates a dictionary, using the training set only, that maps each label of the training set variable, to a probability of survival.

Then, the function replaces the label both in train and test set, by the probability of survival. It is like making a prediction on the outcome, by using only the label of the variable.

In this way, the function replaces the original strings, by probabilities.

The bottom line of this method is that we use just the label of the variable to estimate the probability of survival of the passenger. A bit like "Tell me which one was your Cabin, and I will tell you your probability of Survival".

If the labels of a categorical variable and therefore the categorical variable are good predictors, then, we should obtain a roc-auc above 0.5 for that variable, when we evaluate those probabilities with the real outcome, which is whether the passenger survived or not.

In [50]:
def mean_encoding(df_train, df_test, categorical_list, target_var):
    """
    Replace each label of the given categorical columns by the mean of the target
    for that label (for a 0/1 target this is the probability of the target).
    The label -> mean mapping is built from the training set only and applied to
    both sets. Labels that occur only in the test set are mapped to NaN.
    The input DataFrames are not modified.
        Parameters:
            df_train (DataFrame): Train Dataset, including the target column. 
            df_test (DataFrame): Test Dataset, including the target column.
            categorical_list (List): List of categorical features in the dataset. 
            target_var (str): Name of the target variable 
        Returns:
            df_train_temp (DataFrame): Train set after mean encoding, target column dropped
            df_test_temp (DataFrame): Test set after mean encoding, target column dropped
    """
    # work on copies so the caller's dataframes stay untouched
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()
    
    for col in categorical_list:
        # map each label / category to the mean target for that label (train only)
        risk_dict = df_train.groupby(col)[target_var].mean().to_dict()
        
        # re-map the labels
        df_train_temp[col] = df_train[col].map(risk_dict)
        df_test_temp[col] = df_test[col].map(risk_dict)
    
    # drop the target column named by the caller (not a fixed column name)
    df_train_temp = df_train_temp.drop(columns=[target_var])
    df_test_temp = df_test_temp.drop(columns=[target_var])
    
    return df_train_temp, df_test_temp

In [51]:
def calculate_rocauc(categorical_list, y_test, df_test_temp):
    """
    Compute one ROC-AUC per feature, using the values stored in the feature's
    column of df_test_temp as the prediction and y_test as the true target.
        Parameters:
            categorical_list (list): Names of the columns of df_test_temp to score.
            y_test (array): y_test after train and test split.         
            df_test_temp(DataFrame): X_test data after the categorical variables
                                     were replaced by their mean-target values.
        Returns:
            m1 (Series): ROC-AUC score for each listed feature, indexed by
                         feature name and sorted from the highest to the lowest.
    """
    roc_values = [roc_auc_score(y_test, df_test_temp[feature])
                  for feature in categorical_list]
    
    # Make series for easy visualization; sort_values returns a new Series,
    # so its result has to be kept for the ordering to take effect.
    m1 = pd.Series(roc_values, index=categorical_list).sort_values(ascending=False)
    
    return m1

**For numerical features**

The procedure is exactly the same, but it requires one additional first step which is to divide the continuous variable into bins. The authors of the method divide the variable in 100 quantiles, that is 100 bins. In principle, you could divide the variable in less bins.

In [52]:
def mean_encoding(n, feature, X_train, X_test, y_test, target_var='Survived'):
    """
    Score one numerical feature by quantile-binned mean encoding.
    The feature is cut into (at most) n quantile bins learned on the training set,
    each bin is replaced by the mean of the target inside that bin (training set
    only), and those means are used as the prediction on the test set to compute
    a ROC-AUC.
    X_train and X_test are not modified.
        Parameters:
            n (int): Requested number of quantile bins. Fewer bins are produced
                     when several quantile edges coincide.
            feature (str): Specific numerical feature
            X_train (DataFrame): Train set. Must contain both `feature` and the
                                 target column `target_var`.
            X_test (DataFrame): Test set. Must contain `feature`.
            y_test (DataFrame): True target values for the test set.
            target_var (str): Name of the target column in X_train.
                              Default value is 'Survived'.
        Returns:
            score (float): ROC-AUC value
    """
    # Learn the quantile edges on the training set only. retbins=True returns
    # the edges so the same cut points can be applied to the test set.
    # duplicates='drop' merges repeated edges, so fewer than n bins may come back.
    _, intervals = pd.qcut(
        X_train[feature],
        n,
        retbins=True,
        precision=3,
        duplicates='drop')

    # One label per bin actually produced (a fixed list of n labels would not
    # match the edges whenever duplicated edges were dropped)
    labels = ['Q' + str(i + 1) for i in range(len(intervals) - 1)]

    # Bin both sets with the training edges; include_lowest keeps the smallest
    # training value inside the first bin
    train_binned = pd.cut(X_train[feature], bins=intervals, labels=labels,
                          include_lowest=True)
    test_binned = pd.cut(X_test[feature], bins=intervals, labels=labels,
                         include_lowest=True)

    # Values that fall in no bin (missing, or outside the training range) get
    # their own "Missing" category; recast to object first so the new label is accepted
    train_binned = train_binned.astype('O').fillna('Missing')
    test_binned = test_binned.astype('O').fillna('Missing')

    # Map each bin to the mean of the target inside it (training set only)
    train_target = X_train[target_var]
    risk_dict = train_target.groupby(train_binned.to_numpy()).mean().to_dict()

    # Replace the test bins by those means. A bin never seen in training has no
    # mean of its own, so it falls back to the overall training mean instead of
    # leaving a NaN in the scores.
    test_scores = test_binned.map(risk_dict).fillna(train_target.mean())

    # Calculate a roc-auc value, using the mapped means as the prediction
    # and comparing it with the true target
    score = roc_auc_score(y_test, test_scores)

    return score

The authors mention that by using this method, you are able to compare directly numerical with categorical variables. In a sense this is true, however we need to keep in mind, that categorical variables may or may not (and typically they will not) show the same percentage of observations per label. However, when we divide a numerical variable into quantile bins, we guarantee that each bin shows the same percentage of observations.

Alternatively, instead of binning into quantiles, we can bin into equal-distance bins. The way to do this is to calculate the range (max value - min value) and divide that distance by the number of bins we want to construct. That determines the cut-points for the bins.

## 2. Wrapper Methods:

- Greedy search algorithms.

- Utilize a specific classifier to select the optimal set of features

- Sequential feature selection algorithms add or remove one feature at the time based on the classifier performance until a feature subset of the desired size k is reached, or any other desired criteria is met

### 2.a Step Forward Feature Selection

Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial d-dimensional feature space to a k-dimensional feature subspace where k < d.

Step forward feature selection starts by evaluating all features individually and selects the one that generates the best performing algorithm, according to a pre-set evaluation criteria. In the second step, it evaluates all possible combinations of the selected feature and a second feature, and selects the pair that produce the best performing algorithm based on the same pre-set criteria.

The pre-set criteria can be the roc_auc for classification and the r squared for regression for example.

This selection procedure is called greedy, because it evaluates all possible single, double, triple and so on feature combinations. Therefore, it is quite computationally expensive, and sometimes, if feature space is big, even unfeasible.

There is a special package for python that implements this type of feature selection: `mlxtend`.

In the mlxtend implementation of the step forward feature selection, the stopping criteria is an arbitrarily set number of features. So the search will finish when we reach the desired number of selected features.

This is somewhat arbitrary because we may be selecting a suboptimal number of features, or likewise, a high number of features.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Before starting, find and remove correlated features in the dataset and then 
# execute step forward selection --- Look at the Filter-Correlation section!

**Classification**

In [None]:
def stepforwardselect(X_train, y_train, 
                      performance_score='roc_auc', n=10):
    """
    Run step forward feature selection with a random forest classifier and
    report which n columns were retained.
    Missing values in X_train are replaced by 0 before the search.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set
            performance_score (str): Scoring metric handed to the selector.
                                     Default value is 'roc_auc'.
            n (int): Total number of selected features. Default value is 10.              
        Returns:
            selected_feat (Index): Column labels of X_train picked by the search
    """
    estimator = RandomForestClassifier(n_jobs=4)

    # forward=True / floating=False: plain step forward search, 3-fold CV
    selector = SFS(
        estimator,
        k_features=n,
        forward=True,
        floating=False,
        verbose=2,
        scoring=performance_score,
        cv=3,
    )

    # The selector is fitted on a bare array, so it reports column positions
    feature_matrix = np.array(X_train.fillna(0))
    selector = selector.fit(feature_matrix, y_train)

    # Translate the positions back into column labels
    chosen_positions = list(selector.k_feature_idx_)

    return X_train.columns[chosen_positions]

**Regression**

In [None]:
def stepforwardselect(X_train, y_train, 
                      performance_score='r2', n=10):
    """
    Run step forward feature selection with a random forest regressor and
    report which n columns were retained.
    Missing values in X_train are replaced by 0 before the search.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set
            performance_score (str): Scoring metric handed to the selector.
                                     Default value is 'r2'.
            n (int): Total number of selected features. Default value is 10.              
        Returns:
            selected_feat (Index): Column labels of X_train picked by the search
    """
    estimator = RandomForestRegressor()

    # forward=True / floating=False: plain step forward search, 3-fold CV
    selector = SFS(
        estimator,
        k_features=n,
        forward=True,
        floating=False,
        verbose=2,
        scoring=performance_score,
        cv=3,
    )

    # The selector is fitted on a bare array, so it reports column positions
    feature_matrix = np.array(X_train.fillna(0))
    selector = selector.fit(feature_matrix, y_train)

    # Translate the positions back into column labels
    chosen_positions = list(selector.k_feature_idx_)

    return X_train.columns[chosen_positions]

### 2.b Step Backwards Feature Selection

Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial d-dimensional feature space to a k-dimensional feature subspace where k < d.

Step backward feature selection starts by fitting a model using all features. Then it removes one feature. It will remove the one that produces the highest performing algorithm for a certain evaluation criteria. In the second step, it will remove a second feature, the one that again produces the best performing algorithm. And it proceeds, removing feature after feature, until a certain criteria is met.

The pre-set criteria can be the roc_auc for classification and the r squared for regression for example.

This selection procedure is called greedy, because it evaluates all possible n, and then n-1 and n-2 and so on feature combinations. Therefore, it is quite computationally expensive, and sometimes, if feature space is big, even unfeasible.

There is a special package for python that implements this type of feature selection: `mlxtend`.

In the mlxtend implementation of the step backward feature selection, the stopping criteria is an arbitrarily set number of features. So the search will finish when we reach the desired number of selected features.

This is somewhat arbitrary because we may be selecting a suboptimal number of features, or likewise, a high number of features.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Before starting find and remove correlated features --- Look at the Filter-Correlation section!

**Classification**

In [None]:
def stepbackwardselect(X_train, y_train, 
                      performance_score='roc_auc', n=10,
                      selection_model=None):
    """
    Step backward feature selection for a classification dataset: mlxtend's 
    SequentialFeatureSelector is run with forward=False to keep n features, scoring 
    candidate subsets with 3-fold cross-validation on `performance_score`.
    Missing values in X_train are replaced with 0 before fitting.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set
            performance_score (str): For classification dataset default value is 'roc_auc'
            n (int): Total number of selected features. Default value is 10.  
            selection_model (estimator): Classifier used to score the feature subsets. 
                                         Default value is None, in which case a 
                                         RandomForestClassifier() is created.
        Returns:
            selected_feat (Index): column labels of X_train selected by step backward feature selection
    """
    # The original body referenced an undefined global `selection_model`; default to a
    # random forest classifier, mirroring the regression variant of this function.
    if selection_model is None:
        selection_model = RandomForestClassifier()

    # Step backwards feature selection: keep n features from the total, 
    # chosen according to the cross-validated performance score
    sfs1 = SFS(selection_model, 
               k_features=n, 
               forward=False, 
               floating=False, 
               verbose=2,
               scoring=performance_score,
               cv=3)

    # The selector is fitted on a plain array, so it reports positional indices
    sfs1 = sfs1.fit(np.array(X_train.fillna(0)), y_train)

    # Map the positional indices back to the DataFrame column labels
    selected_feat = X_train.columns[list(sfs1.k_feature_idx_)]

    return selected_feat

**Regression**

In [None]:
def stepbackwardselect(X_train, y_train, 
                      performance_score='r2', n=10):
    """
    Step backward feature selection for a regression dataset: mlxtend's 
    SequentialFeatureSelector is run with forward=False around a RandomForestRegressor 
    to keep n features, scoring candidate subsets with 3-fold cross-validation.
    Missing values in X_train are replaced with 0 before fitting.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set
            performance_score (str): Scoring metric. For regression dataset default value is 'r2'
            n (int): Total number of selected features. Default value is 10.  
        Returns:
            selected_feat (Index): column labels of X_train selected by step backward feature selection
    """
    # Configure the backward (forward=False), non-floating sequential selector
    backward_selector = SFS(
        RandomForestRegressor(),
        k_features=n,
        forward=False,
        floating=False,
        verbose=2,
        scoring=performance_score,
        cv=3,
    )

    # Fit on a plain array with NaNs replaced by 0
    filled_matrix = np.array(X_train.fillna(0))
    backward_selector = backward_selector.fit(filled_matrix, y_train)

    # Translate the selected positional indices into column labels
    kept_positions = list(backward_selector.k_feature_idx_)
    return X_train.columns[kept_positions]

### 2.c Exhaustive Feature Selection

Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial d-dimensional feature space to a k-dimensional feature subspace where k < d.

In an exhaustive feature selection the best subset of features is selected, over all possible feature subsets, by optimizing a specified performance metric for a certain machine learning algorithm. For example, if the classifier is a logistic regression and the dataset consists of 4 features, the algorithm will evaluate all 15 feature combinations as follows:

- all possible combinations of 1 feature
- all possible combinations of 2 features
- all possible combinations of 3 features
- all the 4 features
- and select the one that results in the best performance (e.g., classification accuracy) of the logistic regression classifier.

Unlike the step-wise methods above, this is a brute-force search: it evaluates all possible feature combinations. It is quite computationally expensive, and sometimes, `if the feature space is big, even unfeasible`.

There is a special package for python that implements this type of feature selection: `mlxtend`.

In the mlxtend implementation of the exhaustive feature selection, the stopping criteria is an arbitrarily set number of features. So the search will finish when we reach the desired number of selected features.

This is somewhat arbitrary because we may be selecting a suboptimal number of features, or likewise, too high a number of features.

In [None]:
# Find and remove correlated features in order to reduce the feature space a bit so that the algorithm takes shorter. 

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

**Classification**

In [None]:
def exhaustiveselect(X_train, y_train, n_max,
                     performance_score='roc_auc', n_min=1):
    """
    Exhaustive feature selection for a classification dataset using mlxtend's 
    ExhaustiveFeatureSelector around a RandomForestClassifier (2-fold cross-validation).
    Note that only the first n_max columns of X_train are passed to the search, and 
    missing values are replaced with 0 before fitting.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set
            n_max (int): Maximum number of selected features; also the number of leading 
                         columns of X_train that take part in the search.
            performance_score (str): For classification dataset, the default value is 'roc_auc'
            n_min (int): Minimum number of selected features. Default value is 1.
        Returns:
            selected_feat (Index): column labels of the best feature subset found
    """
    # Configure the exhaustive search over subsets of size n_min..n_max
    exhaustive_search = EFS(
        RandomForestClassifier(n_jobs=4),
        min_features=n_min,
        max_features=n_max,
        scoring=performance_score,
        print_progress=True,
        cv=2,
    )

    # Restrict the candidates to the first n_max columns and fill NaNs with 0
    candidate_columns = X_train.columns[0:n_max]
    candidate_matrix = np.array(X_train[candidate_columns].fillna(0))
    exhaustive_search = exhaustive_search.fit(candidate_matrix, y_train)

    # best_idx_ holds positional indices; map them back to column labels
    best_positions = list(exhaustive_search.best_idx_)
    return X_train.columns[best_positions]

**Regression**

In [None]:
def exhaustiveselect(X_train, y_train, n_max,
                     performance_score='r2', n_min=1):
    """
    Exhaustive feature selection for a regression dataset using mlxtend's 
    ExhaustiveFeatureSelector around a RandomForestRegressor (3-fold cross-validation).
    Note that only the first n_max columns of X_train are passed to the search, and 
    missing values are replaced with 0 before fitting.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set
            n_max (int): Maximum number of selected features; also the number of leading 
                         columns of X_train that take part in the search.
            performance_score (str): For regression dataset, the default value is 'r2'
            n_min (int): Minimum number of selected features. Default value is 1.
        Returns:
            selected_feat (Index): column labels of the best feature subset found
    """
    # Configure the exhaustive search over subsets of size n_min..n_max
    exhaustive_search = EFS(
        RandomForestRegressor(n_jobs=4),
        min_features=n_min,
        max_features=n_max,
        scoring=performance_score,
        print_progress=True,
        cv=3,
    )

    # Restrict the candidates to the first n_max columns and fill NaNs with 0
    candidate_columns = X_train.columns[0:n_max]
    candidate_matrix = np.array(X_train[candidate_columns].fillna(0))
    exhaustive_search = exhaustive_search.fit(candidate_matrix, y_train)

    # best_idx_ holds positional indices; map them back to column labels
    best_positions = list(exhaustive_search.best_idx_)
    return X_train.columns[best_positions]

This exhaustive search is very computationally expensive. Unless you have access to a multicore or distributed computer system I don't see how you can get the most out of it. 

## 3. Embedded Methods

### 3.a Lasso Regularization

Regularisation consists in adding a penalty to the different parameters of the machine learning model to reduce the freedom of the model and in other words to avoid overfitting. In linear model regularisation, the penalty is applied over the coefficients that multiply each of the predictors. From the different types of regularisation, Lasso or l1 has the property that is able to shrink some of the coefficients to zero. Therefore, that feature can be removed from the model.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

**Classification**

In [None]:
def lasso_regularization(X_train, y_train, X_test=None, C=1):
    """
    Scale features and use lasso (L1) regularization to shrink the coefficients of unimportant features. 
    This function is for only classification dataset. 
    The logistic regression is fitted on standardised training data (missing values filled with 0); 
    the returned arrays contain the 0-filled, unscaled values of the selected columns.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set      
            X_test (DataFrame): X_test set with the same columns as X_train. Optional: when omitted, 
                                a module-level variable named X_test is used if one exists.
            C (float): Inverse regularisation strength passed to LogisticRegression. 
                       Default value is 1.
        Returns:
            X_train_selected (array): X_train restricted to the features kept by the lasso selection.
            X_test_selected (array): X_test restricted to the features kept by the lasso selection.
        Raises:
            ValueError: If X_test is neither passed nor defined at module level.
    """
    # Backward compatibility: the original body read X_test from the notebook globals
    if X_test is None:
        X_test = globals().get('X_test')
    if X_test is None:
        raise ValueError('X_test must be passed as an argument or defined at module level.')

    print('Data shape before applying lasso regularization:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')
    
    X_train_filled = X_train.fillna(0)
    X_test_filled = X_test.fillna(0)

    # linear models benefit from feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_filled)
    
    # Use the selectFromModel object from sklearn, which will select in theory 
    # the features which coefficients are non-zero.
    # The L1 penalty needs a solver that supports it; the default lbfgs solver does not.
    sel_ = SelectFromModel(LogisticRegression(C=C, penalty='l1', solver='liblinear'))
    sel_.fit(X_train_scaled, y_train)
    
    # Boolean mask of the features kept by the selector
    kept_mask = sel_.get_support()
    selected_feat = X_train.columns[kept_mask]

    # coef_ has one row per class for multiclass problems; a feature counts as 
    # "shrunk to zero" only when its coefficient is zero in every row
    coef_matrix = np.atleast_2d(sel_.estimator_.coef_)
    zero_coef_mask = np.all(coef_matrix == 0, axis=0)

    print('Total features: {}'.format(X_train.shape[1]))
    print('Selected features: {}'.format(len(selected_feat)))
    print('Features with coefficients shrank to zero: {}'.format(
        np.sum(zero_coef_mask)))
    
    # Identify the removed features (exactly the columns the selector drops)
    removed_feats = X_train.columns[~kept_mask]
    print('Removed features: {}'.format(removed_feats))
    
    # Remove the features from the training and testing set.
    # Plain arrays are passed because the selector was fitted on an array.
    X_train_selected = sel_.transform(np.array(X_train_filled))
    X_test_selected = sel_.transform(np.array(X_test_filled))

    print('Data shape after applying lasso regularization:\n'
          f'X_train : {X_train_selected.shape}, X_test: {X_test_selected.shape}')

    return X_train_selected, X_test_selected

**Regression**

In [None]:
def lasso_regularization(X_train, y_train, X_test=None, alpha=100):
    """
    Scale features and use lasso regularization to shrink the coefficients of unimportant features. 
    This function is for only regression dataset. 
    The Lasso model is fitted on standardised training data (missing values filled with 0); 
    the returned arrays contain the 0-filled, unscaled values of the selected columns.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set      
            X_test (DataFrame): X_test set with the same columns as X_train. Optional: when omitted, 
                                a module-level variable named X_test is used if one exists.
            alpha (float): Regularisation strength passed to Lasso. Default value is 100.
        Returns:
            X_train_selected (array): X_train restricted to the features kept by the lasso selection.
            X_test_selected (array): X_test restricted to the features kept by the lasso selection.
        Raises:
            ValueError: If X_test is neither passed nor defined at module level.
    """
    # Backward compatibility: the original body read X_test from the notebook globals
    if X_test is None:
        X_test = globals().get('X_test')
    if X_test is None:
        raise ValueError('X_test must be passed as an argument or defined at module level.')

    print('Data shape before applying lasso regularization:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')
    
    X_train_filled = X_train.fillna(0)
    X_test_filled = X_test.fillna(0)

    # linear models benefit from feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_filled)
    
    # Use the selectFromModel object from sklearn, which will select in theory 
    # the features which coefficients are non-zero
    sel_ = SelectFromModel(Lasso(alpha=alpha))
    sel_.fit(X_train_scaled, y_train)
    
    # Boolean mask of the features kept by the selector
    kept_mask = sel_.get_support()
    selected_feat = X_train.columns[kept_mask]

    # coef_ is 2-D for multi-target y; a feature counts as "shrunk to zero" 
    # only when its coefficient is zero for every target
    coef_matrix = np.atleast_2d(sel_.estimator_.coef_)
    zero_coef_mask = np.all(coef_matrix == 0, axis=0)

    print('Total features: {}'.format(X_train.shape[1]))
    print('Selected features: {}'.format(len(selected_feat)))
    print('Features with coefficients shrank to zero: {}'.format(
        np.sum(zero_coef_mask)))
    
    # Identify the removed features (exactly the columns the selector drops)
    removed_feats = X_train.columns[~kept_mask]
    print('Removed features: {}'.format(removed_feats))
    
    # Remove the features from the training and testing set.
    # Plain arrays are passed because the selector was fitted on an array.
    X_train_selected = sel_.transform(np.array(X_train_filled))
    X_test_selected = sel_.transform(np.array(X_test_filled))

    print('Data shape after applying lasso regularization:\n'
          f'X_train : {X_train_selected.shape}, X_test: {X_test_selected.shape}')

    return X_train_selected, X_test_selected

As we can see, both for linear and logistic regression we used the Lasso regularisation to remove non-important features from the dataset. Keep in mind that increasing the penalisation will increase the number of features removed. Therefore, you will need to keep an eye on the penalty and make sure that it is neither so high that even important features are removed, nor so low that non-important features are kept.

Having said this, if the penalty is too high and important features are removed, you should notice a drop in the performance of the algorithm and then realise that you need to decrease the regularisation.

## 3.b Regression Coefficients

Linear regression is a straightforward approach for predicting a quantitative response Y on the basis of several predictor variables X1, X2, ... Xn. It assumes that there is a linear relationship between X(s) and Y. Mathematically, we can write this linear relationship as Y ≈ β0 + β1X1 + β2X2 + ... + βnXn.

**The magnitude of the coefficients is directly influenced by the scale of the features**. Therefore, to compare coefficients across features, it is important to have all features within the same scale. This is why normalisation is important for variable importance and feature selection in linear models. Normalisation is important as well for model performance.

In addition, Linear Regression makes the following assumptions over the predictor variables X:
- Linear relationship with the outcome Y
- Multivariate normality (X should follow a Gaussian distribution)
- No or little multicollinearity (Xs should not be linearly related to one another)
- Homoscedasticity (variance should be the same)

Homoscedasticity, also known as homogeneity of variance, describes a situation in which the error term (that is, the “noise” or random disturbance in the relationship between the independent variables (Xs) and the dependent variable (Y)) is the same across all values of the independent variables.

Therefore, there are a lot of assumptions that need to be met in order to make a fair comparison of the features by using only their regression coefficients.

In addition, these coefficients may be penalised by regularisation, therefore being smaller than if we were to compare only that individual feature with the target.

Having said this, you can still select features based on linear regression coefficients, provided you keep all of these in mind at the time of analysing the outcome.

Personally, this is not my selection method of choice, although it is useful for interpreting the output of the model.

**Linear Regression Coefficients**

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [55]:
def linear_coefficients_regression(X_train, y_train, binsize=50, X_test=None):
    """
    Train a Linear regression and select features with higher coefficients.
    The LinearRegression object from sklearn is a non-regularised linear method, 
    so there is no need to specify a penalty or other parameters.
    SelectFromModel is used with its default threshold; the model is fitted on 
    standardised training data with missing values filled with 0.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set  
            binsize (int): Bin in the histogram. Default value is 50.
            X_test (DataFrame): X_test set, used only to report its shape. Optional: when omitted, 
                                a module-level variable named X_test is used if one exists; 
                                if none exists the test shape is simply not printed.
        Returns:
            selected_feat (Index): Column labels of the features which are selected after applying 
                                   linear coefficients feature selection technique. 
            selected_plot (plot): Histogram plot of the absolute coefficients of all features 
    """
    # Backward compatibility: the original body read X_test from the notebook globals
    if X_test is None:
        X_test = globals().get('X_test')

    before_msg = f'Before applying linear coefficients feature selection technique: X_train : {X_train.shape}'
    if X_test is not None:
        before_msg += f', X_test: {X_test.shape}'
    print(before_msg)
    
    # linear models benefit from feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.fillna(0))
    
    # Train a Linear regression and select features with higher coefficients
    sel_ = SelectFromModel(LinearRegression())
    sel_.fit(X_train_scaled, y_train)
    
    # Create variable for selected features
    selected_feat = X_train.columns[(sel_.get_support())]
    
    # Compare the amount selected features with the amount of features 
    # which coefficient is above the mean coefficient
    abs_coefs = np.abs(sel_.estimator_.coef_)
    print('Total features: {}'.format((X_train.shape[1])))
    print('Selected features: {}'.format(len(selected_feat)))
    print('Features with coefficients greater than the mean coefficient: {}'.format(
        np.sum(abs_coefs > abs_coefs.mean())))
    
    # Plot the histogram of absolute coefficients.
    # (The original assigned this to `select_plot` but returned `selected_plot`.)
    selected_plot = pd.Series(abs_coefs.ravel()).hist(bins=binsize)
    
    after_msg = f'After applying linear coefficients feature selection technique: X_train : {X_train[selected_feat].shape}'
    if X_test is not None:
        after_msg += f', X_test: {X_test[selected_feat].shape}'
    print(after_msg)

    return selected_feat, selected_plot

**Logistic Regression Coefficients**

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [56]:
def linear_coefficients_classif(X_train, y_train, size=0.3, X_test=None):
    """
    Specify the Logistic Regression model, and select the Ridge Penalty (l2). 
    Evaluate the coefficient magnitude itself and not whether lasso shrinks coefficients to zero.
    C is set to 1000 so that the coefficients are practically not affected by the penalty of the regularisation.
    Use the selectFromModel object from sklearn (default threshold) to automatically select the features.
    The model is fitted on standardised training data with missing values filled with 0.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set   
            size (float): Not used by this function; kept so that existing calls keep working.
            X_test (DataFrame): X_test set, used only to report its shape. Optional: when omitted, 
                                a module-level variable named X_test is used if one exists; 
                                if none exists the test shape is simply not printed.
        Returns:
            select_plot (plot): Histogram plot of the absolute coefficients of all features 
            selected_feat (Index): Column labels of the features which are selected after applying 
                                   logistic regression coefficients feature selection technique. 
            Note the order: the plot comes first, then the selected features.
    """
    # Backward compatibility: the original body read X_test from the notebook globals
    if X_test is None:
        X_test = globals().get('X_test')

    before_msg = f'Before applying logistic regression coefficients feature selection technique: X_train : {X_train.shape}'
    if X_test is not None:
        before_msg += f', X_test: {X_test.shape}'
    print(before_msg)
    
    # linear models benefit from feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.fillna(0))
    
    # Do the model fitting and feature selection
    # Set C to 1000, to avoid regularisation. 
    # Evaluate the coefficient magnitude itself and not whether lasso shrinks coefficients to zero
    sel_ = SelectFromModel(LogisticRegression(C=1000, penalty='l2')) 
    sel_.fit(X_train_scaled, y_train)
    
    # Create variable for selected features
    selected_feat = X_train.columns[(sel_.get_support())]
    
    # Compare the amount selected features with the amount of features 
    # which coefficient is above the mean coefficient
    abs_coefs = np.abs(sel_.estimator_.coef_)
    print('Total features: {}'.format((X_train.shape[1])))
    print('Selected features: {}'.format(len(selected_feat)))
    print('Features with coefficients greater than the mean coefficient: {}'.format(
        np.sum(abs_coefs > abs_coefs.mean())))
    
    # Plot the histogram of absolute coefficients
    select_plot = pd.Series(abs_coefs.ravel()).hist()
    
    # Return order (plot, features) is kept as in the original for existing callers
    return select_plot, selected_feat

## 3.c Tree Derived Variable Importance

Decision trees 
- Most popular machine learning algorithms 
- Highly accurate 
- Good generalization (low overfitting) 
- Interpretability

### 3.c.1 Random Forest Importance

- Random Forests consist of several hundreds of individual decision trees 
- The impurity decrease for each feature is averaged across trees

Limitations:
    
- Correlated features show equal or similar importance 
- Correlated features importance is lower than the real importance, determined when tree is built in absence of correlated counterparts 
- Highly cardinal variables show greater importance (trees are biased to this type of variables)

Procedure:
    
Build a random forest 
- Determine feature importance 
- Select the features with highest importance 
- There is a scikit-learn implementation for this

Recursive feature elimination 
- Build random forests 
- Calculate feature importance 
- Remove least important feature 
- Repeat till a condition is met

If the feature removed is correlated to another feature in the dataset, then by removing it the true importance of the other feature will be revealed: its importance will increase.

Random forests are one of the most popular machine learning algorithms. They are so successful because they provide in general a good predictive performance, low overfitting and easy interpretability. This interpretability is given by the fact that it is straightforward to derive the importance of each variable on the tree decision. In other words, it is easy to compute how much each variable is contributing to the decision.

Random forests consist of 4-12 hundred decision trees, each of them built over a random extraction of the observations from the dataset and a random extraction of the features. Not every tree sees all the features or all the observations, and this guarantees that the trees are de-correlated and therefore less prone to over-fitting. Each tree is also a sequence of yes-no questions based on a single feature or a combination of features. At each node (that is, at each question), the tree divides the dataset into 2 buckets, each of them hosting observations that are more similar among themselves and different from the ones in the other bucket. Therefore, the importance of each feature is derived from how "pure" each of the buckets is.

For classification, the measure of impurity is either the Gini impurity or the information gain/entropy. For regression the measure of impurity is variance. Therefore, when training a tree, it is possible to compute how much each feature decreases the impurity. The more a feature decreases the impurity, the more important the feature is. In random forests, the impurity decrease from each feature can be averaged across trees to determine the final importance of the variable.

To give you a better intuition, features that are selected at the top of the trees are in general more important than features that are selected at the end nodes of the trees, as generally the top splits lead to bigger information gains.

Note
- Random Forests and decision trees in general give preference to features with high cardinality
- Correlated features will be given equal or similar importance, but overall reduced importance compared to the same tree built without correlated counterparts.

Where we put the cut-off to select features is a bit arbitrary. One way is to select the top 10, 20 features. Alternatively, the top 10th percentile. For this, you can use mutual info in combination with SelectKBest or SelectPercentile from sklearn. 
Selecting features by using tree derived feature importance is a very straightforward, fast and generally accurate way of selecting good features for machine learning. In particular, if you are going to build tree methods.

However, as I said, correlated features will show in a tree similar and lowered importance, compared to what their importance would be if the tree was built without correlated counterparts. In situations like this, it is better to select features recursively, rather than altogether.

#### 3.c.1.a Embedded Random Forest Feature Importance

In [57]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

**Classification**

In [None]:
def randomforest_select(X_train, y_train, num_trees=100, X_test=None):
    """
    Select features for a classification dataset from random forest importances.
    A RandomForestClassifier with num_trees trees is wrapped in sklearn's SelectFromModel 
    (default threshold) and fitted on X_train with missing values filled with 0.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set   
            num_trees (int): The number of trees in the forest. The default value is 100.
            X_test (DataFrame): X_test set, used only to report its shape. Optional: when omitted, 
                                a module-level variable named X_test is used if one exists; 
                                if none exists the test shape is simply not printed.
        Returns:
            selected_feat (Index): Column labels of the features which are selected after applying 
                                   random forest feature selection technique.
            importance_plot (plot): Histogram plot of the feature importances.      
    """
    # Backward compatibility: the original body read X_test from the notebook globals
    if X_test is None:
        X_test = globals().get('X_test')

    # The original f-string was split across two physical lines, which is a SyntaxError
    shapes_msg = f'X_train : {X_train.shape}'
    if X_test is not None:
        shapes_msg += f', X_test: {X_test.shape}'
    print('Before applying random forest feature selection technique:\n' + shapes_msg)
    
    # Do the model fitting and feature selection
    sel_ = SelectFromModel(RandomForestClassifier(n_estimators=num_trees))
    sel_.fit(X_train.fillna(0), y_train)
    
    # Create variable for selected features
    selected_feat = X_train.columns[(sel_.get_support())]
    
    # Compare the amount of selected features with the amount of features 
    # which importance is above the mean importance
    importances = sel_.estimator_.feature_importances_
    print('Total features: {}'.format((X_train.shape[1])))
    print('Selected features: {}'.format(len(selected_feat)))
    print('Features with coefficients greater than the mean coefficient: {}'.format(
        np.sum(importances > importances.mean())))
    
    # Plot the distribution of importance
    importance_plot = pd.Series(importances.ravel()).hist()
    
    return selected_feat, importance_plot

**Regression**

In [None]:
def randomforest_select(X_train, y_train, num_trees=100, X_test=None):
    """
    Select features for a regression dataset from random forest importances.
    A RandomForestRegressor with num_trees trees is wrapped in sklearn's SelectFromModel 
    (default threshold) and fitted on X_train with missing values filled with 0.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set   
            num_trees (int): The number of trees in the forest. The default value is 100.
            X_test (DataFrame): X_test set, used only to report its shape. Optional: when omitted, 
                                a module-level variable named X_test is used if one exists; 
                                if none exists the test shape is simply not printed.
        Returns:
            selected_feat (Index): Column labels of the features which are selected after applying 
                                   random forest feature selection technique.
            importance_plot (plot): Histogram plot of the feature importances.      
    """
    # Backward compatibility: the original body read X_test from the notebook globals
    if X_test is None:
        X_test = globals().get('X_test')

    # The original f-string was split across two physical lines, which is a SyntaxError
    shapes_msg = f'X_train : {X_train.shape}'
    if X_test is not None:
        shapes_msg += f', X_test: {X_test.shape}'
    print('Before applying random forest feature selection technique:\n' + shapes_msg)
    
    # Do the model fitting and feature selection
    sel_ = SelectFromModel(RandomForestRegressor(n_estimators=num_trees))
    sel_.fit(X_train.fillna(0), y_train)
    
    # Create variable for selected features
    selected_feat = X_train.columns[(sel_.get_support())]
    
    # Compare the amount of selected features with the amount of features 
    # which importance is above the mean importance
    importances = sel_.estimator_.feature_importances_
    print('Total features: {}'.format((X_train.shape[1])))
    print('Selected features: {}'.format(len(selected_feat)))
    print('Features with coefficients greater than the mean coefficient: {}'.format(
        np.sum(importances > importances.mean())))
    
    # Plot the distribution of importance
    importance_plot = pd.Series(importances.ravel()).hist()
    
    return selected_feat, importance_plot

Selecting features by using tree derived feature importance is a very straightforward, fast and generally accurate way of selecting good features for machine learning. In particular, if you are going to build tree methods.

However, as I said, correlated features will show in a tree similar and lowered importance, compared to what their importance would be if the tree was built without correlated counterparts.

In situations like this, it is better to select features recursively, rather than altogether.

#### 3.c.1.b Recursive Feature Selection using Random Forest Importance

Random Forests assign equal or similar importance to features that are highly correlated. In addition, when features are correlated, the importance assigned is lower than the importance attributed to the feature itself, should the tree be built without the correlated counterparts.

Therefore, instead of eliminating features based on importance by brute force like we did in the previous lecture, we may get a better selection by removing one feature at a time, and recalculating the importance on each round.

This method is a hybrid between embedded and wrapper methods: it is based on computations derived when fitting the model, but it also requires fitting several models.

The cycle is as follows:

- Build random forests using all features
- Remove least important feature
- Build random forests and recalculate importance
- Repeat until a criterion is met

In this situation, when a feature that is highly correlated to another one is removed, then, the importance of the remaining feature increases. This may lead to a better subset feature space selection. On the downside, building several random forests is quite time consuming, in particular if the dataset contains a high number of features.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score

In [None]:
def randomforest_recursive(X_train, y_train, num_trees=100, n=10):
    """
    Select features recursively based on random forest derived importance.
    A RandomForestClassifier is wrapped in the RFE object from sklearn, which refits the forest
    and discards the least important features until only `n` features are left.
    Missing values in X_train are replaced by 0 before fitting.
        Parameters:
            X_train (DataFrame): X_train set
            y_train (DataFrame): y_train set
            num_trees (int): The number of trees in the forest. The default value is 100.
            n (int): Number of features to select. The default value is 10.
        Returns:
            selected_feat (Index): Column labels of X_train which are kept after applying
                                   recursive feature elimination with random forests.
    """
    # Only X_train is reported here: X_test is not an argument of this function.
    # The message is built from two adjacent literals so that it is valid Python
    # (a single-quoted f-string cannot span two physical lines).
    print('Before applying random forest recursive feature selection technique:\n'
          f'X_train : {X_train.shape}')

    # Do the model fitting and feature selection
    sel_ = RFE(RandomForestClassifier(n_estimators=num_trees), n_features_to_select=n)
    sel_.fit(X_train.fillna(0), y_train)

    # get_support() is a boolean mask over the columns, True for the retained features
    selected_feat = X_train.columns[sel_.get_support()]

    return selected_feat

Selecting features recursively may not add any advantage to the random forest algorithm. And yet it took a massive amount of time to run. Keep this in mind when choosing which method you are going to use.

In my experience, RFE from sklearn does not bring a massive advantage with respect to the SelectFromModel method, and personally I tend to use the latter to select my features.

### 3.c.2 Gradient Boosted Trees Importance

Feature importance calculated in the same way:

- Biased to highly cardinal features 
- Importance is susceptible to correlated features 
- Interpretability of feature importance is not so straightforward: 
- Later trees fit to the errors of the first trees, therefore feature importance is not necessarily proportional to the influence of the feature on the outcome, but rather to the mistakes of the previous trees. 
- Averaging across trees may not add much information on true relation between feature and target

Similarly to selecting features using Random Forests derived feature importance, you can select features based on the importance derived by gradient boosted trees. And you can do that in one go, or in a recursive manner, depending on how much time you have, how many features are in the dataset, and whether they are correlated or not.

Same as with the random forest derived importance feature selection, the recursive procedure did not add any advantage over the altogether selection. And it took a substantial amount of time to compute.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.metrics import roc_auc_score

#### 3.c.2.a Gradient Boosted Trees Feature Importance

In [None]:
def gradientboost_select(X_train, X_test, y_train):
    """
    Select features based on the importance derived by gradient boosted trees.
    All features are selected in one go with the SelectFromModel object from sklearn, 
    fitted on X_train with missing values replaced by 0.
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set. It is only used to report its shape.
            y_train (DataFrame): y_train set 
        Returns:
            selected_feat (Index): Column labels of X_train which are selected after applying 
                                   gradient boosted feature importance technique.
    """
    # The message is built from two adjacent literals so that it is valid Python
    # (a single-quoted f-string cannot span two physical lines).
    print('Data shape before applying gradient boosted trees feature selection technique:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Select features all together in one go by contemplating their importance
    # after fitting a single gradient boosting model
    sel_ = SelectFromModel(GradientBoostingClassifier())
    sel_.fit(X_train.fillna(0), y_train)

    # get_support() is a boolean mask over the columns, True for the selected features
    selected_feat = X_train.columns[sel_.get_support()]

    return selected_feat

#### 3.c.2.b Recursive Feature Selection using Gradient Boosted Trees Feature Importance

In [None]:
def gradientboost_select(X_train, X_test, y_train):
    """
    Select features recursively based on the importance derived by gradient boosted trees.
    A one-shot SelectFromModel selection is fitted first, only to decide how many features
    the recursive selection (RFE) should keep, so that both approaches can be compared on
    the same number of features. Missing values in X_train are replaced by 0 before fitting.
    Note: this function has the same name as the one-shot gradient boosted selector defined
          in section 3.c.2.a, so running this cell replaces that definition.
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set. It is only used to report its shape.
            y_train (DataFrame): y_train set 
        Returns:
            selected_feat_rfe (Index): Column labels of X_train which are selected after applying 
                                       gradient boosted feature importance technique recursively.
    """
    # The message is built from two adjacent literals so that it is valid Python
    # (a single-quoted f-string cannot span two physical lines).
    print('Data shape before applying gradient boosted trees recursive feature selection technique:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Select features all together in one go by contemplating their importance
    # after fitting a single gradient boosting model
    sel_ = SelectFromModel(GradientBoostingClassifier())
    sel_.fit(X_train.fillna(0), y_train)

    # The one-shot selection only provides the number of features to keep
    selected_feat = X_train.columns[sel_.get_support()]

    # Select the same number of features recursively for comparison
    sel_ = RFE(GradientBoostingClassifier(), n_features_to_select=len(selected_feat))
    sel_.fit(X_train.fillna(0), y_train)

    # Create variable for selected features after recursive selection
    selected_feat_rfe = X_train.columns[sel_.get_support()]

    return selected_feat_rfe

**Run Gradient Boosting model and compare train and test set performance**

In [None]:
def run_gradientboosting(X_train, X_test, y_train, y_test):
    """
    Build gradient boosted trees and compare the roc-auc in train and test set.
    The model is fitted on the train set with 200 estimators of maximum depth 4 and 
    a fixed random state, then scored on both sets with the probability of the 
    second class (column 1 of predict_proba).
        Parameters:
            X_train (array): X_train after the dataset is splitted as train/test set and removed 
                             redundant features via applying gradient boosted feature 
                             importance technique.
            X_test (array):  X_test after the dataset is splitted as train/test set and removed 
                             redundant features via applying gradient boosted feature 
                             importance technique.
            y_train (array): y_train after the dataset is splitted as train and test set.
            y_test (array):  y_test after the dataset is splitted as train and test set. 
        Returns:
            none --> print train and test set model performance
    """
    gbc = GradientBoostingClassifier(n_estimators=200, random_state=0, max_depth=4)
    gbc.fit(X_train, y_train)

    # Score the train set first and the test set second; the label names the model
    # that is actually fitted here (gradient boosted trees, not random forests)
    for set_name, X_set, y_set in (('Train set', X_train, y_train),
                                   ('Test set', X_test, y_test)):
        print(set_name)
        pred = gbc.predict_proba(X_set)
        print('Gradient Boosted Trees roc-auc: {}'.format(
            roc_auc_score(y_set, pred[:, 1])))

In [None]:
# Evaluate gradient boosted trees on the features selected recursively.
# The recursive selection does not add an advantage over the altogether selection
# and takes a substantial amount of time to compute.
# Missing values are replaced by 0 in both sets before the model is built.
# NOTE(review): assumes selected_feat_rfe was assigned at notebook level beforehand
# (presumably from the recursive gradient boosted selection) -- TODO confirm.
run_gradientboosting(X_train[selected_feat_rfe].fillna(0),
                     X_test[selected_feat_rfe].fillna(0),
                     y_train, y_test)

In [None]:
# Evaluate gradient boosted trees on the features selected altogether (in one go).
# Missing values are replaced by 0 in both sets before the model is built.
# NOTE(review): assumes selected_feat was assigned at notebook level beforehand
# (presumably from the one-shot gradient boosted selection) -- TODO confirm.
run_gradientboosting(X_train[selected_feat].fillna(0),
                     X_test[selected_feat].fillna(0),
                     y_train, y_test)

Same as with the random forest derived importance feature selection, the recursive procedure may not add any advantage over the altogether selection. And it took a substantial amount of time to compute.

## 4. Hybrid Methods (Combination of Wrapper and Embedded Methods)

### 4.a Shuffling Features

A popular method of feature selection consists of randomly shuffling the values of a specific variable and determining how that permutation affects the performance metric of the machine learning algorithm. In other words, the idea is to permute the values of each feature, one at a time, and measure how much the permutation decreases the accuracy, or the roc_auc, or the mse of the machine learning model. If the variables are important, that is, highly predictive, a random permutation of their values will dramatically decrease any of these metrics. Conversely, non-important / non-predictive variables should have little to no effect on the model performance metric we are assessing.

**Note:**

A Random Forest model is used here, however this method is useful for any machine learning algorithm. In fact, the importance of the features is determined specifically for the algorithm used. Therefore, different algorithms may return different subsets of important features.

**Classification**

In [None]:
# For classification 
def shuffling_features_classif(X_train, X_test, y_train, y_test):
    """
    Permute the values of each feature, one at a time, and measure how much the permutation 
    decreases the roc-auc of a random forest classifier on the train set.
    If the variables are important, that is, highly predictive, a random permutation of their values 
    will decrease the metric dramatically. Contrarily, non-important/non-predictive variables 
    should have little to no effect on the model performance metric we are assessing.
    Missing values are replaced by 0 before fitting and predicting. The shuffling itself is not 
    seeded, so the drops can vary slightly between runs.
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set
            y_train (DataFrame): y_train set 
            y_test (DataFrame): y_test set
        Returns:
            feature_importance (DataFrame): A dataframe consist of all 'feature' and 'auc_drop' columns,
                                            sorted by 'auc_drop' in descending order.
                                            Those features, which auc_drop is greater that 0 (feature_importance.auc_drop>0),
                                            cause a drop in the performance of the model when their values were permuted. 
                                            This means that we could select those features and discard the rest, 
                                            and should keep the original model performance.
                                            Note: if you want to have the list of important features, after runing this function,
                                                  feature_importance[feature_importance.auc_drop>0]['feature'] script will 
                                                  return it. 
    """
    # The message is built from two adjacent literals so that it is valid Python
    # (a single-quoted f-string cannot span two physical lines).
    print('Data shape before applying feature shuffling technique:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')

    # The first step to determine feature importance by feature shuffling
    # is to build the machine learning model for which we want to select features
    # In this case, Random Forests model is built, but remember that
    # you can use this procedure for any other machine learning algorithm
    # Few and shallow trees are built to avoid overfitting
    model = RandomForestClassifier(n_estimators=50, max_depth=2, random_state=2909, n_jobs=4)

    # Impute once and reuse: filling before or after shuffling a column gives the same values
    X_train_filled = X_train.fillna(0)

    # Train the model
    model.fit(X_train_filled, y_train)

    # Overall roc-auc using all the features; the train value is also the
    # reference against which every shuffled feature is compared
    train_auc = roc_auc_score(y_train, model.predict_proba(X_train_filled)[:, 1])
    test_auc = roc_auc_score(y_test, model.predict_proba(X_test.fillna(0))[:, 1])
    print('train auc score: ', train_auc)
    print('test auc score: ', test_auc)

    # Dictionary to capture the features and the drop in auc that they cause when shuffled
    feature_dict = {}

    # Shuffle one by one, each feature of the dataset and then use the dataset with
    # the shuffled variable to make predictions using the model which is trained above
    for feature in X_train.columns:
        X_train_c = X_train_filled.copy()

        # Shuffle individual feature. The shuffled values are assigned as a plain array:
        # assigning a Series would re-align it on the row labels, which only yields a
        # shuffle when X_train carries a default 0..n-1 index (not the case after a split)
        X_train_c[feature] = X_train_c[feature].sample(frac=1).to_numpy()

        # Make prediction with shuffled feature and calculate roc-auc
        shuff_auc = roc_auc_score(y_train, model.predict_proba(X_train_c)[:, 1])

        # Save the drop in roc-auc
        feature_dict[feature] = train_auc - shuff_auc

    # Transform the dictionary into a pandas dataframe for easy manipulation
    feature_importance = pd.Series(feature_dict).reset_index()
    feature_importance.columns = ['feature', 'auc_drop']

    # Sort the dataframe according to the drop in performance caused by feature shuffling
    feature_importance = feature_importance.sort_values(by=['auc_drop'], ascending=False)

    print('Number of features that cause a drop in performance when shuffled: ',
          feature_importance[feature_importance.auc_drop > 0].shape[0])

    return feature_importance

**Regression**

In [None]:
# For regression
def shuffling_features_regress(X_train, X_test, y_train, y_test):
    """
    Permute the values of each feature, one at a time, and measure how much the permutation 
    worsens the rmse of a random forest regressor on the train set.
    If the variables are important, that is, highly predictive, a random permutation of their values 
    will increase the rmse dramatically. Contrarily, non-important/non-predictive variables 
    should have little to no effect on the model performance metric we are assessing.
    Missing values are replaced by 0 before fitting and predicting. The shuffling itself is not 
    seeded, so the drops can vary slightly between runs.
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set
            y_train (DataFrame): y_train set 
            y_test (DataFrame): y_test set
        Returns:
            feature_importance (DataFrame): A dataframe consist of all 'feature' and 'rmse_drop' columns,
                                            sorted by 'rmse_drop' in descending order.
                                            rmse_drop is the rmse with the shuffled feature minus the rmse with 
                                            all the original features, so it is positive when shuffling makes 
                                            the model worse.
                                            Those features, which rmse_drop is greater that 0 (feature_importance.rmse_drop>0),
                                            cause a drop in the performance of the model when their values were permuted. 
                                            This means that we could select those features and discard the rest, 
                                            and should keep the original model performance.
                                            Note: if you want to have the list of important features, after runing this function,
                                                  feature_importance[feature_importance.rmse_drop>0]['feature'] script will 
                                                  return it. 
    """
    # Imported here so that the function does not depend on an earlier cell
    # having brought these two names into scope
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error

    # The message is built from two adjacent literals so that it is valid Python
    # (a single-quoted f-string cannot span two physical lines).
    print('Data shape before applying feature shuffling technique:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')

    # The first step to determine feature importance by feature shuffling
    # is to build the machine learning model for which we want to select features
    # In this case, Random Forests model is built, but remember that
    # you can use this procedure for any other machine learning algorithm
    # Few and shallow trees are built to avoid overfitting
    model = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=2909, n_jobs=4)

    # Impute once and reuse: filling before or after shuffling a column gives the same values
    X_train_filled = X_train.fillna(0)

    # Train the model
    model.fit(X_train_filled, y_train)

    # Overall rmse using all the features; the train value is also the
    # reference against which every shuffled feature is compared
    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train_filled)))
    test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test.fillna(0))))
    print('train rmse: ', train_rmse)
    print('test rmse: ', test_rmse)

    # Dictionary to capture the features and the drop in performance that they cause when shuffled
    feature_dict = {}

    # Shuffle one by one, each feature of the dataset and then use the dataset with
    # the shuffled variable to make predictions using the model which is trained above
    for feature in X_train.columns:
        X_train_c = X_train_filled.copy()

        # Shuffle individual feature. The shuffled values are assigned as a plain array:
        # assigning a Series would re-align it on the row labels, which only yields a
        # shuffle when X_train carries a default 0..n-1 index (not the case after a split)
        X_train_c[feature] = X_train_c[feature].sample(frac=1).to_numpy()

        # Make prediction with shuffled feature and calculate rmse
        shuff_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train_c)))

        # rmse is an error: a worse model has a HIGHER rmse, so the drop in performance
        # is the shuffled rmse minus the reference rmse (positive for important features)
        feature_dict[feature] = shuff_rmse - train_rmse

    # Transform the dictionary into a pandas dataframe for easy manipulation
    feature_importance = pd.Series(feature_dict).reset_index()
    feature_importance.columns = ['feature', 'rmse_drop']

    # Sort the dataframe according to the drop in performance caused by feature shuffling
    feature_importance = feature_importance.sort_values(by=['rmse_drop'], ascending=False)

    print('Number of features that cause a drop in performance when shuffled: ',
          feature_importance[feature_importance.rmse_drop > 0].shape[0])

    return feature_importance

**Visualize Important Features Distribution**

In [None]:
def plotfeatureimp(feature_importance, drop_col=None):
    """
    Plot the distribution of importances for those features that are actually important,
    i.e. those whose performance drop is greater than 0.
        Parameters:
            feature_importance (DataFrame): DataFrame which is the product of a shuffling_features 
                                            function. It has a 'feature' column and a column with 
                                            the drop in performance ('auc_drop' or 'rmse_drop').
            drop_col (str): Name of the column holding the drop in performance. The default value 
                            is None, in which case 'auc_drop' is used when present and 
                            'rmse_drop' otherwise.
        Returns:
            plot (bar plot): Bar plot showing distribution of feature importance.
        Raises:
            KeyError: If the column with the drop in performance is not in feature_importance.
    """
    # Work out which column carries the drop, so that both the classification
    # ('auc_drop') and the regression ('rmse_drop') outputs can be plotted
    if drop_col is None:
        known = [col for col in ('auc_drop', 'rmse_drop') if col in feature_importance.columns]
        if not known:
            raise KeyError("feature_importance has neither an 'auc_drop' nor a 'rmse_drop' column")
        drop_col = known[0]
    elif drop_col not in feature_importance.columns:
        raise KeyError(f"feature_importance has no '{drop_col}' column")

    # Capture the important features in a series for visualisation:
    # the drop as values, the feature names as index
    important = feature_importance[feature_importance[drop_col] > 0]
    temp = important.set_index('feature')[drop_col]

    plot = temp.plot.bar(figsize=(15, 6))

    return plot

## 4.b Hybrid Method: Recursive Feature Elimination

This method consists of the following steps:

1) Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.

2) Remove one feature -the least important- and build a machine learning algorithm utilising the remaining features.

3) Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.

4) If the metric decreases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.

5) Repeat steps 2-4 until all features have been removed (and therefore evaluated) and the drop in performance assessed.


This is called a hybrid method because:

- it combines the importance derived from the machine learning algorithm like embedded methods,
- and it removes as well one feature at a time, and calculates a new metric based on the new subset of features and the machine learning algorithm of choice, like wrapper methods.

The difference between this method and the step backward feature selection lies in that it does not remove all features first in order to determine which one to remove. It removes the least important one, based on the importance derived from the machine learning model. And then, it makes an assessment as to whether that feature should be removed or not. So it removes each feature only once during selection, whereas step backward feature selection removes all the features at each step of selection.

This method is therefore faster than wrapper methods and generally better than embedded methods. In practice it works extremely well. It does also account for correlations (depending on how stringent you set the arbitrary performance drop threshold). On the downside, the drop in performance assessed to decide whether the feature should be kept or removed, is set arbitrarily. The smaller the drop the more features will be selected, and vice versa.

**Note** For the demonstration, an XGBoost model is built here, however this method is useful for any machine learning algorithm. In fact, the importance of the features is determined specifically for the algorithm used. Therefore, different algorithms may return different subsets of important features.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

import xgboost as xgb

from sklearn.metrics import roc_auc_score, r2_score

**Classification**

In [None]:
# For classification 1/2 - After this function, you should run following function 
def hybrid_recursive_eliminate_1(X_train, X_test, y_train, y_test):
    """
    Rank the features according to their importance derived from a machine learning algorithm.
    Here an xgboost classifier is fitted on all the features of X_train, its roc-auc in the 
    test set is printed, and its feature importances are used for the ranking.
    Note: the numpy global random seed is set inside this function.
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set
            y_train (DataFrame): y_train set 
            y_test (DataFrame): y_test set
        Returns:
            features (Series): Importance of each feature, indexed by the column names of X_train 
                               and ranked from the least to the most important.
            plot (Plot): Plot the importance of each feature, which is ranked from the 
                         least to the most important in bar graph. 
    """
    # The message is built from two adjacent literals so that it is valid Python
    # (a single-quoted f-string cannot span two physical lines).
    print('Data shape before applying hybrid recursive feature elimination technique:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')

    # The first step of this procedure consists in building a machine learning algorithm using
    # all the available features and then determine the importance of the features according
    # to the algorithm.

    # Set the seed for reproducibility
    seed_val = 1000000000
    np.random.seed(seed_val)

    # Build initial model using all the features
    model_all_features = xgb.XGBClassifier(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
    model_all_features.fit(X_train, y_train)

    # Calculate the roc-auc in the test set (it is reported only, not returned)
    y_pred_test = model_all_features.predict_proba(X_test)[:, 1]
    auc_score_all = roc_auc_score(y_test, y_pred_test)
    print('Test all features xgb ROC AUC=%f' % (auc_score_all))

    # Derive the importance of each feature and rank them from the least to the most important
    features = pd.Series(model_all_features.feature_importances_, index=X_train.columns)
    features = features.sort_values(ascending=True)

    # Plot feature importance
    plot = features.plot.bar(figsize=(20, 6))

    return features, plot

In [None]:
# For classification continue - 2/2
def hybrid_recursive_eliminate_2(features, threshold=0.0005, baseline_auc=None):
    """
    1) Remove one feature -the least important- and build a machine learning algorithm utilising 
       the remaining features.
    2) Calculate the performance metric, here the roc-auc in the test set.
    3) If the metric decreases by more than an arbitrarily set threshold, then that feature is 
       important and should be kept. Otherwise, we can remove that feature.
    4) Repeat steps 1-3 until all features have been removed (and therefore evaluated) and 
       the drop in performance assessed.
    Note: the data is read from the notebook level variables X_train, X_test, y_train and y_test,
          which therefore have to exist before calling this function.
            Parameters:
                features (series): Features ranked according to their importance derived from a 
                                   machine learning algorithm. It is output of previous function.
                threshold (float): Threshold for the drop in roc-auc. The default value is 0.0005.
                baseline_auc (float): Test roc-auc of the model built with all the features. 
                                      The default value is None, in which case that model is 
                                      built here to obtain it.
            Returns:
                features_to_keep (list): Important features derived from hybrid recursive feature selection. 
    """
    # Convert ordered features into list
    features_list = list(features.index)

    def _test_auc(excluded):
        # Build an xgboost without the excluded columns and return its roc-auc in the test set
        model_int = xgb.XGBClassifier(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
        model_int.fit(X_train.drop(excluded, axis=1), y_train)
        y_pred_test = model_int.predict_proba(X_test.drop(excluded, axis=1))[:, 1]
        return roc_auc_score(y_test, y_pred_test)

    # Reference roc-auc with all the features. It has to be a local value of this function
    # because it is updated every time a feature is removed
    auc_score_all = _test_auc([]) if baseline_auc is None else baseline_auc

    # Arbitrarily set the drop in roc-auc
    # if the drop is below this threshold, the feature will be removed
    tol = threshold

    print('Doing recursive feature elimination')

    # Initialise a list where collecting the features we should remove
    features_to_remove = []

    total_columns = X_train.shape[1]

    # Loop over all the features, from the least to the most important;
    # count tells how far ahead the loop is going
    for count, feature in enumerate(features_list, start=1):
        print()
        print('Testing feature: ', feature, ' which is feature ', count,
              ' out of ', len(features))

        # All variables minus the removed features and the feature to be evaluated
        excluded = features_to_remove + [feature]

        # If excluding this feature leaves no column at all, no model can be built,
        # so the feature cannot be evaluated and is kept
        if len(excluded) >= total_columns:
            print('keep: ', feature)
            print()
            continue

        # Calculate the new roc-auc
        auc_score_int = _test_auc(excluded)
        print('New Test ROC AUC={}'.format((auc_score_int)))

        # Print the reference roc-auc
        print('All features Test ROC AUC={}'.format((auc_score_all)))

        # Determine the drop in the roc-auc
        diff_auc = auc_score_all - auc_score_int
        print('Drop in ROC AUC={}'.format(diff_auc))

        # Compare the drop in roc-auc with the tolerance which was set previously
        if diff_auc >= tol:
            print('keep: ', feature)
            print()
        else:
            print('remove: ', feature)
            print()
            # if the drop in the roc is small and remove the feature,
            # need to set the new roc to the one based on the remaining features
            auc_score_all = auc_score_int

            # Append the feature to remove to the collecting list
            features_to_remove.append(feature)

    # Loop is finished, and evaluated all the features
    print('DONE!!')
    print('total features to remove: ', len(features_to_remove))

    # Determine the features to keep (those we won't remove)
    removed = set(features_to_remove)
    features_to_keep = [x for x in features_list if x not in removed]
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep

We may not be able to get this right from the beginning though, as we did here. This method of feature selection does require that you try a few different tolerances / thresholds until you find the right number of features.

**Regression**

In [None]:
# For regression 1/2 - After this function, you should run following function 
def hybrid_recursive_eliminate_1(X_train, X_test, y_train, y_test):
    """
    Rank the features according to their importance derived from a machine learning algorithm.
    Here an xgboost regressor is fitted on all the features of X_train, its r2 in the 
    test set is printed, and its feature importances are used for the ranking.
    Note: the numpy global random seed is set inside this function.
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set
            y_train (DataFrame): y_train set 
            y_test (DataFrame): y_test set
        Returns:
            features (Series): Importance of each feature, indexed by the column names of X_train 
                               and ranked from the least to the most important.
            plot (Plot): Plot the importance of each feature, which is ranked from the 
                         least to the most important in bar graph. 
    """
    # The message is built from two adjacent literals so that it is valid Python
    # (a single-quoted f-string cannot span two physical lines).
    print('Data shape before applying hybrid recursive feature elimination technique:\n'
          f'X_train : {X_train.shape}, X_test: {X_test.shape}')

    # The first step of this procedure consists in building a machine learning algorithm using
    # all the available features and then determine the importance of the features according
    # to the algorithm.

    # Set the seed for reproducibility
    seed_val = 1000000000
    np.random.seed(seed_val)

    # Build initial model using all the features
    model_all_features = xgb.XGBRegressor(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
    model_all_features.fit(X_train, y_train)

    # Calculate the r2 in the test set (it is reported only, not returned)
    y_pred_test = model_all_features.predict(X_test)
    r2_score_all = r2_score(y_test, y_pred_test)
    print('Test all features xgb R2 = %f' % (r2_score_all))

    # Derive the importance of each feature and rank them from the least to the most important
    features = pd.Series(model_all_features.feature_importances_, index=X_train.columns)
    features = features.sort_values(ascending=True)

    # Plot feature importance
    plot = features.plot.bar(figsize=(20, 6))

    return features, plot

In [None]:
# For regression continue - 2/2
def hybrid_recursive_eliminate_2(features, threshold=0.001):
    """
    Hybrid recursive feature elimination for regression (step 2/2).

    1) Remove one feature -the least important- and build a machine learning algorithm utilising 
       the remaining features.
    2) Calculate the performance metric (r2 on the test set).
    3) If the metric decreases by more than an arbitrarily set threshold, then that feature is 
       important and should be kept. Otherwise, we can remove that feature.
    4) Repeat steps 1-3 until all features have been evaluated and the drop in performance assessed.

    The data is not passed in: X_train, X_test, y_train and y_test are read from the enclosing
    (notebook) scope, so data_split must have been run beforehand.
            Parameters:
                features (Series): Features ranked from the least to the most important. The index 
                                   holds the feature names, the values hold the importances. 
                                   It is output of previous function.
                threshold (float): Threshold for the drop in r2. The default value is 0.001.
            Returns:
                features_to_keep (list): Names of the important features derived from hybrid 
                                         recursive feature selection, in ranking order.
    """
    # Feature NAMES in ranking order. Iterating the Series itself would yield the
    # importance values instead of the column names, so the index is used everywhere below.
    features_list = list(features.index)

    # Arbitrarily set the drop in r2
    # if the drop is below this threshold, the feature will be removed
    tol = threshold

    # Baseline r2 with all the features. It is computed here because the value calculated
    # in the previous function is local to that function and is not returned.
    model_all_features = xgb.XGBRegressor(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
    model_all_features.fit(X_train, y_train)
    r2_score_all = r2_score(y_test, model_all_features.predict(X_test))

    print('Doing recursive feature elimination')

    # Initialise a list where we will collect the features we should remove
    features_to_remove = []

    total_features = len(features_list)

    # Loop over all the features, from the least to the most important
    for count, feature in enumerate(features_list, start=1):
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', total_features)

        # All variables minus the removed features and the feature to be evaluated
        columns_to_drop = features_to_remove + [feature]
        X_train_subset = X_train.drop(columns_to_drop, axis=1)

        # If every other feature has already been removed, dropping this one would leave
        # nothing to train on: keep it instead of fitting a model on zero columns.
        if X_train_subset.shape[1] == 0:
            print('keep: ', feature)
            continue

        # Initialise and fit the model on the reduced feature set
        model_int = xgb.XGBRegressor(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
        model_int.fit(X_train_subset, y_train)

        # Make a prediction over the test set
        y_pred_test = model_int.predict(X_test.drop(columns_to_drop, axis=1))

        # Calculate the new r2
        r2_score_int = r2_score(y_test, y_pred_test)
        print('New Test r2 = {}'.format((r2_score_int)))

        # Print the reference r2 (all features that have not been removed so far)
        print('All features Test r2 = {}'.format((r2_score_all)))

        # Determine the drop in the r2
        diff_r2 = r2_score_all - r2_score_int

        # Compare the drop in r2 with the tolerance which is set previously
        if diff_r2 >= tol:
            print('Drop in r2 ={}'.format(diff_r2))
            print('keep: ', feature)
        else:
            print('Drop in r2 = {}'.format(diff_r2))
            print('remove: ', feature)
            # if the drop in the r2 is small and we remove the feature, 
            # we need to set the new r2 to the one based on the remaining features
            r2_score_all = r2_score_int

            # Append the feature to remove to the collecting list
            features_to_remove.append(feature)

    # loop is finished, all the features were evaluated.
    print('DONE!!')
    print('Total features to remove: ', len(features_to_remove))

    # Determine the features to keep (those we won't remove)
    removed = set(features_to_remove)
    features_to_keep = [name for name in features_list if name not in removed]
    print('total features to keep: ', len(features_to_keep))
    
    return features_to_keep

## 4.c Hybrid Method: Recursive Feature Addition

This method consists of the following steps:

1) Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.

2) Build a machine learning model with only 1 feature, the most important one, and calculate the model metric for performance.

3) Add one feature -the next most important- and build a machine learning algorithm utilising the added feature and any features kept from previous rounds.

4) Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.

5) If the metric increases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.

6) Repeat steps 3-5 until all features have been added (and therefore evaluated) and the increase in performance assessed.


This is called a hybrid method because:

- it combines the importance derived from the machine learning algorithm like embedded methods,
- and it adds as well one feature at a time, and calculates a new metric based on the new subset of features and the machine learning algorithm of choice, like wrapper methods.

The difference between this method and step forward feature selection is that it does not first try adding every possible feature in order to determine which one to keep. It adds the most important one, based on the importance derived from the machine learning model, and then assesses whether that feature should be kept or not. It then moves on to the next feature.

This method is therefore faster than wrapper methods and generally better than embedded methods. In practice it works extremely well. It also accounts for correlations (depending on how stringent you set the arbitrary performance increase threshold). On the downside, the increase in performance used to decide whether a feature should be kept or removed is set arbitrarily. The smaller the required increase, the more features will be selected, and vice versa.

**Note** For the demonstration, an XGBoost model is built; however, this method is useful for any machine learning algorithm. In fact, the importance of the features is determined specifically for the algorithm used. Therefore, different algorithms may return different subsets of important features.

In [60]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

import xgboost as xgb

from sklearn.metrics import roc_auc_score, r2_score

**Classification**

In [62]:
# For classification 1/2 - After this function, you should run following function 
def hybrid_recursive_addition_1(X_train, X_test, y_train, y_test):
    """
    Train an XGBoost classifier on every available feature and rank the features by the
    importance the fitted model assigns to them (step 1/2 of recursive feature addition).
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set
            y_train (DataFrame): y_train set 
            y_test (DataFrame): y_test set
        Returns:
            features (Series): Importances indexed by feature name, sorted from the 
                               most to the least important.
            plot (Plot): Bar graph of the sorted importances.
    """
    print(f'Before applying hybrid recursive feature addition technique: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Fix numpy's global seed for reproducibility
    np.random.seed(1000000000)

    # Fit the reference model on the full feature set
    full_model = xgb.XGBClassifier(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
    full_model.fit(X_train, y_train)

    # Report the test-set roc-auc, using the probability of the second class
    positive_class_proba = full_model.predict_proba(X_test)[:, 1]
    full_model_auc = roc_auc_score(y_test, positive_class_proba)
    print('Test all features xgb ROC AUC=%f' % (full_model_auc))

    # Pair each importance with its column name, then order from most to least important
    ranked_importances = pd.Series(full_model.feature_importances_, index=X_train.columns)
    ranked_importances = ranked_importances.sort_values(ascending=False)

    # Bar graph of the ranked importances
    importance_plot = ranked_importances.plot.bar(figsize=(20,6))

    return ranked_importances, importance_plot

In [63]:
# For classification continue - 2/2
def hybrid_recursive_addition_2(features, threshold=0.0005):
    """
    Hybrid recursive feature addition for classification (step 2/2).

    1) Build a machine learning model with only 1 feature, the most important one, 
       and calculate the model metric for performance.
    2) Add one feature -the next most important- and build a machine learning algorithm utilising 
       the added feature and any features kept from previous rounds.
    3) Calculate the performance metric (roc-auc on the test set).
    4) If the metric increases by more than an arbitrarily set threshold, 
       then that feature is important and should be kept. Otherwise, we can remove that feature.
    5) Repeat steps 2-4 until all features have been added (and therefore evaluated) 
       and the increase in performance assessed.

    The data is not passed in: X_train, X_test, y_train and y_test are read from the enclosing
    (notebook) scope, so data_split must have been run beforehand.
            Parameters:
                features (Series): Features ranked from the most to the least important. The index 
                                   holds the feature names, the values hold the importances. 
                                   It is output of previous function.
                threshold (float): Minimum increase in roc-auc required to keep a feature. 
                                   The default value is 0.0005.
            Returns:
                features_to_keep (list): Names of the important features derived from hybrid 
                                         recursive feature selection. Empty if features is empty.
    """
    # Feature NAMES in ranking order. Indexing or iterating the Series itself yields the
    # importance values, which are not valid column labels for X_train / X_test.
    features_list = list(features.index)

    # Nothing to evaluate
    if not features_list:
        return []

    # Set the seed for reproducibility
    seed_val = 1000000000
    np.random.seed(seed_val)

    # Build the initial model using only the most important feature
    first_feature = features_list[0]
    model_one_feature = xgb.XGBClassifier(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
    model_one_feature.fit(X_train[[first_feature]], y_train)

    # Calculate the roc-auc in the test set
    y_pred_test = model_one_feature.predict_proba(X_test[[first_feature]])[:, 1]
    auc_score_first = roc_auc_score(y_test, y_pred_test)
    print('Test one feature xgb ROC AUC=%f' % (auc_score_first))

    # Adding one at a time all the features, from the most to the least important, and build an xgboost at each round.
    # Once we build the model, we calculate the new roc-auc. If it beats the roc-auc of the features
    # kept so far by at least the threshold, the added feature is important and we keep it.
    # Otherwise, we discard the feature.

    # Minimum increase in roc-auc: honour the caller's threshold
    tol = threshold

    print('Doing recursive feature addition')

    # Initialise a list where we will collect the features we should keep
    features_to_keep = [first_feature]

    total_features = len(features_list)

    # The first feature is already kept, so the candidates start at position 2
    for count, feature in enumerate(features_list[1:], start=2):
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', total_features)

        # Selected features plus the feature to be evaluated
        candidate_columns = features_to_keep + [feature]

        # Initialise and fit the model
        model_int = xgb.XGBClassifier(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
        model_int.fit(X_train[candidate_columns], y_train)

        # Make a prediction over the test set
        y_pred_test = model_int.predict_proba(X_test[candidate_columns])[:, 1]

        # Calculate the new roc-auc
        auc_score_int = roc_auc_score(y_test, y_pred_test)
        print('New Test ROC AUC={}'.format((auc_score_int)))

        # Print the reference roc-auc obtained with the features kept so far
        print('Kept features Test ROC AUC={}'.format((auc_score_first)))

        # Determine the increase in the roc-auc
        diff_auc = auc_score_int - auc_score_first

        # Compare the increase in roc-auc with the tolerance which is set previously
        if diff_auc >= tol:
            print('Increase in ROC AUC={}'.format(diff_auc))
            print('keep: ', feature)
            # if the increase in the roc is bigger than the threshold
            # we keep the feature and re-adjust the roc-auc to the new value
            # considering the added feature
            auc_score_first = auc_score_int

            # and we append the feature to keep to the list
            features_to_keep.append(feature)
        else:
            # we ignore the feature
            print('Increase in ROC AUC={}'.format(diff_auc))
            print('remove: ', feature)

    # Loop is finished, all the features are evaluated. 
    print('DONE!!')
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep

In practice you may need to run this method a few times to find the right threshold, depending on how many features you are willing to include in your model and how accurate you would like it to be.

**Regression**

In [64]:
# For regression 1/2 - After this function, you should run following function 
def hybrid_recursive_addition_1(X_train, X_test, y_train, y_test):
    """
    Train an XGBoost regressor on every available feature and rank the features by the
    importance the fitted model assigns to them (step 1/2 of recursive feature addition).
        Parameters:
            X_train (DataFrame): X_train set
            X_test (DataFrame): X_test set
            y_train (DataFrame): y_train set 
            y_test (DataFrame): y_test set
        Returns:
            features (Series): Importances indexed by feature name, sorted from the 
                               most to the least important.
            plot (Plot): Bar graph of the sorted importances.
    """
    print(f'Before applying hybrid recursive feature addition technique: X_train : {X_train.shape}, X_test: {X_test.shape}')

    # Fix numpy's global seed for reproducibility
    np.random.seed(1000000000)

    # Fit the reference model on the full feature set
    full_model = xgb.XGBRegressor(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
    full_model.fit(X_train, y_train)

    # Report the test-set r2
    test_predictions = full_model.predict(X_test)
    full_model_r2 = r2_score(y_test, test_predictions)
    print('Test all features xgb R2 = %f' % (full_model_r2))

    # Pair each importance with its column name, then order from most to least important
    ranked_importances = pd.Series(full_model.feature_importances_, index=X_train.columns)
    ranked_importances = ranked_importances.sort_values(ascending=False)

    # Bar graph of the ranked importances
    importance_plot = ranked_importances.plot.bar(figsize=(20,6))

    return ranked_importances, importance_plot

In [66]:
# For regression continue - 2/2
def hybrid_recursive_addition_2(features, threshold=0.001):
    """
    Hybrid recursive feature addition for regression (step 2/2).

    1) Build a machine learning model with only 1 feature, the most important one, 
       and calculate the model metric for performance.
    2) Add one feature -the next most important- and build a machine learning algorithm utilising 
       the added feature and any features kept from previous rounds.
    3) Calculate the performance metric (r2 on the test set).
    4) If the metric increases by more than an arbitrarily set threshold, 
       then that feature is important and should be kept. Otherwise, we can remove that feature.
    5) Repeat steps 2-4 until all features have been added (and therefore evaluated) 
       and the increase in performance assessed.

    The data is not passed in: X_train, X_test, y_train and y_test are read from the enclosing
    (notebook) scope, so data_split must have been run beforehand.
            Parameters:
                features (Series): Features ranked from the most to the least important. The index 
                                   holds the feature names, the values hold the importances. 
                                   It is output of previous function.
                threshold (float): Minimum increase in r2 required to keep a feature. 
                                   The default value is 0.001.
            Returns:
                features_to_keep (list): Names of the important features derived from hybrid 
                                         recursive feature selection. Empty if features is empty.
    """
    # Feature NAMES in ranking order. Indexing or iterating the Series itself yields the
    # importance values, which are not valid column labels for X_train / X_test.
    features_list = list(features.index)

    # Nothing to evaluate
    if not features_list:
        return []

    # Set the seed for reproducibility
    seed_val = 1000000000
    np.random.seed(seed_val)

    # Build the initial model using only the most important feature
    first_feature = features_list[0]
    model_one_feature = xgb.XGBRegressor(
        nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
    model_one_feature.fit(X_train[[first_feature]], y_train)

    # Calculate the r2 in the test set
    y_pred_test = model_one_feature.predict(X_test[[first_feature]])
    r2_score_first = r2_score(y_test, y_pred_test)
    print('Test one feature xgb R2=%f' % (r2_score_first))

    # Adding one at a time all the features, from the most to the least important, and build an xgboost at each round.
    # Once we build the model, we calculate the new r2. If it beats the r2 of the features
    # kept so far by at least the threshold, the added feature is important and we keep it.
    # Otherwise, we discard the feature.

    # Arbitrarily set the increase in r2
    # if the increase is above this threshold, the feature will be kept
    tol = threshold

    print('doing recursive feature addition')

    # Initialise a list where we will collect the features we should keep
    features_to_keep = [first_feature]

    total_features = len(features_list)

    # The first feature is already kept, so the candidates start at position 2
    for count, feature in enumerate(features_list[1:], start=2):
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', total_features)

        # Selected features plus the feature to be evaluated
        candidate_columns = features_to_keep + [feature]

        # Initialise and fit the model
        model_int = xgb.XGBRegressor(nthread=10, max_depth=4, n_estimators=500, learning_rate=0.05)
        model_int.fit(X_train[candidate_columns], y_train)

        # Make a prediction over the test set
        y_pred_test = model_int.predict(X_test[candidate_columns])

        # Calculate the new r2
        r2_score_int = r2_score(y_test, y_pred_test)
        print('New Test R2={}'.format((r2_score_int)))

        # Print the reference r2 obtained with the features kept so far
        print('Kept features Test R2={}'.format((r2_score_first)))

        # Determine the increase in the r2
        diff_r2 = r2_score_int - r2_score_first

        # Compare the increase in r2 with the tolerance which was set previously
        if diff_r2 >= tol:
            print('Increase in r2 = {}'.format(diff_r2))
            print('keep: ', feature)
            # If the increase in the r2 is bigger than the threshold
            # we keep the feature and re-adjust the r2 to the new value considering the added feature
            r2_score_first = r2_score_int

            # Append the feature to keep to the list
            features_to_keep.append(feature)
        else:
            # Ignore the feature
            print('Increase in r2 = {}'.format(diff_r2))
            print('remove: ', feature)

    # loop is finished, all the features were evaluated.
    print('DONE!!')
    print('total features to keep: ', len(features_to_keep))
    
    return features_to_keep

If we increase the threshold here, we should be able to reduce the number of features a bit further and potentially increase the performance of our model.

# 5. (Additional) Combining Feature Selectors

In [67]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

**Classification**

In [None]:
def combine_selectors(df, target_var, size=0.3, n=2):
    """
    Combine the feature-importance votes of different algorithms and select the features 
    which have at least as many votes as the defined threshold.

    Three selectors vote: LassoCV (non-zero coefficients), and recursive feature elimination 
    (RFE) wrapped around a RandomForestRegressor and a GradientBoostingRegressor. All three 
    are regression estimators. The number of features each RFE keeps is the number of 
    features LassoCV kept.
        Parameters:
            df (DataFrame): Any dataframe
            target_var (str): Target variable of the dataset.
            size (float): Test size in train/test split. The default value is 0.3. 
            n (int): Minimum number of votes a feature needs in order to be kept. 
                     If we want to make sure we don't lose any information,
                     we could select all features with at least one vote.
                     For default, we chose to have at least two models voting for
                     a feature in order to keep it. 
        Returns:
            reduced_X (DataFrame): The feature columns of df (all rows) restricted to the selected features. 
            selected_feat (list): The list of selected features. 
    """
    # separate dataset into train and test
    X = df.drop(labels=[target_var], axis=1)
    y = df[target_var]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= size, random_state=0)
    
    # Data Shape
    print(f'Before applying feature selection: X_train : {X_train.shape}, X_test: {X_test.shape}')
    
    # Feature selection with LassoCV 
    # The LassoCV class will use cross validation to try out different alpha 
    # settings and select the best one. 
    lcv = LassoCV()
    lcv.fit(X_train, y_train)
    lcv_mask = lcv.coef_ != 0

    # Number of features the tree-based selectors should keep. RFE needs at least one
    # feature, so fall back to 1 when Lasso shrinks every coefficient to zero.
    features_to_keep = max(int(lcv_mask.sum()), 1)

    # Feature selection with random forest
    rfe_rf = RFE(estimator=RandomForestRegressor(), n_features_to_select=features_to_keep, step=5, verbose=1)
    rfe_rf.fit(X_train, y_train)
    rf_mask = rfe_rf.support_

    # Feature selection with gradient boosting
    rfe_gb = RFE(estimator=GradientBoostingRegressor(),
                 n_features_to_select=features_to_keep, step=5, verbose=1)
    rfe_gb.fit(X_train, y_train)
    gb_mask = rfe_gb.support_

    # Combining the feature selectors: number of votes per feature column
    votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

    # Masking by voting threshold
    mask = votes >= n  
    
    # Apply the mask to the full feature matrix (the masks follow X's column order)
    reduced_X = X.loc[:, mask]
    
    selected_feat = reduced_X.columns.tolist()
    
    return reduced_X, selected_feat

In [None]:
# Plug the reduced dataset into a linear regression pipeline
# NOTE(review): X_reduced, y, lm and scaler are not defined in this cell; they are presumably
# created in earlier cells (lm presumably a fitted-by-this-cell linear model) - confirm.
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
# Fit the scaler on the training split and train the model on the scaled training features
lm.fit(scaler.fit_transform(X_train), y_train)
# Score on the test split, transformed with the scaler fitted on the training split
r_squared = lm.score(scaler.transform(X_test), y_test)
# Report the score as a percentage together with the number of model coefficients
print('The model can explain {0:.1%} of the variance in the test set using {1:} features.'.format(r_squared, len(lm.coef_)))

# Dimensionality Reduction 

## Feature Extraction - Principal Component Analysis (PCA) 

Feature extraction calculates new features based on the existing ones while trying to lose as little information as possible. It creates new features, which are in fact combinations of the original ones.

PCA:

For this technique, it is important to scale the features first, so that their values are easier to compare.

We can add a reference point to the very center of the point cloud, and then point a vector in the direction of this strongest pattern. We can add a second vector perpendicular to the first one to account for the rest of the variance in this dataset.

Every point in the dataset could be described by multiplying and then summing two perpendicular vectors. We essentially created a new reference system aligned with the variance in the data. The coordinates that each point has in this new reference system are called principal components, and they are the foundation of principal component analysis (PCA).

Principal components share no duplicate information, and they are ranked from most to least important.

**Principal Component Analysis**

In [None]:
# Calculating the principal components
# NOTE(review): df is not defined in this cell; it is presumably a numeric DataFrame
# loaded earlier in the notebook - confirm.

from sklearn.preprocessing import StandardScaler
# Standardize the features before PCA
scaler = StandardScaler()
std_df = scaler.fit_transform(df)

from sklearn.decomposition import PCA
# PCA() with no arguments; print the data projected onto the principal components
pca = PCA()
print(pca.fit_transform(std_df))

# Principal component explained variance ratio
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(std_df)
print(pca.explained_variance_ratio_)

# PCA for dimensionality reduction
# NOTE(review): ansur_std_df is not defined in this cell (the standardized data above is
# named std_df) - confirm which dataset is intended.
pca = PCA()
pca.fit(ansur_std_df)
# Cumulative share of variance explained by the first k components
print(pca.explained_variance_ratio_.cumsum())

# Understanding the components
# Each row of components_ is one principal component expressed in the original features
print(pca.components_)

In [None]:
# Create a pairplot to inspect ansur_df
# NOTE(review): ansur_df is not defined in this cell, and PCA is not imported here; both
# are presumably available from earlier cells - confirm.
sns.pairplot(ansur_df)

plt.show()

from sklearn.preprocessing import StandardScaler

# Create the scaler and standardize the data
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

# Create the PCA instance and fit and transform the data with pca
pca = PCA()
pc = pca.fit_transform(ansur_std)

# This changes the numpy array output back to a dataframe
# Four column names are hard-coded, so this only works when pc has exactly four columns
pc_df = pd.DataFrame(pc, columns=['PC 1', 'PC 2', 'PC 3', 'PC 4'])

# Create a pairplot of the principal component dataframe
sns.pairplot(pc_df)
plt.show()

In [None]:
# PCA on a larger dataset
# NOTE(review): ansur_df is not defined in this cell; presumably loaded earlier - confirm.

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Scale the data
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

# Apply PCA (no n_components given)
pca = PCA()
pca.fit(ansur_std)

# Inspect the explained variance ratio per component
print(pca.explained_variance_ratio_)

# Print the cumulative sum of the explained variance ratio
print(pca.explained_variance_ratio_.cumsum())

**PCA Applications**

In [None]:
# Understanding the components
# Uses the pca object left over from a previous cell
print(pca.components_)

# PCA in a pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Scale first, then reduce, in a single estimator
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA())])
pc = pipe.fit_transform(ansur_df)
# Print only the first two principal components
print(pc[:,:2])

# Checking the effect of categorical features
# NOTE(review): ansur_categories is not defined in this cell; presumably a DataFrame of the
# categorical columns with the same row order as ansur_df - confirm.
ansur_categories['PC 1'] = pc[:,0]
ansur_categories['PC 2'] = pc[:,1]
sns.scatterplot(data=ansur_categories, x='PC 1', 
                y='PC 2',hue='Height_class', alpha=0.4)

# PCA in a model pipeline
# NOTE(review): RandomForestClassifier is not imported in the visible part of the notebook,
# and X_train / y_train come from an earlier cell - confirm.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=3)),
    ('classifier', RandomForestClassifier())])
pipe.fit(X_train, y_train)
# steps[1] is the ('reducer', PCA) pair
print(pipe.steps[1])

# PCA in a model pipeline
# The cumulative sum below is computed but neither stored nor printed
pipe.steps[1][1].explained_variance_ratio_.cumsum()
print(pipe.score(X_test, y_test))

In [None]:
# Build the pipeline
# NOTE(review): poke_df, poke_cat_df, Pipeline, StandardScaler, PCA and RandomForestClassifier
# are not defined/imported in this cell; presumably available from earlier cells - confirm.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=2))])

# Fit it to the dataset and extract the component vectors
pipe.fit(poke_df)
# Component weights rounded to two decimals for readability
vectors = pipe.steps[1][1].components_.round(2)

# Print feature effects
# Map each original column name to its weight in the component
print('PC 1 effects = ' + str(dict(zip(poke_df.columns, vectors[0]))))
print('PC 2 effects = ' + str(dict(zip(poke_df.columns, vectors[1]))))

# PCA for feature exploration
# Build the pipeline
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=2))])

# Fit the pipeline to poke_df and transform the data
pc = pipe.fit_transform(poke_df)

print(pc)

# Add the 2 components to poke_cat_df
# Positional assignment: assumes poke_cat_df has the same rows, in the same order, as poke_df
poke_cat_df['PC 1'] = pc[:, 0]
poke_cat_df['PC 2'] = pc[:, 1]

print(poke_cat_df.head())

# Use the Type feature to color the PC 1 vs PC 2 scatterplot
sns.scatterplot(data=poke_cat_df, 
                x='PC 1', y='PC 2', hue='Type')
plt.show()

### PCA in a model pipeline

# Build the pipeline
# random_state=0 fixes the random forest's randomness
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reducer', PCA(n_components=2)),
        ('classifier', RandomForestClassifier(random_state=0))])

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Prints the explained variance ratio
print(pipe.steps[1][1].explained_variance_ratio_)

# Score the accuracy on the test set
accuracy = pipe.score(X_test, y_test)

# Prints the model accuracy
print('{0:.1%} test set accuracy'.format(accuracy))

**Principal Component Selection**

In [None]:
# Setting an explained variance threshold
# A float n_components asks PCA for the number of components needed to explain
# that share of the variance (here 90%)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=0.9))])

# Fit the pipe to the data
pipe.fit(poke_df)
# Number of components that were needed to reach the threshold
print(len(pipe.steps[1][1].components_))

# An optimal number of components --- > 'Elbow' in the plot
pipe.fit(poke_df)
var = pipe.steps[1][1].explained_variance_ratio_
plt.plot(var)
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

# Compressing images
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('reducer', PCA(n_components=290))])
pipe.fit(X_train)
# Only transform the test set: fit_transform would refit the scaler and the PCA on
# X_test and throw away what was learned from X_train
pc = pipe.transform(X_test)
print(pc.shape)

# Rebuilding images
pc = pipe.transform(X_test)
print(pc.shape)

# Map the components back to the original feature space
X_rebuilt = pipe.inverse_transform(pc)
print(X_rebuilt.shape)

# NOTE(review): img_plotter is not defined in the visible part of the notebook - confirm.
img_plotter(X_rebuilt)

In [None]:
# Selecting the proportion of variance to keep
# Pipe a scaler to PCA selecting 80% of the variance
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=0.8))])

# Fit the pipe to the data
pipe.fit(ansur_df)

# Number of components that were needed to reach the 80% threshold
print('{} components selected'.format(len(pipe.steps[1][1].components_)))


## Choosing the number of components ##

# Pipeline a scaler and pca selecting 10 components
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=10))])

# Fit the pipe to the data
pipe.fit(ansur_df)

# Plot the explained variance ratio
plt.plot(pipe.steps[1][1].explained_variance_ratio_)

plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

## PCA for image compression ## 

# Plot the MNIST sample
# NOTE(review): plot_digits and X_test are not defined in this cell; presumably provided
# by an earlier cell - confirm.
plot_digits(X_test)


# Transform the input data to principal components
# NOTE(review): at this point pipe is the 10-component pipeline fitted on ansur_df above;
# it looks like a pipeline fitted on the image training data is expected here - confirm.
pc = pipe.transform(X_test)

# Prints the number of features per dataset
print("X_test has {} features".format(X_test.shape[1]))
print("pc has {} features".format(pc.shape[1]))

# Inverse transform the components to original feature space
X_rebuilt = pipe.inverse_transform(pc)

# Prints the number of features
print("X_rebuilt has {} features".format(X_rebuilt.shape[1]))


# Transform the input data to principal components
pc = pipe.transform(X_test)

# Inverse transform the components to original feature space
X_rebuilt = pipe.inverse_transform(pc)

# Plot the reconstructed data
plot_digits(X_rebuilt)