In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.feature_selection import f_regression
from matplotlib import pyplot
from sklearn.feature_selection import SequentialFeatureSelector,SelectFromModel

### PCA alone

In [2]:
# PCA for dimensionality reduction.
# Returns the PCA-projected X_train and X_test as DataFrames.
def pca_red (X_train, y_train, X_test, k_features, n_comp, verbose):
    """Fit a PCA on X_train and project both X_train and X_test onto it.

    Parameters
    ----------
    X_train : 2-D feature data used to fit the PCA. Its ``columns`` attribute,
        when present, supplies the feature names for the verbose report.
    y_train : unused by this function.
    X_test : 2-D feature data projected with the PCA fitted on X_train.
    k_features : how many of the highest-loading original features are listed
        per component in the verbose report. It does not affect the returned
        data.
    n_comp : passed unchanged to ``PCA(n_components=...)``.
    verbose : when truthy, print the explained variance per component, the
        eigenvalues and singular values, the component vectors, and the top
        ``k_features`` features of the last component.

    Returns
    -------
    (df_train, df_test) : DataFrames whose columns are ``PC0``, ``PC1``, ...
        holding the projected train and test data.
    """
    pca = PCA(n_components=n_comp)
    pca.fit(X_train)

    if verbose:
        tve = 0
        for i, ve in enumerate(pca.explained_variance_ratio_):
            tve += ve
            print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i, ve, tve) )
        print()
        # explained_variance_ holds the covariance eigenvalues; the singular
        # values are a different quantity and are reported under their own label.
        print("Actual Eigenvalues:", pca.explained_variance_)
        print("Singular values:", pca.singular_values_)
        for i, comp in enumerate(pca.components_):
            print("PC",i, "-->", comp)

    nX_train = pca.transform(X_train)
    nX_test = pca.transform(X_test)

    # Wrap the projected arrays in DataFrames with PC0..PCn column names.
    df_train = pd.DataFrame(nX_train, columns=[f"PC{i}" for i in range(nX_train.shape[1])])
    df_test = pd.DataFrame(nX_test, columns=[f"PC{i}" for i in range(nX_test.shape[1])])

    if verbose:
        print(f"n_comp {n_comp} k_features {k_features}")

        # Fall back to positional labels when X_train carries no column names.
        feature_names = pd.Index(getattr(X_train, "columns", range(pca.components_.shape[1])))

        # Iterate over the fitted components rather than range(n_comp): n_comp
        # may be None, a float variance fraction or "mle", none of which can be
        # used as a loop bound or list index.
        top_features = []
        for i, weights in enumerate(pca.components_):
            # Rank by absolute loading: a large negative weight contributes to
            # the component as much as a large positive one.
            strongest = np.argsort(np.abs(weights))[::-1][:k_features]
            top_features.append((i, feature_names[strongest]))

        if top_features:
            last = len(top_features) - 1
            print (f"top_features k_features= {k_features} for  PCA={last} {top_features[last]}")

    return df_train, df_test

### Feature selection

In [3]:
# feature selection with SelectKBest
# return X_train, X_test
def select_features(X_train, y_train, X_test, k_param, n_comp, verbose):
    """Keep the ``k_param`` best features according to ``f_regression`` scores.

    Parameters
    ----------
    X_train : DataFrame of training features; its ``columns`` supply the names
        of the selected features.
    y_train : training target used to score the features.
    X_test : test features reduced to the same selected columns.
    k_param : passed unchanged to ``SelectKBest(k=...)``.
    n_comp : unused by this function.
    verbose : when truthy, print the selected column names and one line per
        selected feature with its score.

    Returns
    -------
    (X_train_fs, X_test_fs) : DataFrames holding only the selected columns,
        labelled with the original column names.

    Side effects
    ------------
    Regardless of ``verbose``, the score ranking of the selected features is
    printed through ``print_full`` and a bar chart of every feature's score is
    shown with ``pyplot``.
    """
    # Score every feature against the target using the training data only.
    fs = SelectKBest(score_func=f_regression, k=k_param)
    fs.fit(X_train, y_train)

    # Boolean mask over the ORIGINAL columns, plus the matching positions.
    mask = fs.get_support()
    selected_positions = np.flatnonzero(mask)
    selected_feature_names = X_train.columns[mask]

    X_train_fs = pd.DataFrame(fs.transform(X_train), columns=selected_feature_names)
    X_test_fs = pd.DataFrame(fs.transform(X_test), columns=selected_feature_names)

    if verbose:
        print(f"select_features X_train_fs.columns: {X_train_fs.columns}")

    # fs.scores_ has one entry per original feature, so it must be indexed by
    # the original column position, not by the position among the selected
    # columns. "i" in the report is therefore the original column position.
    feature_results = []
    for position, column in zip(selected_positions, selected_feature_names):
        i = int(position)
        if verbose:
            print (f"Feature i: {i} column: {column} fs.scores_[i] {fs.scores_[i]}")
        feature_results.append((i, column, fs.scores_[i]))

    # Highest score first.
    feature_results.sort(key=lambda record: record[2], reverse=True)

    # Passing columns= to the constructor also works when nothing was selected.
    final_feature_results = pd.DataFrame(feature_results, columns=["i", "column", "score"])

    print_full(final_feature_results,500) # at most the 500 best features

    # Bar chart of the scores of all features, selected or not.
    pyplot.bar(list(range(len(fs.scores_))), fs.scores_)
    pyplot.show()

    return X_train_fs, X_test_fs #, fs




# Print the first n_rows rows with pandas row/column truncation disabled
def print_full (dataset, n_rows):
    """Print the first ``n_rows`` rows of ``dataset`` without pandas truncation.

    Parameters
    ----------
    dataset : object exposing ``head(n_rows)`` (e.g. a DataFrame).
    n_rows : number of leading rows to print.

    The row/column limits are lifted and the display width is set to 1000
    only while printing.
    """
    # option_context puts back whatever values the caller had configured
    # (not the pandas defaults) and does so even if printing raises.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', 1000):
        print (f"dataset \n {dataset.head(n_rows)}")

In [5]:
#Random forest feature selection

def rf_feature_selection (X_train, y_train, X_test, k_features,n_comp, verbose):
    """Select features with ``SelectFromModel`` wrapped around a random forest.

    The selector is built with ``max_features=k_features`` and
    ``threshold=-np.inf`` around a ``RandomForestRegressor(random_state=0)``,
    then fitted on the training data.

    Parameters
    ----------
    X_train : DataFrame of training features (its ``shape`` and ``columns``
        are read).
    y_train : training target.
    X_test : test features reduced with the selector fitted on X_train.
    k_features : passed as ``max_features`` to the selector.
    n_comp : unused by this function.
    verbose : when truthy, print the fitted importances, the selector
        threshold and the selected column names.

    Returns
    -------
    (df_train, df_test) : DataFrames containing the selected columns,
        labelled with the original column names.
    """
    _, n_columns = X_train.shape

    selector = SelectFromModel(
        estimator=RandomForestRegressor(random_state=0),
        max_features=k_features,
        threshold=-np.inf,
    )
    selector.fit(X_train, y_train)

    if verbose:
        print("Importances: ", selector.estimator_.feature_importances_)
        print("Default threshold: ", selector.threshold_)

    # Turn the boolean support mask into column positions, then into names.
    kept_positions = np.arange(n_columns)[selector.get_support()]

    reduced_train = selector.transform(X_train)
    reduced_test = selector.transform(X_test)

    kept_names = X_train.columns[kept_positions]
    df_train = pd.DataFrame(reduced_train, columns=kept_names)
    df_test = pd.DataFrame(reduced_test, columns=kept_names)

    if verbose:
        print(f"rf_feature_selection df_train.columns: {df_train.columns}")

    return df_train, df_test

In [6]:
# Stepwise for feature selection
# Return X_train and X_test (dataframes)
def stepwise_feature_selection (X_train, y_train, model_regressor_w_params, k_features,direction_param, verbose, X_test=None):
    """Select ``k_features`` columns with a ``SequentialFeatureSelector``.

    Parameters
    ----------
    X_train : DataFrame of training features; its ``columns`` supply the names
        of the selected features.
    y_train : training target.
    model_regressor_w_params : estimator handed to the sequential selector.
    k_features : passed as ``n_features_to_select``.
    direction_param : passed as ``direction`` (e.g. ``'forward'`` or
        ``'backward'``).
    verbose : when truthy, print progress traces and the selected columns.
    X_test : test features to reduce with the fitted selector. For backward
        compatibility with callers that used the six-argument form, when it
        is omitted a notebook-level variable named ``X_test`` is used.

    Returns
    -------
    (df_train, df_test) : DataFrames holding the selected columns, labelled
        with the original column names.

    Raises
    ------
    NameError : if ``X_test`` is neither passed nor defined at notebook level.
    """
    if X_test is None:
        # The earlier version read X_test from notebook state. Keep that
        # working, but fail before the (potentially slow) fit if it is missing.
        try:
            X_test = globals()["X_test"]
        except KeyError:
            raise NameError(
                "stepwise_feature_selection needs X_test: pass it as the X_test argument"
            ) from None

    if verbose:
        print(f"in stepwise_feature_selection w/ regressor selector {model_regressor_w_params} in direction {direction_param}")

    regressor = model_regressor_w_params

    if verbose:
        print ("before SequentialFeatureSelector")
    sfs = SequentialFeatureSelector(regressor, n_features_to_select=k_features,direction= direction_param) #direction='backward'
    if verbose:
        print ("after SequentialFeatureSelector")
    sfs.fit(X_train, y_train)
    if verbose:
        print ("after fit")

    # Positions of the selected columns, taken directly from the support mask
    # so no notebook-level column count is needed.
    features = sfs.get_support()
    Features_selected = np.flatnonzero(features)
    if verbose:
        print(f"The features selected k= {k_features} are columns: {Features_selected} with regressor selector {model_regressor_w_params} in direction {direction_param}")

    if verbose:
        print ("before trasform X_train")
    nX_train = sfs.transform(X_train)
    if verbose:
        print ("after trasform X_train")
    nX_test = sfs.transform(X_test)
    if verbose:
        print ("after trasform X_test")

    # Wrap the reduced arrays in DataFrames labelled with the original names.
    selected_names = X_train.columns[Features_selected]
    df_train = pd.DataFrame(nX_train, columns=selected_names)
    df_test = pd.DataFrame(nX_test, columns=selected_names)

    if verbose:
        print(f"stepwise_feature_selection df_train.columns: {df_train.columns}")

    return df_train, df_test