In [2]:
import numpy as np 
import math
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.metrics import mean_squared_error, mean_absolute_error
import scipy.stats as st

In [3]:
def _cv_losses(algo, X_fit, X_eval, y, K, loss):
    """Return the K per-fold losses of ``algo`` trained on rows of ``X_fit``
    and scored on the matching held-out rows of ``X_eval``."""
    fold_losses = []
    for train_index, test_index in KFold(n_splits=K).split(X_eval):
        algo.fit(X_fit[train_index], y[train_index])
        fold_losses.append(loss(y[test_index], algo.predict(X_eval[test_index])))
    return fold_losses


def _tune_level(algo, X, y, levels, perturb, M, K, loss):
    """Return the entry of ``levels`` with the lowest mean CV loss, averaged
    over ``M`` independent draws of ``perturb(level)``."""
    best_level, best_loss = None, None
    for level in levels:
        losses = []
        for _ in range(M):
            # Train on the perturbed design matrix, score on the clean one.
            losses.extend(_cv_losses(algo, perturb(level), X, y, K, loss))
        mean_loss = np.mean(losses)
        if best_loss is None or mean_loss < best_loss:
            best_level, best_loss = level, mean_loss
    return best_level


def _max_norm_perturbation(rng, shape, col_norms, n_draws):
    """Draw ``n_draws`` uniform random matrices whose j-th column is rescaled to
    Euclidean norm ``col_norms[j]`` and return the one with the largest matrix 2-norm."""
    best_matrix, best_norm = None, -np.inf
    for _ in range(n_draws):
        candidate = rng.rand(*shape)
        candidate *= col_norms / np.linalg.norm(candidate, axis=0)
        spectral_norm = np.linalg.norm(candidate, 2)
        if spectral_norm > best_norm:
            best_norm, best_matrix = spectral_norm, candidate
    return best_matrix


def tune_bb(algo, X, y, regularization="Dropout", M=10, c=None, K=5, criterion="MSE",
            random_state=None, max_iter=100):

    """function to automatically tune blackbox regression model

    Parameters:
    -----------

    algo : estimator object
        A learning algorithm that takes as input a matrix X in R nxp
        and a vector of responses Y in Rn. Must have methods .fit() and
        .predict(). It is fitted in place (no copy is made).
    X : array-like of shape (n,p)
        training data X in R nxp. It is standardized internally with
        StandardScaler before any perturbation is applied.
    y : array-like of shape (n,)
        training labels, Y, in Rn
    regularization : str, default="Dropout"
        regularization method, can be any of "Dropout",
        "NoiseAddition", or "Robust"
    M : int, default=10
        A positive integer indicating the number of Monte Carlo
        replicates to be used if the method specified is Dropout or
        NoiseAddition
    c : array-like of shape (p,), default=None
        A vector of column bounds to be used if method specified is "Robust".
        Required in that case.
    K : int, default=5
        A positive integer indicating the number of CV-folds to be used to
        tune the amount of regularization, e.g., K = 5 indicates five-fold CV
    criterion : str, default="MSE"
        A criterion to be used to evaluate the method that belongs to the set
        {MSE, MAE} where MSE encodes mean square error and MAE encodes mean
        absolute error. "MAD" is accepted as an alias of "MAE".
    random_state : int or None, default=None
        Seed for the random perturbations. When None the global ``np.random``
        state is used (the historical behaviour).
    max_iter : int, default=100
        Upper bound on the number of weight-update iterations of the "Robust"
        search, so that it always terminates.

    Returns:
    -----------
    tuned_model :
        Whatever ``algo.fit`` returns after being fitted on the standardized,
        perturbed design matrix built with the selected amount of
        regularization. The scaler is not returned, so the model expects
        standardized inputs.

    Raises:
    -----------
    ValueError
        If ``criterion`` or ``regularization`` is not recognised, if ``M`` is
        not positive for "Dropout"/"NoiseAddition", or if ``c`` is missing or
        has the wrong length for "Robust".
    """

    # statements here to ensure model has the methods we need to tune it
    assert hasattr(algo, "fit"), "model object must have .fit() method"
    assert hasattr(algo, "predict"), "model object must have .predict() method"

    if criterion == "MSE":
        criterion = "neg_mean_squared_error"
    elif criterion in ("MAE", "MAD"):
        criterion = "neg_mean_absolute_error"
    else:
        raise ValueError("Please input either MAE or MSE for criterion.")

    # Validate the cheap arguments up front so bad calls fail before any fitting.
    if regularization not in ("Dropout", "NoiseAddition", "Robust"):
        raise ValueError('Please input one of of "Dropout", "NoiseAddition", or "Robust"')
    if regularization == "Robust":
        if c is None:
            raise ValueError('c (one bound per column of X) is required when regularization="Robust"')
    elif M < 1:
        raise ValueError("M must be a positive integer.")

    # Fall back to numpy's global generator so callers relying on np.random.seed still work.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    y = np.asarray(y)  # positional fancy indexing below needs an ndarray

    # Standardize X (useful for all methods with regularization)
    X = StandardScaler().fit_transform(X)
    loss = mean_squared_error if criterion == "neg_mean_squared_error" else mean_absolute_error

    if regularization == "Dropout":

        def drop_entries(phi):
            # Each entry is kept with probability phi and zeroed otherwise.
            return rng.binomial(1, phi, size=X.shape) * X

        best_phi = _tune_level(algo, X, y, np.linspace(0, 1, 100), drop_entries, M, K, loss)

        # make a dropout matrix with the best choice of phi
        tuned_model = algo.fit(drop_entries(best_phi), y)

    elif regularization == "NoiseAddition":

        def add_noise(lam):
            # numpy's `scale` argument is the standard deviation, so lam is used as-is.
            return X + rng.normal(0, lam, size=X.shape)

        best_lambda = _tune_level(algo, X, y, np.linspace(0, 5, 100), add_noise, M, K, loss)
        tuned_model = algo.fit(add_noise(best_lambda), y)

    else:  # "Robust"
        bounds = np.asarray(c, dtype=float)
        if bounds.shape != (X.shape[1],):
            raise ValueError("c must contain exactly one bound per column of X.")

        toler = 5e-4  # stop once the CV error changes by no more than this between iterations
        wts = bounds / 2  # initial column norms are c/2
        previous_error = np.inf
        for itera in range(max_iter):
            # Perturb X with the largest-norm candidate found for the current column norms.
            new_X = X + _max_norm_perturbation(rng, X.shape, wts, 1000)
            current_error = np.mean(np.abs(cross_val_score(algo, new_X, y, cv=K, scoring=criterion)))
            # Converge on a small step-to-step change rather than on a strict decrease,
            # because each iteration depends on a fresh random set of candidate matrices.
            if not abs(previous_error - current_error) > toler:
                break
            previous_error = current_error
            # Random walk on the column norms with a step size shrinking like exp(-itera/2),
            # clipped above by c and below by 0.1.
            step = rng.normal(size=wts.shape) * math.exp(-itera / 2)
            wts = np.maximum(0.1, np.minimum(wts + step, bounds))

        # Once we have our column norms, use a larger search to construct the final model
        new_X = X + _max_norm_perturbation(rng, X.shape, wts, 10000)
        tuned_model = algo.fit(new_X, y)

    return tuned_model


### Function Demonstration

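Below we run our function on the iris data with each of the three regularization methods. Because every returned model is then refit on the same original (unperturbed) `X, y` before its coefficients are printed, the three methods give identical answers for the coefficients.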

In [4]:
# View the docstring; help() is plain Python, so it also works outside IPython
help(tune_bb)

In [5]:
# Load the complete iris data set; X holds the features and y the target used for training
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [19]:

# Tune a fresh Ridge model with each regularization method
# (evaluated in the order Robust, Dropout, NoiseAddition).
Robust_Ridge, Dropout_Ridge, Noise_Ridge = (
    tune_bb(Ridge(), X, y, regularization="Robust", c=[4, 4, 6, 3], criterion="MSE"),
    tune_bb(Ridge(), X, y, regularization="Dropout", criterion="MSE"),
    tune_bb(Ridge(), X, y, regularization="NoiseAddition", criterion="MSE"),
)

# NOTE(review): every model is refit here on the original X, y, so the
# coefficients printed below come from this refit rather than from the fit
# performed inside tune_bb.
rr, dr, nr = (tuned.fit(X, y) for tuned in (Robust_Ridge, Dropout_Ridge, Noise_Ridge))

for label, fitted in (
    ("Robust_Ridge Regression Coefficients : ", rr),
    ("Dropout_Ridge Regression Coefficients : ", dr),
    ("Noise_Ridge Regression Coefficients : ", nr),
):
    print(label, fitted.coef_)

Robust_Ridge Regression Coefficients :  [-0.11346491 -0.03184254  0.25936799  0.53764103]
Dropout_Ridge Regression Coefficients :  [-0.11346491 -0.03184254  0.25936799  0.53764103]
Noise_Ridge Regression Coefficients :  [-0.11346491 -0.03184254  0.25936799  0.53764103]


### Resources and Notes: 

1. https://www.statology.org/k-fold-cross-validation-in-python/