In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import glob
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from scipy import stats
import sys
import scipy.stats.distributions as dist
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.gaussian_process.kernels import DotProduct
from sklearn import preprocessing
import sklearn
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
import sklearn
from sklearn.preprocessing import PolynomialFeatures

## Start Dimensionality Reduction Related Code

## Load Data Sets & Perform Dimensionality Reduction

In [2]:
""" Give the path to the training input files as the function argument, the function then 
loads the initial training and test data set"""
def load_data(path,guidance):
    train_data = pd.read_csv(path, index_col=0)
    # Change all the Paths in this function depending upon the dimensionality {50D=200,100D=400,200D=800}
    if (guidance == 2):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_2_200Samples.csv"
    elif (guidance == 3):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_3_200Samples.csv"
    elif (guidance == 7):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_7_200Samples.csv"
    elif (guidance == 9):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_9_200Samples.csv"
    elif (guidance == 10):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_10_200Samples.csv"
    elif (guidance == 13):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_13_200Samples.csv"
    elif (guidance == 15):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_15_200Samples.csv"
    elif (guidance == 16):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_16_200Samples.csv"
    elif (guidance == 20):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_20_200Samples.csv"
    elif (guidance == 24):
        path_test = "Data Generation\\50 D\\Test_Data_Sets\\test_24_200Samples.csv"
    test_data = pd.read_csv(path_test, index_col=0)
    data = pd.concat([ train_data, test_data ], ignore_index = True) 
    if ((data.iloc[:1000,:] == train_data).sum().sum() == train_data.shape[0] * train_data.shape[1] ):
        if ((data.iloc[1000:,:].reset_index().iloc[:,1:] == test_data).sum().sum() == test_data.shape[0] * test_data.shape[1]):
            print ('Successfully Merged!!!')
            return data
    else:
        return 'Error'

""" Performs Linear Transofrmation on the Data Set according to the paper
Don't change anything in this function """
def linear_transformation(path, guidance):
    data = load_data(path, guidance)
    cols = data.columns[:-1]
    # Sort all the Rows based on the value of Y = f(x) 
    # make sure that the rows with the minimum value of Y have lower weight
    data = data.sort_values(by = ['Y'], ascending = True).reset_index()
    # make sure tha the data set has been sorted/ranked in this order correctly
    if (np.sum([ data ['Y'][i] < data ['Y'][i+1] for i in range(0, len(data)-1) ])  == len(data)-1):
        print ('All Clear, Data Set Sorted in Ascending order !!!')
        # Compute the pre weights (unnormalized) 
        pre_weights = [ np.log(len(data)) - np.log(i) for i in range(1, len(data)+1) ]
        # Compute normalized weights 
        weights = np.diag ([ pre_weights[i] / np.sum(pre_weights) for i in range(len(pre_weights)) ])
        X = data.iloc[:,1:-1]
        # remove the sample mean from the data set
        X_scaled = preprocessing.scale(X)
        # rescale all the features based on the paper
        X_rescaled =  pd.DataFrame(np.matmul(X_scaled.T, weights).T , columns=cols)
        # decompose into the train and test data set again
        X_train = X_rescaled.iloc[data.loc[data['index'] < 1000, :].index, :].reset_index().iloc[:,1:]
        X_test = X_rescaled.iloc[data.loc[data['index'] >= 1000, :].index, :].reset_index().iloc[:,1:]
        return X_train, X_test
    else:
        return 'Error'

""" This is the method that implements the dimensionality reduction based on PCA """
def perform_dimensionality_reduction (path, guidance):
    #  Change the value in the line after this based on dimensionality reduction {0.3=30 %,0.7=70 %,0.9=90 %}
    n_components = int(50-0.3 * 50) # 30 % Dimensionality Reduction
    train_data, test_data = linear_transformation(path, guidance)
    scalar = MinMaxScaler().fit(train_data)
    train_data = pd.DataFrame(scalar.transform (train_data)) 
    test_data = pd.DataFrame(scalar.transform (test_data)) 
    pca = PCA(n_components).fit(train_data.values)
    train_data = pd.DataFrame(pca.transform(train_data.values))
    test_data = pd.DataFrame(pca.transform(test_data.values))
    cols = []
    for i in range(train_data.shape[1]):
        cols.append(str('Z'+str(i+1)))
    train_data.columns = cols
    test_data.columns = cols
    #  Change the Values below depending on dimensionality and % reduction
    train_data.to_csv('PCA_ELN_50D_30%_latent_training_f'+path.split('\\')[-1].split('_')[1][:2]+'.csv') 
    test_data.to_csv('PCA_ELN_50D_30%_latent_test_f'+path.split('\\')[-1].split('_')[1][:2]+'.csv') 

## End Dimensionality Reduction Related Code

## Load New Reduced Data Sets for all Test Cases

In [3]:
def load_f2(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f2.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f2.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    #  Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-42]+str('Test_Data_Sets\\test_2_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f3(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f3.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f3.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    # hange the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-42]+str('Test_Data_Sets\\test_3_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f7(path):
    #  Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f7.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f7.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    #  Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-42]+str('Test_Data_Sets\\test_7_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f9(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f9.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f9.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    # Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-42]+str('Test_Data_Sets\\test_9_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f10(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f10.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f10.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    # Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-43]+str('Test_Data_Sets\\test_10_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f13(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f13.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f13.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    # Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-43]+str('Test_Data_Sets\\test_13_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f15(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f15.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f15.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    # Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-43]+str('Test_Data_Sets\\test_15_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f16(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f16.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f16.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    # Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-43]+str('Test_Data_Sets\\test_16_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f20(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f20.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f20.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    #  Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-42]+str('Test_Data_Sets\\test_20_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

def load_f24(path):
    # Change the Values below depending on dimensionality and % reduction
    path_latent_train = "PCA_ELN_50D_30%_latent_training_f24.csv"
    path_latent_test =  "PCA_ELN_50D_30%_latent_test_f24.csv"
    train_y = pd.read_csv(path).iloc[:,-1]
    # Change the Path Here depending upon the dimensionality {50D=200,100D=400,200D=800}
    test_y = pd.read_csv(path[:-42]+str('Test_Data_Sets\\test_24_200Samples.csv')).iloc[:,-1]
    train = pd.read_csv(path_latent_train, index_col = 0)
    test = pd.read_csv(path_latent_test, index_col = 0)
    train ['Y'] = train_y
    test ['Y'] = test_y
    del train_y
    del test_y
    true = np.array(test['Y'])
    return train,test,true

## Start Polynomial Surrogate Modelling Code

In [4]:
''' Elastic Net Regression '''
def elastic_net(train_data,test_data,hyper):
    alp, rat = hyper
    scaler =  MinMaxScaler().fit(np.r_[train_data.iloc[:,:-1].values])
    regr = ElasticNet(alpha= np.power(10,alp) ,random_state=0 , l1_ratio=rat, fit_intercept =True, max_iter=3000,selection='random').fit(scaler.transform ( np.array(train_data.iloc[:,:-1])) ,  np.array(train_data.iloc[:,-1]))
    pred = regr.predict(scaler.transform(test_data))
    return regr,pred

""" Generating Polynomial Features i.e., Function Basis """
def quadratic_polynomial (df):
    return pd.DataFrame(PolynomialFeatures(degree=2).fit_transform(df))

""" Quadratic Regression with Elastic Net Penalty"""
def polynomial(tr, te,hyper):
    f_original = tr['Y']
    temp1 = quadratic_polynomial (tr.iloc[:,:-1])
    temp2 = quadratic_polynomial (te.iloc[:,:-1])
    temp1 ['Y'] = f_original
    model_eln , pred_eln = elastic_net(temp1,temp2,hyper)
    return model_eln , pred_eln


""" Normalized Mean Absolute Error % """
def rmae(true, pred):
    return np.mean((abs(true-pred) / abs(true)) * 100)

""" This method implements and evaluates the Polynomial Surrogate Model with RMAE """
def surrogate_model(train_data,test_data,hyper,true):
    model_eln , pred_eln = polynomial (train_data,test_data,hyper)
    return rmae(true,pred_eln)

""" Implements all the surrogate models, i.e., for all test function, and returns the median of RMAE errors,
This median is used as the primary metric for Hyper-Parameters Optimization """
def perform_surrogate_modeling(paths,hyper):
    train_2, test_2, true_2 = load_f2(paths[0])
    rmae_2 = surrogate_model(train_2, test_2,hyper, true_2)
    
    train_3, test_3, true_3 = load_f3(paths[1])
    rmae_3 = surrogate_model(train_3, test_3,hyper, true_3)
    
    train_7, test_7, true_7 = load_f7(paths[2])
    rmae_7 = surrogate_model(train_7, test_7,hyper, true_7)
    
    train_9, test_9, true_9 = load_f9(paths[3])
    rmae_9 = surrogate_model(train_9, test_9,hyper, true_9)
    
    train_10, test_10, true_10 = load_f10(paths[4])
    rmae_10 = surrogate_model(train_10, test_10,hyper, true_10)
    
    train_13, test_13, true_13 = load_f13(paths[5])
    rmae_13 = surrogate_model(train_13, test_13,hyper, true_13)
    
    train_15, test_15, true_15 = load_f15(paths[6])
    rmae_15 = surrogate_model(train_15, test_15,hyper, true_15)
    
    train_16, test_16, true_16 = load_f16(paths[7])
    rmae_16 = surrogate_model(train_16, test_16,hyper, true_16)
    
    train_20, test_20, true_20 = load_f20(paths[8])
    rmae_20 = surrogate_model(train_20, test_20,hyper, true_20)
    
    train_24, test_24, true_24 = load_f24(paths[9])
    rmae_24 = surrogate_model(train_24, test_24,hyper, true_24)
    
    accuracy = [rmae_2,rmae_3,rmae_7,rmae_9,rmae_10,rmae_13,rmae_15,rmae_16,rmae_20,rmae_24]
    return accuracy


""" This is the function used for Hyper_Parameters_Optimization for both dimensionality reduction and surrogate modelling """
def hyper_parameters_optimization(paths,hyper):
    print ('Start Dimensionality Reduction:::')
    perform_dimensionality_reduction (paths[0], 2)
    perform_dimensionality_reduction (paths[1], 3)
    perform_dimensionality_reduction (paths[2], 7)
    perform_dimensionality_reduction (paths[3], 9)
    perform_dimensionality_reduction (paths[4], 10)
    perform_dimensionality_reduction (paths[5], 13)
    perform_dimensionality_reduction (paths[6], 15)
    perform_dimensionality_reduction (paths[7], 16)
    perform_dimensionality_reduction (paths[8], 20)
    perform_dimensionality_reduction (paths[9], 24)
    print ('End Dimensionality Reduction:::')
    accuracy = perform_surrogate_modeling (paths,hyper)
    return accuracy

## End Surrogate Modelling Code

## Set Paths

In [5]:
# Change the Paths here depending upon the dimensionality {50D=1000,100D=2000,200D=4000} 
path_2 = "Data Generation\\50 D\\Training_Data_Sets\\train_2_1000Samples.csv"
path_3 = "Data Generation\\50 D\\Training_Data_Sets\\train_3_1000Samples.csv"
path_7 = "Data Generation\\50 D\\Training_Data_Sets\\train_7_1000Samples.csv"
path_9 = "Data Generation\\50 D\\Training_Data_Sets\\train_9_1000Samples.csv"
path_10 = "Data Generation\\50 D\\Training_Data_Sets\\train_10_1000Samples.csv"
path_13 = "Data Generation\\50 D\\Training_Data_Sets\\train_13_1000Samples.csv"
path_15 = "Data Generation\\50 D\\Training_Data_Sets\\train_15_1000Samples.csv"
path_16 = "Data Generation\\50 D\\Training_Data_Sets\\train_16_1000Samples.csv"
path_20 = "Data Generation\\50 D\\Training_Data_Sets\\train_201000Samples.csv"
path_24 = "Data Generation\\50 D\\Training_Data_Sets\\train_241000Samples.csv"
paths = [path_2,path_3,path_7,path_9,path_10,path_13,path_15,path_16,path_20,path_24]

## Run

In [6]:
hyper_surrogate = [-0.7709327273466755, 0.48586287919279614]
accuracy = hyper_parameters_optimization(paths,hyper_surrogate)
print ('The Median accuracy:::'+str(np.median(accuracy)))

Start Dimensionality Reduction:::
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
Successfully Merged!!!
All Clear, Data Set Sorted in Ascending order !!!
End Dimensionality Reduction:::
The Median accuracy:::30.653104223065156


In [None]:
""" A): Hyper-Parameters for Polynomials::::

    Polynomials takes 2 hyper parameters: alpha and l1_ratio. Notably, both are float values
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html for more details
    
    Alpha (Continuous Variable, Ranges between -2 and 2, e.g., 0.1234
    l1_ratio (Continuous Variable, Ranges between 0 and 1, e.g., 0.97645
    
"""