In [2]:
import scipy.stats as ss
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from loess import loess_1d
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.svm import SVR  # for building SVR model

import sys
sys.path.insert(0,'..')

from BMR.bmr import *
from pyearth import Earth
import pandas as pd

In [3]:
def gen_model(X, a, b, c, d, e):
    """Evaluate the noise-free second-order model on a two-column design matrix.

    The value returned for each row (x0, x1) of X is
    a*x0 + b*x1 + c*x0**2 + d*x1**2 + e*x0*x1.

    Parameters
    ----------
    X : array indexed as X[:, 0] and X[:, 1]
        The two input variables, one sample per row.
    a, b : scalars
        Coefficients of the linear terms in x0 and x1.
    c, d : scalars
        Coefficients of the squared terms in x0 and x1.
    e : scalar
        Coefficient of the x0*x1 interaction term.

    Returns
    -------
    1-D array with one model value per row of X.
    """
    x0 = X[:, 0]
    x1 = X[:, 1]
    # Terms are accumulated left to right in the same order as the closed-form
    # expression above so that floating-point results are unchanged.
    total = a * x0
    total = total + b * x1
    total = total + c * x0**2
    total = total + d * x1**2
    total = total + e * x0 * x1
    return total
def gen_data(n=100, a=1, b=1, c=0, d=0, e=0, eps=0):
    """Draw a random sample from the quadratic test model.

    Both input variables are drawn from a standard normal distribution and
    the response is `gen_model` evaluated on them. When `eps` is positive,
    zero-mean normal noise with standard deviation `eps` is added to the
    response; for `eps <= 0` the response is noise-free.

    Parameters
    ----------
    n : int
        Number of samples.
    a, b, c, d, e : scalars
        Model coefficients, forwarded to `gen_model`.
    eps : float
        Standard deviation of the additive noise.

    Returns
    -------
    X : array of shape (n, 2)
    y : array of shape (n, 1)
    """
    # No seed is set here: draws come from scipy's default random state.
    features = ss.norm().rvs(size=(n, 2))
    response = gen_model(X=features, a=a, b=b, c=c, d=d, e=e)
    if eps > 0:
        noise = ss.norm(loc=0, scale=eps).rvs(size=(n, ))
        response += noise
    # Return the response as a column vector.
    return features, response[:, np.newaxis]

def get_mars_params(x, y):
    """Select `max_terms` and `max_degree` for an `Earth` (MARS) model.

    Runs a successive-halving grid search (3-fold CV, halving factor 3,
    all available cores) on the data and returns the winning combination.

    Parameters
    ----------
    x : training inputs passed to the search's `fit`.
    y : training targets passed to the search's `fit`.

    Returns
    -------
    dict with keys "max_terms" and "max_degree".
    """
    search_space = {
        "max_terms": [1, 2, 3, 5, 10],
        "max_degree": [1, 2, 3, 4, 5],
    }
    search = HalvingGridSearchCV(Earth(), search_space, cv=3, factor=3, n_jobs=-1)
    search.fit(x, y)
    return search.best_params_

def get_svr_params(x, y):
    """Select `C`, `degree` and `epsilon` for an RBF-kernel SVR.

    Runs a successive-halving grid search (5-fold CV, halving factor 3,
    all available cores) on the data and returns the winning combination.

    Parameters
    ----------
    x : training inputs passed to the search's `fit`.
    y : training targets passed to the search's `fit`.

    Returns
    -------
    dict with keys "C", "degree" and "epsilon".
    """
    # NOTE(review): scikit-learn documents `degree` as used only by the 'poly'
    # kernel; with kernel="rbf" this axis presumably just multiplies the number
    # of candidates. Kept so the returned dict keeps its keys - confirm before
    # trimming.
    search_space = {
        "C": [0.1, 1, 10, 100, 300, 500, 750, 1000, 1500, 2000, 3000],
        "degree": [1, 2, 3, 4],
        "epsilon": [0.01, 0.1, 1, 10],
    }
    estimator = SVR(kernel="rbf")
    search = HalvingGridSearchCV(estimator, search_space, cv=5, factor=3, n_jobs=-1)
    search.fit(x, y)
    return search.best_params_

def get_bmr_params(x, y, M, degree):
    """Select `epsilon` and `min_n_pts` for a `BMR` model with fixed `M` and `degree`.

    A reference epsilon is derived from the data as
    3 * mean(per-column std of x) / sqrt(number of rows). The grid then
    scans multiples of that reference for `epsilon` and fractions of the
    sample size for `min_n_pts`, using a successive-halving search
    (3-fold CV, halving factor 3, all available cores).

    Parameters
    ----------
    x : array with samples along axis 0.
    y : training targets passed to the search's `fit`.
    M : value forwarded to `BMR(M=...)` and copied into the returned dict.
    degree : value forwarded to `BMR(degree=...)` during the search.

    Returns
    -------
    dict with keys "epsilon", "min_n_pts" and "M".
    """
    n_samples = x.shape[0]
    reference_epsilon = np.mean(np.std(x, axis=0)) / np.sqrt(n_samples) * 3

    epsilon_factors = [0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 3.0]
    sample_fractions = [0.01, 0.05, 0.1, 0.2, 0.3]
    search_space = {
        "epsilon": [reference_epsilon * factor for factor in epsilon_factors],
        "min_n_pts": [int(n_samples * fraction) for fraction in sample_fractions],
    }

    # The initial epsilon / min_n_pts are placeholders: both are in the grid.
    estimator = BMR(
        min_n_pts=n_samples / 10,
        M=M,
        substitution_policy="nearest",
        degree=degree,
        epsilon=reference_epsilon,
    )
    search = HalvingGridSearchCV(estimator, search_space, cv=3, factor=3, n_jobs=-1)
    search.fit(x, y)

    # NOTE(review): `degree` and `substitution_policy` were fixed during the
    # search but are not part of the returned dict, so `BMR(**params)` uses
    # BMR's own defaults for them - confirm that those defaults match.
    best = search.best_params_
    best['M'] = M
    return best

In [4]:
# Prediction locations: every integer pair (x, y) with x, y in -3..3,
# stored one point per row (49 rows, 2 columns).
grid_points = np.arange(-3, 4)
mesh_X, mesh_Y = np.meshgrid(grid_points, grid_points)
mesh_pts = np.vstack((mesh_X.ravel(), mesh_Y.ravel())).T

In [9]:
def run_experiment(n, a, b, c, d, e, eps, mcloops=100):
    """Monte Carlo comparison of LR, BMR, MARS and SVR on the quadratic test model.

    Steps:
      1. Draw one sample with `gen_data` and use it to tune the BMR, MARS and
         SVR hyper-parameters.
      2. Repeat `mcloops` times: draw a fresh sample, refit every method on it
         and predict at the module-level grid `mesh_pts`.
      3. For every grid point and method, compute the empirical 2.5% / 97.5%
         quantiles of the predictions, the interval length, and the mean
         squared error against the noise-free model value.
      4. Write everything to a CSV file in the current working directory.

    Parameters
    ----------
    n : int
        Sample size of every generated data set.
    a, b, c, d, e : scalars
        Coefficients of the generating model (see `gen_model`). All five are
        used both for the reference values and for the generated data.
    eps : float
        Noise standard deviation passed to `gen_data`.
    mcloops : int
        Number of Monte Carlo repetitions.

    Returns
    -------
    None. The result is the CSV file, with columns `x`, `y` and, for each
    method label, `<label>_CI_low`, `<label>_CI_up`, `<label>_CI_len`,
    `<label>_MSE`.
    """
    alpha = 0.05
    # The 'd' and 'e' fields carry no '=' unlike the others; the name is kept
    # as is so that output file names do not change.
    filename = f'CI_n={n}_a={a}_b={b}_c={c}_d{d}_e{e}_eps={eps}.csv'

    X_pred = mesh_pts
    y_true = gen_model(X_pred, a, b, c, d, e)

    # Generate one sample to set method parameters. `d` and `e` must be
    # forwarded: otherwise the data come from a model without the x1**2 and
    # interaction terms while `y_true` includes them.
    X, y = gen_data(n=n, eps=eps, a=a, b=b, c=c, d=d, e=e)
    bmr_params = get_bmr_params(X, y, M=20, degree=1)
    mars_params = get_mars_params(X, y[:, 0])
    svr_params = get_svr_params(X, y[:, 0])

    # init methods
    methods = [LinearRegression(), BMR(**bmr_params), Earth(**mars_params), SVR(**svr_params)]
    methods_labels = ['LR', 'BMR', 'MARS', 'SVR']

    # One list of prediction vectors (one per Monte Carlo loop) per method.
    results = {method_label: [] for method_label in methods_labels}

    for loop in range(mcloops):
        if loop % 10 == 0:
            print(f'Running loop {loop}/{mcloops} for {filename}')

        # run all methods on new data set drawn from the full model
        X, y = gen_data(n=n, eps=eps, a=a, b=b, c=c, d=d, e=e)
        for method_label, method in zip(methods_labels, methods):
            if method_label == 'SVR':
                # SVR is fitted on a 1-D target
                method.fit(X, y[:, 0])
            else:
                method.fit(X, y)
            pred = method.predict(X_pred)
            # Some estimators return a column vector; flatten to 1-D.
            if len(pred.shape) > 1:
                pred = pred[:, 0]
            results[method_label].append(pred)

    # collect the results and prepare the csv
    df0 = pd.DataFrame([mesh_pts[:, 0], mesh_pts[:, 1]]).transpose()
    df0.columns = ['x', 'y']
    dfs = [df0]
    for method_label in methods_labels:
        # rows: grid points, columns: Monte Carlo loops
        dat = np.array(results[method_label]).transpose()
        ci_low = np.quantile(dat, q=alpha/2, axis=1)
        ci_up = np.quantile(dat, q=1-alpha/2, axis=1)
        mse = np.mean((dat - y_true.reshape(-1, 1))**2, axis=1)
        df = pd.DataFrame([ci_low, ci_up, ci_up-ci_low, mse]).transpose()
        df.columns = [f'{method_label}_CI_low', f'{method_label}_CI_up', f'{method_label}_CI_len', f'{method_label}_MSE']
        dfs.append(df)
    df = pd.concat(dfs, axis=1)
    df.to_csv(filename, index_label=False)

In [None]:
# Noise levels, sample sizes and Monte Carlo repetitions of the study.
epss = [0.01, 0.1, 0.5]
ns = [100, 1000]
mcloops = 1000

# Model coefficients of each scenario, run in this order for every (n, eps).
coefficient_sets = [
    dict(a=0, b=0, c=0, d=0, e=0),
    dict(a=1, b=2, c=0, d=0, e=0),
    dict(a=1, b=2, c=-1, d=3, e=0),
    dict(a=1, b=2, c=0, d=0, e=0.2),
    dict(a=1, b=2, c=0, d=0, e=0.5),
]

for n in ns:
    for eps in epss:
        for coefficients in coefficient_sets:
            run_experiment(n=n, eps=eps, mcloops=mcloops, **coefficients)