In [16]:
import numpy as np
import scipy.stats as ss
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR # for building SVR model
import scipy
import catboost as cb
from LocalRegression.bmlr import BMLR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import ray

In [2]:
def generate_Himmelblau(n=100, sigma=0.1, random_state=None):
    """Draw a noisy sample of Himmelblau's function on the square [-5, 5]^2.

    Parameters
    ----------
    n : int
        Number of points to draw.
    sigma : float
        Standard deviation of the zero-mean Gaussian noise added to the
        function values.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState
        Source of randomness. ``None`` (the default) keeps the previous
        behaviour of drawing from NumPy's global random state. A generator
        instance is used as-is; any other value is treated as a seed and
        turned into one ``Generator`` shared by the inputs and the noise, so
        the two draws are reproducible but not identical streams.

    Returns
    -------
    x : ndarray of shape (n, 2)
        Inputs, uniform on [-5, 5) in each coordinate.
    y : ndarray of shape (n,)
        Himmelblau function values at ``x`` plus Gaussian noise.
    """
    if random_state is None or isinstance(
            random_state, (np.random.Generator, np.random.RandomState)):
        rng = random_state
    else:
        # Build the generator once: passing the same int seed to both rvs
        # calls below would restart the stream for the noise draw.
        rng = np.random.default_rng(random_state)

    # Uniform on [0, 1) shifted and scaled to [-5, 5).
    x = (ss.uniform.rvs(size=(n, 2), random_state=rng) - 0.5) * 5 * 2
    x0, x1 = x[:, 0], x[:, 1]
    y = (x0**2 + x1 - 11)**2 + (x0 + x1**2 - 7)**2
    y += ss.norm.rvs(loc=0, scale=sigma, size=n, random_state=rng)
    return x, y

In [10]:
def model_random_forest(x, y, x_test, y_test):
    """Tune a random forest on (x, y) and score it on a held-out set.

    The number of trees is selected by 3-fold grid-search cross-validation
    over a fixed list of candidates; predictions are made through the fitted
    search object.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets used only for scoring.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the held-out set.
    pred : ndarray
        Predictions for ``x_test``.
    """
    param_grid = {'n_estimators': [10, 100, 1000, 2500, 3000]}
    search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3).fit(x, y)
    pred = search.predict(x_test)
    # RMSE via sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and later removed. Arguments are (y_true, y_pred).
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_catboost(x, y, x_test, y_test):
    """Fit a CatBoost regressor with default settings and score it.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets used only for scoring.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the held-out set.
    pred : ndarray
        Predictions for ``x_test``.
    """
    train_dataset = cb.Pool(x, y)
    test_dataset = cb.Pool(x_test, y_test)
    # verbose=0 silences CatBoost's per-iteration training log.
    model = cb.CatBoostRegressor(loss_function='RMSE', verbose=0)
    model.fit(train_dataset)
    pred = model.predict(test_dataset)
    # RMSE via sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and later removed. Arguments are (y_true, y_pred).
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_mars():
    """Unimplemented placeholder (the name suggests a MARS model).

    Takes no arguments, does nothing and returns ``None``.
    """
    return None

def model_svr(x, y, x_test, y_test):
    """Tune an RBF-kernel SVR on (x, y) and score it on a held-out set.

    ``C`` and ``epsilon`` are selected by 3-fold grid-search cross-validation.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets used only for scoring.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the held-out set.
    pred : ndarray
        Predictions for ``x_test``.
    """
    # The previous C grid listed 1000 twice (and skipped 10), so a fifth of
    # the CV fits were duplicates; use the evenly log-spaced grid instead.
    # `degree` is not tuned: it only affects the 'poly' kernel and is ignored
    # by 'rbf'.
    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],
        'epsilon': [0.01, 0.1, 1, 10],
    }
    search = GridSearchCV(SVR(kernel='rbf'), param_grid, cv=3, verbose=1).fit(x, y)
    pred = search.predict(x_test)
    # RMSE via sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and later removed. Arguments are (y_true, y_pred).
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_bmlr(x, y, x_test, y_test):
    """Tune a BMLR model on (x, y) and score it on a held-out set.

    ``epsilon`` is selected by 3-fold grid-search cross-validation. The
    candidate values are multiples of a data-driven base value: one fifth of
    the mean per-feature standard deviation of ``x``.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets used only for scoring.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the held-out set.
    pred : ndarray
        Predictions for ``x_test``.
    """
    epsilon_trial = np.mean(np.std(x, axis=0)) / 5
    multipliers = [0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0]
    # Loop variable deliberately not called `x`, which is the training data.
    param_grid = {'epsilon': [epsilon_trial * factor for factor in multipliers]}
    # NOTE(review): meaning of cut / M / substitution_policy is defined by the
    # project's BMLR class, which is not visible here.
    bmlr = BMLR(cut=10, M=5, substitution_policy='nearest')
    search = GridSearchCV(bmlr, param_grid, cv=3, verbose=1).fit(x, y)
    pred = search.predict(x_test)
    # RMSE via sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and later removed. Arguments are (y_true, y_pred).
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

In [24]:
@ray.remote
def run_experiment(ns=(50, 100, 200, 500), mcloops=1):
    """Benchmark all models on fresh Himmelblau samples (Ray remote task).

    For every training size in ``ns``, and ``mcloops`` times per size, a
    training sample of ``n`` points and a test sample of ``n // 2`` points are
    generated, each model is tuned and fitted on the training sample, and its
    test RMSE is recorded.

    Parameters
    ----------
    ns : iterable of int
        Training-set sizes to evaluate.
    mcloops : int
        Number of independent repetitions per training size.

    Returns
    -------
    pandas.DataFrame
        One row per (size, repetition) with columns
        ``['n', 'RF', 'CB', 'SVR', 'BMLR']`` holding the test RMSE of each
        model.
    """
    scores = []
    for n in ns:
        for _ in range(mcloops):
            sample_x, sample_y = generate_Himmelblau(n=n)
            sample_x_test, sample_y_test = generate_Himmelblau(n=n // 2)
            data = (sample_x, sample_y, sample_x_test, sample_y_test)
            # Only the scores are kept; the predictions are discarded.
            bmlr_score, _ = model_bmlr(*data)
            rf_score, _ = model_random_forest(*data)
            cb_score, _ = model_catboost(*data)
            svr_score, _ = model_svr(*data)
            scores.append([n, rf_score, cb_score, svr_score, bmlr_score])
    return pd.DataFrame(scores, columns=['n', 'RF', 'CB', 'SVR', 'BMLR'])

In [23]:
# Launch `ncpu` remote experiments per round, for `loops` rounds, and gather
# every returned score table into one flat list.
ncpu = 3
loops = 2
results = []
for loop in range(loops):
    # ray.get blocks until all tasks of this round have finished.
    res = ray.get([run_experiment.remote() for task in range(ncpu)])
    results.extend(res)

[2m[36m(run_experiment pid=231088)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=231091)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=231087)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=231088)[0m Fitting 3 folds for each of 20 candidates, totalling 60 fits


RayTaskError(NameError): [36mray::run_experiment()[39m (pid=231088, ip=192.168.1.109)
  File "/tmp/ipykernel_230092/2186798105.py", line 11, in run_experiment
NameError: name 'scores' is not defined

[2m[36m(run_experiment pid=231091)[0m Fitting 3 folds for each of 20 candidates, totalling 60 fits
[2m[36m(run_experiment pid=231087)[0m Fitting 3 folds for each of 20 candidates, totalling 60 fits


2022-04-11 16:57:53,571	ERROR worker.py:83 -- Unhandled error (suppress with RAY_IGNORE_UNHANDLED_ERRORS=1): [36mray::run_experiment()[39m (pid=231091, ip=192.168.1.109)
  File "/tmp/ipykernel_230092/2186798105.py", line 11, in run_experiment
NameError: name 'scores' is not defined
2022-04-11 16:57:56,572	ERROR worker.py:83 -- Unhandled error (suppress with RAY_IGNORE_UNHANDLED_ERRORS=1): [36mray::run_experiment()[39m (pid=231087, ip=192.168.1.109)
  File "/tmp/ipykernel_230092/2186798105.py", line 11, in run_experiment
NameError: name 'scores' is not defined
