In [1]:
import numpy as np
import scipy.stats as ss
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR # for building SVR model
import scipy
import catboost as cb
from LocalRegression.bmlr import BMLR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import ray
import pickle

In [2]:
def generate_Himmelblau(n=100, sigma=0.1, random_state=None):
    """Draw a noisy sample of Himmelblau's function on the square [-5, 5) x [-5, 5).

    Parameters
    ----------
    n : int
        Number of points to draw.
    sigma : float
        Standard deviation of the zero-mean Gaussian noise added to the
        function values.
    random_state : None, int or numpy.random.Generator, optional
        Seed (or generator) used for both the inputs and the noise. With the
        default ``None`` the draws come from SciPy's default random state,
        exactly as before this parameter existed.

    Returns
    -------
    x : ndarray of shape (n, 2)
        Uniformly distributed input points.
    y : ndarray of shape (n,)
        Himmelblau function values at ``x`` plus Gaussian noise.
    """
    # One shared generator so the inputs and the noise come from a single
    # reproducible stream (re-seeding each call separately would reuse the
    # same underlying random numbers for both).
    rng = None if random_state is None else np.random.default_rng(random_state)

    # uniform on [0, 1) -> centred -> scaled to a side length of 10
    x = (ss.uniform.rvs(size=(n, 2), random_state=rng) - 0.5) * 5 * 2
    y = (x[:, 0]**2 + x[:, 1] - 11)**2 + (x[:, 0] + x[:, 1]**2 - 7)**2
    y += ss.norm.rvs(loc=0, scale=sigma, size=n, random_state=rng)
    return x, y

In [18]:
def model_random_forest(x, y, x_test, y_test):
    """Fit a random forest with a cross-validated tree count and score it by RMSE.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets used for scoring.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test set.
    pred : ndarray
        Predictions for ``x_test`` from the best estimator found by the search.
    """
    param_grid = {'n_estimators': [10, 100, 1000, 2500, 3000]}
    rf = RandomForestRegressor()
    search = GridSearchCV(rf, param_grid, cv=3).fit(x, y)
    pred = search.predict(x_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_catboost(x, y, x_test, y_test):
    """Fit a CatBoost regressor with its default settings and score it by RMSE.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets used for scoring.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test set.
    pred : ndarray
        Predictions for ``x_test``.
    """
    train_dataset = cb.Pool(x, y)
    test_dataset = cb.Pool(x_test, y_test)
    model = cb.CatBoostRegressor(loss_function='RMSE', verbose=0)
    model.fit(train_dataset)
    pred = model.predict(test_dataset)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_mars():
    """Placeholder for a MARS baseline; nothing is implemented yet, so it returns None."""
    return None

def model_svr(x, y, x_test, y_test):
    """Fit an RBF-kernel SVR with cross-validated ``C``/``epsilon`` and score it by RMSE.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets used for scoring.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test set.
    pred : ndarray
        Predictions for ``x_test`` from the best estimator found by the search.
    """
    # The original grid listed C=1000 twice, which only repeated identical
    # fits. `degree` is not searched because it affects the polynomial kernel
    # only and is ignored by the RBF kernel used here.
    # NOTE(review): the gap between 1 and 100 suggests C=10 may have been
    # intended instead of the duplicate - confirm before extending the grid.
    param_grid = {
        'C': [0.1, 1, 100, 1000],
        'epsilon': [0.01, 0.1, 1, 10],
    }
    svr = SVR(kernel='rbf')
    search = GridSearchCV(svr, param_grid, cv=3, verbose=1).fit(x, y)
    pred = search.predict(x_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_bmlr(x, y, x_test, y_test, cut, M):
    """Fit a BMLR model with a cross-validated ``epsilon`` and score it by RMSE.

    The ``epsilon`` candidates are multiples of a data-driven trial value:
    three times the mean per-feature standard deviation of ``x`` divided by
    the square root of the number of training rows.

    Parameters
    ----------
    x, y : ndarray
        Training inputs (2-D, rows are samples) and targets.
    x_test, y_test : ndarray
        Held-out inputs and targets used for scoring.
    cut, M :
        Forwarded unchanged to the ``BMLR`` constructor.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test set.
    pred : ndarray
        Predictions for ``x_test`` from the best estimator found by the search.
    """
    epsilon_trial = (np.mean(np.std(x, axis=0)))/np.sqrt(x.shape[0])*3
    print(epsilon_trial)
    # The loop variable is deliberately not called `x`, so it cannot be
    # confused with the training-input parameter.
    multipliers = [0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0]
    param_grid = {'epsilon': [epsilon_trial*factor for factor in multipliers]}
    bmlr = BMLR(cut=cut, M=M, substitution_policy='nearest')
    search = GridSearchCV(bmlr, param_grid, cv=3, verbose=1)
    search.fit(x, y)
    print(search.best_params_)
    pred = search.predict(x_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

In [15]:
# Training sample: 500 points with noise level sigma = 1.
n, sigma = 500, 1
sample_x, sample_y = generate_Himmelblau(n=n, sigma=sigma)
# Held-out sample a quarter of the training size; sigma is not passed here,
# so the generator's default noise level applies.
sample_x_test, sample_y_test = generate_Himmelblau(n=n // 4)

In [19]:
# Single BMLR fit on the sample above; the cell displays the (score, predictions) tuple.
model_bmlr(
    x=sample_x,
    y=sample_y,
    x_test=sample_x_test,
    y_test=sample_y_test,
    cut=10,
    M=5,
)

0.3804882352299299
Fitting 3 folds for each of 8 candidates, totalling 24 fits
{'epsilon': 0.3804882352299299}


(22.572135594912037,
 array([  7.68723837,  40.72029226,  60.49102092, 115.40952455,
        248.38457981,  63.53551549, 179.12821708, 290.25595812,
        299.11330304, 176.10534394, 175.42144107, 158.77313817,
          8.86890255, 255.63978699,  33.30226569, 178.87391358,
         60.22757791, 215.30011712, 405.16167666,  37.84317381,
         48.9117917 , 250.68442294, 235.51474284, 243.95315844,
         35.38096207,  71.57425032,  83.43379626, 102.07464491,
        161.74606307,  78.55972464, 113.15970169, 101.50287309,
         96.73744481, 250.68442294,  26.01039913,  38.95693774,
         79.36573168, 215.879926  ,  96.10473554, 129.39475227,
        306.63341318,  98.59011361,  82.94429446,  41.02922335,
        312.00398903, 245.73956647,  14.04477738,  54.63684587,
         35.06641863, 310.21482073,  65.6530962 , 141.77757908,
        400.07577356,  81.42320471, 197.72256174,  12.41971447,
        147.16852564, 133.62471042,  67.32533654,  81.21852496,
         31.8575618

In [None]:
# Re-running this cell on a live kernel would otherwise raise because Ray is
# already initialised; ignoring the re-init keeps the cell idempotent.
ray.init(ignore_reinit_error=True)

In [6]:
@ray.remote
def run_experiment(ns=[100, 200, 500], mcloops=1, sigma=0.1):
    """Benchmark every model on fresh Himmelblau samples and collect the test scores.

    For each training size in ``ns`` the experiment is repeated ``mcloops``
    times. Each repetition draws a training sample with noise ``sigma`` and a
    test sample of half the size (drawn with the generator's default noise
    level), then fits the random forest, CatBoost, SVR and four BMLR variants
    (cut = 0, 5, 10, 20, all with M = 20).

    Returns a DataFrame with one row per repetition and the columns
    ``n, RF, CB, SVR, BMLR_cut0, BMLR_cut5, BMLR_cut10, BMLR_cut20``.
    """
    M = 20
    baselines = (model_random_forest, model_catboost, model_svr)
    bmlr_cuts = (0, 5, 10, 20)

    rows = []
    for n in ns:
        print(n)
        for _ in range(mcloops):
            train_x, train_y = generate_Himmelblau(n=n, sigma=sigma)
            test_x, test_y = generate_Himmelblau(n=int(n/2))
            data = (train_x, train_y, test_x, test_y)

            # Same evaluation order as the columns below; only the score of
            # each (score, predictions) pair is kept.
            row = [n]
            for fit_and_score in baselines:
                row.append(fit_and_score(*data)[0])
            for cut in bmlr_cuts:
                row.append(model_bmlr(*data, cut=cut, M=M)[0])
            rows.append(row)

    return pd.DataFrame(
        rows,
        columns=['n', 'RF', 'CB', 'SVR', 'BMLR_cut0', 'BMLR_cut5', 'BMLR_cut10', 'BMLR_cut20'],
    )

In [7]:
ncpu = 14   # remote tasks launched per batch
loops = 20  # batches per noise level

for sigma in [0.1, 1, 5, 10]:
    results = []
    for loop in range(loops):
        print(f'sigma={sigma} loop={loop}')
        # Launch one batch of identical remote experiments and block until
        # all of them have returned their score tables.
        futures = [run_experiment.remote(sigma=sigma) for _ in range(ncpu)]
        res = ray.get(futures)
        results.extend(res)
    # One pickle per noise level holding the list of per-task DataFrames.
    with open(f'comp_himmelbau_sigma={sigma}.pickle', "wb") as mypicklefile:
        pickle.dump(results, mypicklefile)

sigma=0.1 loop=0
[2m[36m(run_experiment pid=79096)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79093)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79097)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79098)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79100)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79092)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79099)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79104)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79105)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79094)[0m Fitting 3 folds for each of 8 candidates, t

KeyboardInterrupt: 