In [11]:
import numpy as np
import scipy.stats as ss
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR # for building SVR model
import scipy
import catboost as cb
from LocalRegression.bmlr import BMLR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import ray
import pickle

In [2]:
def generate_Himmelblau(n=100, sigma=0.1, random_state=None):
    """Sample Himmelblau's function on [-5, 5]^2 and rescale it to the unit box.

    Parameters
    ----------
    n : int
        Number of points to draw.
    sigma : float
        Standard deviation of the Gaussian noise added to the *scaled* targets.
        No noise is added when ``sigma <= 0``.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState
        Source of randomness. ``None`` (default) keeps the previous behaviour of
        drawing from NumPy's global random state. An int (or any other seed
        accepted by ``numpy.random.default_rng``) makes the sample reproducible.

    Returns
    -------
    x : ndarray of shape (n, 2)
        Inputs rescaled from [-5, 5] to [0, 1] in both coordinates.
    y : ndarray of shape (n,)
        Function values divided by 890 (the value at the corner (5, 5)), so the
        noise-free targets lie in [0, 1]; added noise may push values outside.
    """
    # Build ONE generator from a seed so that inputs and noise come from the same
    # stream; passing the raw seed to each rvs() call would restart the stream.
    if random_state is None or isinstance(
            random_state, (np.random.Generator, np.random.RandomState)):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    x = (ss.uniform.rvs(size=(n, 2), random_state=rng) - 0.5) * 5 * 2
    y = (x[:, 0]**2 + x[:, 1] - 11)**2 + (x[:, 0] + x[:, 1]**2 - 7)**2
    # scale so that all x points are in [0,1]x[0,1] and all noise-free y points are in [0,1]
    x = (x + 5) / 10
    y = y / 890
    if sigma > 0:
        # the noise distribution is only constructed when it is actually needed
        y += ss.norm.rvs(loc=0, scale=sigma, size=n, random_state=rng)
    return x, y

In [52]:
def model_random_forest_params(x, y):
    """Select the random-forest size by 3-fold cross-validated grid search.

    Parameters
    ----------
    x, y : array-like
        Sample used for tuning.

    Returns
    -------
    dict
        ``best_params_`` of the search, i.e. the winning ``n_estimators``.
    """
    # a larger grid used previously: [10, 100, 1000, 2500, 3000]
    candidate_grid = dict(n_estimators=[10, 100])
    search = GridSearchCV(RandomForestRegressor(), candidate_grid, cv=3)
    search.fit(x, y)
    return search.best_params_

def model_random_forest(x, y, x_test, y_test, params):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets.
    params : dict
        Keyword arguments forwarded to ``RandomForestRegressor``.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test split.
    pred : ndarray
        Predictions for ``x_test``.
    """
    model = RandomForestRegressor(**params)
    model.fit(x, y)
    pred = model.predict(x_test)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated/removed in newer
    # scikit-learn releases; arguments are passed in the documented
    # (y_true, y_pred) order.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_catboost(x, y, x_test, y_test):
    """Fit a CatBoost regressor (RMSE loss, silent) and score it on the test split.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets.

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test split.
    pred : ndarray
        Predictions for ``x_test``.
    """
    train_dataset = cb.Pool(x, y)
    test_dataset = cb.Pool(x_test, y_test)
    model = cb.CatBoostRegressor(loss_function='RMSE', verbose=0)
    model.fit(train_dataset)
    pred = model.predict(test_dataset)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated/removed in newer
    # scikit-learn releases; arguments are passed in the documented
    # (y_true, y_pred) order.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_mars():
    """Placeholder for a MARS baseline (presumably multivariate adaptive
    regression splines - TODO confirm); not implemented, returns ``None``."""
    return None

def model_svr_params(x, y):
    """Tune an RBF-kernel SVR by 3-fold cross-validated grid search.

    Parameters
    ----------
    x, y : array-like
        Sample used for tuning.

    Returns
    -------
    dict
        ``best_params_`` of the search, with keys ``'C'``, ``'degree'`` and
        ``'epsilon'``.
    """
    param_grid = {
        # The original grid listed 1000 twice, which only repeated identical fits.
        # NOTE(review): 10 is missing between 1 and 100 - possibly the intended
        # fifth value; not added here so the search space stays unchanged.
        'C': [0.1, 1, 100, 1000],
        # kept so the returned dict has the same keys; `degree` only affects the
        # polynomial kernel and is ignored for kernel='rbf'
        'degree': [2],
        'epsilon': [0.01, 0.1, 1, 10],
    }
    svr = SVR(kernel='rbf')
    gsh = GridSearchCV(svr, param_grid, cv=3, verbose=1).fit(x, y)
    return gsh.best_params_

def model_svr(x, y, x_test, y_test, params):
    """Fit an RBF-kernel SVR on the training split and score it on the test split.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets.
    params : dict
        Extra keyword arguments forwarded to ``SVR`` (the kernel is fixed to 'rbf').

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test split.
    pred : ndarray
        Predictions for ``x_test``.
    """
    model = SVR(kernel='rbf', **params)
    model.fit(x, y)
    pred = model.predict(x_test)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated/removed in newer
    # scikit-learn releases; arguments are passed in the documented
    # (y_true, y_pred) order.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

def model_bmlr_params(x, y, cut, M, include_y):
    """Grid-search the BMLR ``epsilon`` with 3-fold cross-validation.

    The candidate values are multiples of a data-driven trial value:
    3 * (mean per-column standard deviation of ``x``) / sqrt(number of rows).

    Parameters
    ----------
    x, y : array-like
        Sample used for tuning; ``x`` must expose ``.shape``.
    cut, M, include_y
        Forwarded unchanged to the ``BMLR`` constructor.

    Returns
    -------
    dict
        ``best_params_`` of the search (also printed).
    """
    n_rows = x.shape[0]
    epsilon_trial = (np.mean(np.std(x, axis=0)))/np.sqrt(n_rows)*3
    # a reduced grid of multipliers [0.5, 0.8] was used for quick runs
    multipliers = [0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 3.0]
    param_grid = {'epsilon': [epsilon_trial*factor for factor in multipliers]}

    estimator = BMLR(cut=cut, M=M, substitution_policy='nearest', include_y=include_y)
    search = GridSearchCV(estimator, param_grid, cv=3, verbose=1)
    search.fit(x, y)

    best = search.best_params_
    print(best)
    return best

def model_bmlr(x, y, x_test, y_test, cut, M, include_y, params):
    """Fit a BMLR model on the training split and score it on the test split.

    Parameters
    ----------
    x, y : array-like
        Training inputs and targets.
    x_test, y_test : array-like
        Held-out inputs and targets.
    cut, M, include_y
        Forwarded unchanged to the ``BMLR`` constructor.
    params : dict
        Extra keyword arguments forwarded to ``BMLR`` (e.g. the tuned ``epsilon``).

    Returns
    -------
    score : float
        Root-mean-squared error of the predictions on the test split.
    pred
        Predictions returned by ``model.predict(x_test)``.
    """
    model = BMLR(cut=cut, M=M, substitution_policy='nearest', include_y=include_y, **params)
    model.fit(x, y)
    pred = model.predict(x_test)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated/removed in newer
    # scikit-learn releases; arguments are passed in the documented
    # (y_true, y_pred) order.
    score = np.sqrt(mean_squared_error(y_test, pred))
    return score, pred

In [None]:
# Start (or attach to) the local Ray runtime. `ignore_reinit_error=True` makes the
# cell safe to re-run in a live kernel instead of raising on the second call.
ray.init(ignore_reinit_error=True)

In [55]:
# NOTE: with the decorator below commented out this is a plain function. Code that
# calls `run_experiment.remote(...)` needs the decorator re-enabled, or has to wrap
# the function itself with `ray.remote(run_experiment)`.
#@ray.remote
def run_experiment(ns=(100, 200, 500), mcloops=10, sigma=0.01):
    """Monte-Carlo comparison of RF, CatBoost, SVR and BMLR on noisy Himmelblau data.

    For every sample size in ``ns`` the hyper-parameters of each method are tuned
    once, on a single fresh sample whose size equals the training-split size.
    Then ``mcloops`` independent data sets of size ``n`` are generated, split into
    train/test, and every model is fitted and scored (RMSE on the test split).

    Parameters
    ----------
    ns : iterable of int
        Total sample sizes (train + test) to evaluate.
    mcloops : int
        Number of Monte-Carlo repetitions per sample size.
    sigma : float
        Noise level passed to ``generate_Himmelblau``.

    Returns
    -------
    pandas.DataFrame
        One row per (n, repetition) with columns ``n``, ``RF``, ``CB``, ``SVR``,
        ``BMLR_cut{0,10,20,30}`` and ``BMLR_cut{0,10,20,30}_y``.
    """
    # fraction of each generated data set held out for testing
    test_ratio = 0.3
    M = 25

    # (cut, include_y) for every BMLR variant, in the order of the output columns
    bmlr_variants = [(cut, include_y)
                     for include_y in (False, True)
                     for cut in (0, 10, 20, 30)]
    columns = ['n', 'RF', 'CB', 'SVR'] + [
        f'BMLR_cut{cut}_y' if include_y else f'BMLR_cut{cut}'
        for cut, include_y in bmlr_variants
    ]

    scores = []
    for n in ns:
        # find optimal params for each method
        # to speed up parameters are established on one sample only
        tune_x, tune_y = generate_Himmelblau(n=int(n*(1-test_ratio)), sigma=sigma)
        rf_params = model_random_forest_params(tune_x, tune_y)
        svr_params = model_svr_params(tune_x, tune_y)
        bmlr_params = [
            model_bmlr_params(tune_x, tune_y, cut=cut, M=M, include_y=include_y)
            for cut, include_y in bmlr_variants
        ]

        for mcloop in range(mcloops):
            sample_x, sample_y = generate_Himmelblau(n=n, sigma=sigma)
            x_train, x_test, y_train, y_test = train_test_split(
                sample_x, sample_y, test_size=test_ratio)

            # models are fitted in the same order as before: RF, SVR, BMLR variants, CatBoost
            rf_score = model_random_forest(x_train, y_train, x_test, y_test, rf_params)[0]
            svr_score = model_svr(x_train, y_train, x_test, y_test, svr_params)[0]
            bmlr_scores = [
                model_bmlr(x_train, y_train, x_test, y_test,
                           cut=cut, M=M, include_y=include_y, params=params)[0]
                for (cut, include_y), params in zip(bmlr_variants, bmlr_params)
            ]
            cb_score = model_catboost(x_train, y_train, x_test, y_test)[0]

            scores.append([n, rf_score, cb_score, svr_score, *bmlr_scores])

    return pd.DataFrame(scores, columns=columns)

In [7]:
# Driver: for every noise level, launch `ncpu` parallel replications of
# run_experiment through Ray and pickle the list of resulting DataFrames.
ncpu = 14
loops = 1

# `run_experiment` is a plain function unless its @ray.remote decorator is enabled;
# wrap it here when needed so that `.remote(...)` works on a fresh kernel either way.
run_experiment_task = (
    run_experiment if hasattr(run_experiment, 'remote') else ray.remote(run_experiment)
)

for sigma in [0, 0.01, 0.1, 0.2]:
    results = []
    for loop in range(loops):
        print(f'sigma={sigma} loop={loop}')
        # ray.get blocks until all ncpu tasks of this batch have finished
        batch = ray.get([run_experiment_task.remote(sigma=sigma) for _ in range(ncpu)])
        results.extend(batch)
    with open(f'comp_himmelbau_sigma={sigma}.pickle', "wb") as mypicklefile:
        pickle.dump(results, mypicklefile)

sigma=0.1 loop=0
[2m[36m(run_experiment pid=79096)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79093)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79097)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79098)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79100)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79092)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79099)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79104)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79105)[0m Fitting 3 folds for each of 8 candidates, totalling 24 fits
[2m[36m(run_experiment pid=79094)[0m Fitting 3 folds for each of 8 candidates, t

KeyboardInterrupt: 