In [1]:
#import module (library)

import pandas as pd
import pycaret
import numpy as np
import math
import matplotlib.pyplot as plt
from pycaret.regression import *

from pycaret.regression import load_model

import plotly.io as pio
pio.renderers.default = 'iframe_connected'

import time

from datetime import datetime

In [2]:
# import module (user defined function)

from py_module.load_data import load_data
from py_module.plot_data import plot_histogram
from py_module.regression import *
from py_module.pre_processing import *
from py_module.verify import *

In [3]:
# === import raw_data (from csv file) ===
# Loads the raw CSV and narrows it down to the inputs plus a single target column.

filename = "./Data_2021_10_14_v1 (N98923)/Data.csv" # csv file directory and name

# NOTE(review): the meaning of pp=1 is defined in py_module.load_data — confirm there.
raw_data = load_data(fn=filename, pp=1)


## === dataset pre-processing ===

# drop output data except for target output

parameter = "Llt" # target output parameter
processed_data = drop_output(raw_data, parameter)



# cut data
## - opt
# lo : lower bound value (default : -inf)
# hi : upper bound value (default : inf)

# Use the `parameter` variable rather than repeating the literal "Llt", so that
# changing the target above cannot leave this step cutting on a stale column.
processed_data = cut_data(processed_data, parameter=parameter, lo=0.1, hi=40)



# add feature
# Deriving new, physically meaningful parameters from the existing input parameters
# can improve model performance.
# e.g. a transformer's magnetizing inductance is proportional to the square of the
# number of turns, so adding a turns-squared parameter improves a magnetizing
# inductance regression model.

new_feature_names = []  # empty: no derived features are requested in this run

processed_data = add_feature(processed_data, parameter, new_feature_names = new_feature_names)



# Display the pre-processed frame as the cell output.
processed_data

Unnamed: 0,N1,N2,d1,d2,freq,move_tx,move_rx,offset_tx,offset_rx,per,space1,space2,space3,space4,l1,l2,h1,w1,Llt
0,8,8,6.0,9.7,49000.0,2,5,7,-9,1480,1,23,3,55,19,41,166,24,6.704324
1,8,8,11.7,10.7,49000.0,6,4,18,-5,1160,5,47,3,31,31,72,251,248,13.703235
2,8,8,9.0,5.4,49000.0,4,1,9,-18,4192,5,31,7,28,33,40,191,212,19.559244
3,8,8,9.5,4.2,49000.0,6,3,14,-7,3419,1,21,6,38,15,30,185,221,16.173505
4,11,11,7.5,10.3,58000.0,1,2,13,-17,4445,7,50,6,46,16,69,207,241,35.181363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97610,9,9,8.4,9.2,42000.0,7,2,8,-14,2483,9,59,9,35,30,71,235,105,14.449801
97611,5,5,11.4,8.9,25000.0,3,4,12,-6,1838,2,50,3,47,23,62,161,118,7.164551
97612,5,5,8.6,9.6,25000.0,2,2,17,-2,4913,1,21,3,35,29,51,110,80,3.697227
97613,5,5,11.0,3.6,25000.0,1,7,18,-12,4437,1,33,4,31,26,59,131,170,8.532143


In [4]:
# === compare algorithm ===
# Search for the best-performing regression algorithm by training every
# available algorithm (untuned) and comparing their metrics.

start_time_t = time.time()

# activate logger
# NOTE(review): presumably this first run initialises the PyCaret experiment so
# that models() below can be queried — confirm in py_module.regression.
model, data_seen, data_unseen = regression_basic(
    processed_data,
    parameter,
    algorithm="lightgbm",
    frac_ratio=0.9,
    save_en=False,
    save_model_name="model",
    new_feature_names=new_feature_names,
)


# variable
except_list = ["kr","svm"] # algorithm list to exclude from train
result = []

# eliminate algorithms in the exception list (single vectorised filter)
algorithm_list = models().index
algorithm_list = algorithm_list[~algorithm_list.isin(except_list)]


# train each algorithm and record its metrics together with the elapsed time
for al_name in algorithm_list:
    start_time = time.time()

    model, data_seen, data_unseen = regression_basic(
        processed_data, parameter, algorithm=al_name, new_feature_names=new_feature_names
    )
    R2, MAE, MSE, RMSE, MPE = verify_model(model, data_seen, data_unseen, parameter)

    end_time = time.time()
    timetime = end_time - start_time

    result.append([al_name, R2, MAE, MSE, RMSE, MPE, timetime])

end_time_t = time.time()
timetime_t = end_time_t - start_time_t
print(f'total time : {timetime_t}')


# compare model result (best R2 first)
(
    pd.DataFrame(
        result,
        columns=["algorithm","R2","MAE","MSE","RMSE","MPE(%)","time(s)"],
    )
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
)


Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.7227,1.1163,1.0566,0.9788,0.0743,0.0632
1,0.7299,1.1384,1.0669,0.9785,0.0715,0.0611
2,0.738,1.1633,1.0785,0.9788,0.073,0.0621
3,0.7279,1.1925,1.092,0.9766,0.0754,0.0633
4,0.7335,1.1579,1.0761,0.9778,0.0724,0.0615
5,0.7364,1.2226,1.1057,0.9764,0.0731,0.0619
6,0.7259,1.1502,1.0725,0.9783,0.0744,0.0623
7,0.7321,1.134,1.0649,0.9781,0.0745,0.0634
8,0.7266,1.2335,1.1106,0.9764,0.076,0.063
9,0.7114,1.1359,1.0658,0.9791,0.0722,0.0608


total time : 130.08616018295288


Unnamed: 0,algorithm,R2,MAE,MSE,RMSE,MPE(%),time(s)
0,et,0.9813814,0.426787,1.002576,1.001287,3.690037,12.056003
1,lightgbm,0.9811564,0.6788,1.014695,1.007321,5.89427,2.011079
2,rf,0.9706892,0.786602,1.578331,1.256316,6.676564,17.929073
3,gbr,0.9373347,1.222579,3.37441,1.836957,10.293032,19.092005
4,dt,0.9314065,0.822924,3.693629,1.921882,6.80443,2.128563
5,lar,0.8799973,1.865993,6.461922,2.542031,21.508629,1.19528
6,ridge,0.8799971,1.865992,6.461931,2.542033,21.508572,2.858722
7,br,0.8799968,1.865972,6.461948,2.542036,21.507323,1.271193
8,lr,0.8799959,1.866016,6.461994,2.542045,21.508973,3.621023
9,ard,0.8798298,1.869401,6.47094,2.543804,21.499951,1.310238


In [6]:
# compare algorithm (tuned case)
# Search for the best-performing regression algorithm by training every
# available algorithm.
# Each algorithm is tuned with automatic hyper-parameter tuning before it is scored.

# activate logger
# NOTE(review): presumably this first run initialises the PyCaret experiment so
# that models() below can be queried — confirm in py_module.regression.
[model, data_seen, data_unseen] = regression_basic(processed_data, parameter, algorithm="lightgbm", frac_ratio=0.9, save_en=False, save_model_name="model", new_feature_names=new_feature_names)


# variable
algorithm_list = models().index
except_list = ["kr","svm","huber"] # algorithm list to exclude from train
result = []


# eliminate algorithms in the exception list
for al_name in except_list :

    algorithm_list = algorithm_list[algorithm_list!=al_name]


# train, tune and score each algorithm
for al_name in algorithm_list :

    start_time = time.time()

    [model, data_seen, data_unseen] = regression_basic(processed_data, parameter, algorithm=al_name, new_feature_names=new_feature_names)
    print(f'{al_name}')

    try :
        tuned_model = tune_model(model, n_iter=100, optimize="MAE", early_stopping=False, choose_better=True, verbose=True, search_library="tune-sklearn", search_algorithm="hyperopt")
        [R2, MAE, MSE, RMSE, MPE] = verify_model(tuned_model, data_seen, data_unseen, parameter)
    except Exception as err :
        # Best-effort: one algorithm failing to tune must not abort the whole sweep.
        # Catch Exception (not a bare except) so KeyboardInterrupt / SystemExit still
        # stop this long-running cell, and report the cause instead of hiding it.
        print(f'error: {al_name} ({type(err).__name__}: {err})')
        continue

    end_time= time.time()
    timetime = end_time - start_time
    result.append([al_name, R2, MAE, MSE, RMSE, MPE, timetime])


# compare model result (best R2 first); algorithms that failed are absent
pd.DataFrame(result,columns = ["algorithm","R2","MAE","MSE","RMSE","MPE(%)","time(s)"]).sort_values(by='R2' ,ascending=False).reset_index(drop=True)


Trial _Trainable_f8fa40a4: Error processing event.


RayTaskError(ValueError): [36mray::_Trainable.train_buffered()[39m (pid=32864, ip=127.0.0.1, repr=<tune_sklearn._trainable._Trainable object at 0x00000249B10BFEB0>)
  File "python\ray\_raylet.pyx", line 625, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 629, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 578, in ray._raylet.execute_task.function_executor
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\ray\_private\function_manager.py", line 609, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\ray\util\tracing\tracing_helper.py", line 451, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\ray\tune\trainable.py", line 255, in train_buffered
    result = self.train()
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\ray\util\tracing\tracing_helper.py", line 451, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\ray\tune\trainable.py", line 314, in train
    result = self.step()
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\ray\util\tracing\tracing_helper.py", line 451, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\tune_sklearn\_trainable.py", line 106, in step
    return self._train()
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\ray\util\tracing\tracing_helper.py", line 451, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\tune_sklearn\_trainable.py", line 237, in _train
    scores = cross_validate(
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\model_selection\_validation.py", line 242, in cross_validate
    scores = parallel(
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\joblib\parallel.py", line 1044, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\joblib\_parallel_backends.py", line 572, in __init__
    self.results = batch()
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\joblib\parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\model_selection\_validation.py", line 560, in _fit_and_score
    test_scores = _score(estimator, X_test, y_test, scorer)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\model_selection\_validation.py", line 607, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\metrics\_scorer.py", line 212, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\metrics\_regression.py", line 178, in mean_absolute_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\metrics\_regression.py", line 86, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
    return f(**kwargs)
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\utils\validation.py", line 644, in check_array
    _assert_all_finite(array,
  File "C:\Users\user\anaconda3\envs\NEC_stable\lib\site-packages\sklearn\utils\validation.py", line 96, in _assert_all_finite
    raise ValueError(
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').