In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import Model_Training_and_Evaluation_Flow
from FT_D_Pipeline import ML_Pipeline

  from .autonotebook import tqdm as notebook_tqdm


## 輸入資料、資料切割

In [2]:
# Fixed seed so the train/validation/test split (and everything computed from it)
# is identical on every Restart & Run All.
RANDOM_SEED = 42

# Load the dataset once; with as_frame=True the "frame" entry is a single DataFrame
# holding the feature columns together with the "target" column.
rawData = load_diabetes(as_frame = True)["frame"]
print(rawData.head())

# First hold out 20% as the test set, then take 25% of the remainder as the
# validation set (i.e. roughly 60/20/20 overall).
trainValiData, testData = train_test_split(rawData, test_size = 0.2, shuffle = True, random_state = RANDOM_SEED)
trainData, valiData = train_test_split(trainValiData, test_size = 0.25, shuffle = True, random_state = RANDOM_SEED)

# Drop the shuffled original row labels so each split is indexed 0..n-1.
trainData, valiData, testData = [
    split.reset_index(drop = True) for split in (trainData, valiData, testData)
]

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
trainData.info()
trainData.head()

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     264 non-null    float64
 1   sex     264 non-null    float64
 2   bmi     264 non-null    float64
 3   bp

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.092564,-0.044642,0.036907,0.021872,-0.02496,-0.016658,0.000779,-0.039493,-0.022517,-0.021788,70.0
1,-0.103593,0.05068,-0.023451,-0.022885,-0.086878,-0.067701,-0.017629,-0.039493,-0.07814,-0.071494,71.0
2,0.096197,-0.044642,0.051996,0.079265,0.054845,0.036577,-0.076536,0.141322,0.098648,0.061054,230.0
3,0.009016,0.05068,-0.039618,0.028758,0.038334,0.073529,-0.072854,0.108111,0.015568,-0.046641,91.0
4,-0.02731,0.05068,-0.007284,-0.040099,-0.011201,-0.01384,0.059685,-0.039493,-0.082379,-0.02593,52.0


## 特徵轉換、降維或核函數

In [3]:
# Configure the project pipeline with the three requested steps.
# Feature columns are selected by name (every column except "target") instead of by
# position, so the cell no longer depends on "target" being the last column and
# agrees with how the training cell builds its feature list.
ml = ML_Pipeline(
    ml_methods = ["standardization", "Poly-Kernel", "PCA"],
    inputFeatures = trainData.drop(columns = ["target"]).columns.tolist(),
    target = "target",
)

# Fit on the training split only; validation and test are transformed with the
# pipeline fitted here.
ml.fit_Pipeline(fit_data = trainData)

# NOTE(review): this overwrites trainData/valiData/testData with their transformed
# versions, so running the cell a second time would feed already-transformed frames
# back into the pipeline. Re-run the data-split cell before re-running this one.
trainData, valiData, testData = [
    ml.transform_Pipeline(transform_data = split) for split in (trainData, valiData, testData)
]

In [4]:
# Summarise the transformed training frame. DataFrame.info() writes its report to
# stdout and returns None, so wrapping it in print() would add a stray "None" line.
trainData.info()
# Last expression of the cell: rendered as the cell's rich output.
trainData.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 56 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age_degree_2  264 non-null    float64
 1   sex_degree_2  264 non-null    float64
 2   bmi_degree_2  264 non-null    float64
 3   bp_degree_2   264 non-null    float64
 4   s1_degree_2   264 non-null    float64
 5   s2_degree_2   264 non-null    float64
 6   s3_degree_2   264 non-null    float64
 7   s4_degree_2   264 non-null    float64
 8   s5_degree_2   264 non-null    float64
 9   s6_degree_2   264 non-null    float64
 10  age_sex       264 non-null    float64
 11  age_bmi       264 non-null    float64
 12  age_bp        264 non-null    float64
 13  age_s1        264 non-null    float64
 14  age_s2        264 non-null    float64
 15  age_s3        264 non-null    float64
 16  age_s4        264 non-null    float64
 17  age_s5        264 non-null    float64
 18  age_s6        264 non-null    

Unnamed: 0,age_degree_2,sex_degree_2,bmi_degree_2,bp_degree_2,s1_degree_2,s2_degree_2,s3_degree_2,s4_degree_2,s5_degree_2,s6_degree_2,...,s2_s4,s2_s5,s2_s6,s3_s4,s3_s5,s3_s6,s4_s5,s4_s6,s5_s6,target
0,-2.769929,-0.599155,0.584967,-0.073418,0.126724,-1.338794,-0.397889,-0.470546,-0.772793,-0.106306,...,0.035971,0.048381,-0.009269,-0.083759,-0.046555,0.014838,0.034237,-0.007877,0.005868,70.0
1,4.90144,-4.361905,-4.188311,5.112012,-1.428653,0.552662,3.763177,1.746921,2.277947,-0.674398,...,0.050776,0.030395,0.011444,0.092089,0.046473,-0.07828,-0.051773,-0.002002,0.004739,71.0
2,12.75101,4.099561,-1.290797,7.631537,-6.999528,0.088017,-0.016464,1.351057,0.393095,5.792065,...,-0.092091,0.099588,0.039632,-0.147893,-0.066082,0.118019,-0.032968,-0.017209,-0.020292,230.0
3,1.365821,-1.484852,4.321167,-3.263265,-1.771982,-1.807987,-1.624449,0.101493,-0.540423,2.744129,...,-0.012172,0.018821,0.005807,0.106323,0.017258,0.004187,0.032771,-0.018571,0.005884,91.0
4,-0.182211,1.283725,0.666716,3.332083,-2.296442,-0.883901,-1.053245,1.248307,0.679593,0.871258,...,0.031936,0.019423,0.088028,0.080516,0.047643,-0.064905,0.002504,0.000913,-0.000673,52.0


## 模型訓練

In [5]:
# Run the project's training/evaluation flow on the three transformed splits.
# Every column except "target" is passed as an input feature; the task is declared
# as regression with MSE as the main metric.
# NOTE(review): "bayesopt" presumably selects Bayesian hyperparameter search inside
# model_fit — confirm against Model_Training_and_Evaluation_Flow.
totalResult = Model_Training_and_Evaluation_Flow.model_fit(
    trainData = trainData,
    valiData = valiData,
    testData = testData,
    input_features = trainData.columns.drop("target").tolist(),
    target_label = "target",
    target_type = "regression",
    main_metric = "mse",
    feature_selection_method = None,
    hyperparameter_tuning = "bayesopt",
    feature_importances = None,
    model_file_name = None,
)

[32m[I 2023-02-20 11:25:23,821][0m A new study created in memory with name: no-name-14cc331e-9350-48f9-886b-2c3e299fac0a[0m
[32m[I 2023-02-20 11:25:23,986][0m Trial 1 finished with value: 6042.279788211544 and parameters: {'num_leaves': 95, 'max_depth': 32, 'learning_rate': 0.05514389373480578, 'n_estimators': 107, 'min_split_gain': 0.21144474997378715, 'min_child_weight': 0.05768028694694168, 'min_child_samples': 92, 'subsample': 0.1954937020102615, 'colsample_bytree': 0.691397450137756, 'reg_alpha': 0.18186421865641264, 'reg_lambda': 0.41142544502726947}. Best is trial 1 with value: 6042.279788211544.[0m
[32m[I 2023-02-20 11:25:24,023][0m Trial 2 finished with value: 5928.964235815163 and parameters: {'num_leaves': 5, 'max_depth': 21, 'learning_rate': 0.07068359486168968, 'n_estimators': 155, 'min_split_gain': 0.8361167958159631, 'min_child_weight': 0.07096363185243974, 'min_child_samples': 87, 'subsample': 0.27300901682571244, 'colsample_bytree': 0.7844320890335267, 'reg_alp

In [14]:
# Model evaluation results.
# Displays one entry of the structure returned by model_fit. In the recorded run
# this entry is a dict with the keys Model, Features, Set, Number_of_Data, MAE, MSE,
# RMSE and R2 (here: 'LightGBM with ExtraTrees' on the 'train' set).
# NOTE(review): the meaning of the positional indices [0][3] is not visible in this
# notebook — confirm against Model_Training_and_Evaluation_Flow.model_fit.
# NOTE(review): the execution counter jumps to 14 here, so intermediate cells were
# run and removed; verify this cell still works after Restart Kernel -> Run All.
totalResult[0][3]

{'Model': 'LightGBM with ExtraTrees',
 'Features': ['age_degree_2',
  'sex_degree_2',
  'bmi_degree_2',
  'bp_degree_2',
  's1_degree_2',
  's2_degree_2',
  's3_degree_2',
  's4_degree_2',
  's5_degree_2',
  's6_degree_2',
  'age_sex',
  'age_bmi',
  'age_bp',
  'age_s1',
  'age_s2',
  'age_s3',
  'age_s4',
  'age_s5',
  'age_s6',
  'sex_bmi',
  'sex_bp',
  'sex_s1',
  'sex_s2',
  'sex_s3',
  'sex_s4',
  'sex_s5',
  'sex_s6',
  'bmi_bp',
  'bmi_s1',
  'bmi_s2',
  'bmi_s3',
  'bmi_s4',
  'bmi_s5',
  'bmi_s6',
  'bp_s1',
  'bp_s2',
  'bp_s3',
  'bp_s4',
  'bp_s5',
  'bp_s6',
  's1_s2',
  's1_s3',
  's1_s4',
  's1_s5',
  's1_s6',
  's2_s3',
  's2_s4',
  's2_s5',
  's2_s6',
  's3_s4',
  's3_s5',
  's3_s6',
  's4_s5',
  's4_s6',
  's5_s6'],
 'Set': 'train',
 'Number_of_Data': 264,
 'MAE': 19.99369348797126,
 'MSE': 590.3199250115812,
 'RMSE': 24.296500262621798,
 'R2': 0.9016171230378656}