In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import Model_Training_and_Evaluation_Flow
from FT_D_Pipeline import ML_Pipeline

  from .autonotebook import tqdm as notebook_tqdm


## 輸入資料、資料切割

In [2]:
# Load the diabetes dataset once and join the feature columns with the target column.
diabetes = load_diabetes(as_frame = True)
rawData = pd.concat([diabetes["data"], diabetes["target"]], axis = 1)
print(rawData.head())

# Fixed seed so the train/validation/test partition (and every metric computed
# from it further down) is identical after Restart Kernel -> Run All.
SEED = 42

# First hold out 20% as the test set, then take 25% of the remaining 80% as the
# validation set, giving roughly a 60/20/20 train/validation/test split.
trainData, testData = train_test_split(rawData, test_size = 0.2, shuffle = True, random_state = SEED)
trainData, valiData = train_test_split(trainData, test_size = 0.25, shuffle = True, random_state = SEED)

# Drop the shuffled original row labels so each split is indexed 0..n-1.
trainData, valiData, testData = [
    split.reset_index(drop = True) for split in (trainData, valiData, testData)
]

# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
trainData.info()
trainData.head()

        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     264 non-null    float64
 1   sex     264 non-null    float64
 2   bmi     264 non-null    float64
 3   bp

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.016281,-0.044642,0.073552,-0.041235,-0.004321,-0.013527,-0.013948,-0.001116,0.042897,0.044485,275.0
1,0.041708,0.05068,-0.043929,0.063187,-0.004321,0.016222,-0.013948,-0.002592,-0.034522,0.011349,71.0
2,0.034443,0.05068,0.111276,0.076958,-0.03184,-0.033881,-0.021311,-0.002592,0.02802,0.07348,336.0
3,-0.01278,-0.044642,-0.023451,-0.040099,-0.016704,0.004636,-0.017629,-0.002592,-0.03846,-0.038357,64.0
4,-0.096328,-0.044642,-0.036385,-0.074527,-0.03872,-0.027618,0.015505,-0.039493,-0.074093,-0.001078,200.0


## 特徵轉換、降維或核函數

In [3]:
# Build the feature pipeline over every column except the last one ("target"),
# fit it on the training split only, then apply it to all three splits.
# NOTE(review): this cell rebinds trainData / valiData / testData to their
# transformed versions, so re-running it without re-running the split cell
# feeds already-transformed frames back into the pipeline.
featureColumns = trainData.columns[:-1].tolist()
ml = ML_Pipeline(
    ml_methods = ["standardization", "Poly-Kernel", "PCA"],
    inputFeatures = featureColumns,
    target = "target",
)
ml.fit_Pipeline(fit_data = trainData)

trainData = ml.transform_Pipeline(transform_data = trainData)
valiData = ml.transform_Pipeline(transform_data = valiData)
testData = ml.transform_Pipeline(transform_data = testData)

In [4]:
# Summarise the transformed training frame. DataFrame.info() prints its report
# itself and returns None, so calling it bare avoids a stray "None" line.
trainData.info()
trainData.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 56 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age_degree_2  264 non-null    float64
 1   sex_degree_2  264 non-null    float64
 2   bmi_degree_2  264 non-null    float64
 3   bp_degree_2   264 non-null    float64
 4   s1_degree_2   264 non-null    float64
 5   s2_degree_2   264 non-null    float64
 6   s3_degree_2   264 non-null    float64
 7   s4_degree_2   264 non-null    float64
 8   s5_degree_2   264 non-null    float64
 9   s6_degree_2   264 non-null    float64
 10  age_sex       264 non-null    float64
 11  age_bmi       264 non-null    float64
 12  age_bp        264 non-null    float64
 13  age_s1        264 non-null    float64
 14  age_s2        264 non-null    float64
 15  age_s3        264 non-null    float64
 16  age_s4        264 non-null    float64
 17  age_s5        264 non-null    float64
 18  age_s6        264 non-null    

Unnamed: 0,age_degree_2,sex_degree_2,bmi_degree_2,bp_degree_2,s1_degree_2,s2_degree_2,s3_degree_2,s4_degree_2,s5_degree_2,s6_degree_2,...,s2_s4,s2_s5,s2_s6,s3_s4,s3_s5,s3_s6,s4_s5,s4_s6,s5_s6,target
0,-2.840267,0.140716,-0.33284,0.338019,-0.76773,0.518062,-0.068725,-1.783893,0.799309,-0.221183,...,-0.09576,0.038966,-0.103678,0.074349,0.047419,0.030409,0.000123,0.001777,-0.001915,275.0
1,-3.016556,0.84465,-0.271922,-0.509445,-1.08901,-0.752907,0.195446,0.129498,0.032075,-0.662637,...,0.030171,-0.013035,-0.029117,0.028234,0.027253,0.006948,-0.015389,-0.002768,0.001821,71.0
2,-1.572925,-4.800622,-1.154595,-3.997421,1.546803,2.673772,-0.128951,0.236734,3.88272,-3.185419,...,-0.162488,-0.020622,-0.074379,-0.046829,-0.09973,-0.018196,0.014802,-0.009437,-0.004594,336.0
3,-2.455592,-0.243547,-0.667047,-0.238399,-1.320306,0.124802,0.115952,0.53708,0.433337,0.28311,...,-0.019434,-0.022553,-0.057597,-0.010944,-0.00548,-0.027005,0.012819,0.003995,0.003047,64.0
4,1.215585,-2.122598,-2.514074,1.118674,1.054862,-3.956674,3.571153,1.506128,0.406094,-0.952415,...,-0.006095,0.137083,0.192046,-0.125676,0.026688,-0.043915,0.069717,0.002438,-0.006685,200.0


## 模型訓練

In [5]:
# Train and evaluate on the transformed splits, using every column except
# "target" as a model input.
modelInputFeatures = trainData.columns.drop("target").tolist()

totalResult = Model_Training_and_Evaluation_Flow.model_fit(
    trainData = trainData,
    valiData = valiData,
    testData = testData,
    input_features = modelInputFeatures,
    target_label = "target",
    target_type = "regression",
    main_metric = "mse",
    feature_selection_method = None,
    hyperparameter_tuning = "bayesopt",
    feature_importances = None,
    model_file_name = None,
)

[32m[I 2023-02-06 01:19:18,831][0m A new study created in memory with name: no-name-de89a25b-1394-4606-ab53-74e309fccca8[0m
[32m[I 2023-02-06 01:19:22,088][0m Trial 5 finished with value: 12677.342325873798 and parameters: {'n_estimators': 933, 'max_depth': 261, 'max_leaves': 170, 'max_bin': 83, 'learning_rate': 0.0007608134443329099, 'tree_method': 'hist', 'subsample': 0.23753049786240743, 'colsample_bytree': 0.21681546391426068, 'colsample_bylevel': 0.44177203861050784, 'colsample_bynode': 0.6161866534084841, 'reg_alpha': 0.30954471642498615, 'reg_lambda': 0.4530278043556554}. Best is trial 5 with value: 12677.342325873798.[0m
[32m[I 2023-02-06 01:19:25,028][0m Trial 0 finished with value: 5533.544119712405 and parameters: {'n_estimators': 1812, 'max_depth': 396, 'max_leaves': 258, 'max_bin': 90, 'learning_rate': 0.004312570719973793, 'tree_method': 'hist', 'subsample': 0.16323733598155804, 'colsample_bytree': 0.3102783100397322, 'colsample_bylevel': 0.12939279294179631, 'col

In [6]:
# Model evaluation results: display the first element of totalResult.
totalResult[0]

[{'Model': 'XGBoost',
  'Features': ['age_degree_2',
   'sex_degree_2',
   'bmi_degree_2',
   'bp_degree_2',
   's1_degree_2',
   's2_degree_2',
   's3_degree_2',
   's4_degree_2',
   's5_degree_2',
   's6_degree_2',
   'age_sex',
   'age_bmi',
   'age_bp',
   'age_s1',
   'age_s2',
   'age_s3',
   'age_s4',
   'age_s5',
   'age_s6',
   'sex_bmi',
   'sex_bp',
   'sex_s1',
   'sex_s2',
   'sex_s3',
   'sex_s4',
   'sex_s5',
   'sex_s6',
   'bmi_bp',
   'bmi_s1',
   'bmi_s2',
   'bmi_s3',
   'bmi_s4',
   'bmi_s5',
   'bmi_s6',
   'bp_s1',
   'bp_s2',
   'bp_s3',
   'bp_s4',
   'bp_s5',
   'bp_s6',
   's1_s2',
   's1_s3',
   's1_s4',
   's1_s5',
   's1_s6',
   's2_s3',
   's2_s4',
   's2_s5',
   's2_s6',
   's3_s4',
   's3_s5',
   's3_s6',
   's4_s5',
   's4_s6',
   's5_s6'],
  'Set': 'train',
  'Number_of_Data': 264,
  'MAE': 0.07091449968742602,
  'MSE': 0.010345175411974637,
  'RMSE': 0.1017112354264495,
  'R2': 0.9999982801644521},
 {'Model': 'XGBoost',
  'Features': ['age_degree_2',