In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes, load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from Model_Training_and_Evaluation_Flow import modelTrainingFlow
from FT_D_Pipeline import ML_Pipeline
from PermutationImportance import permutation_importance

# 實作範例 - Breast Cancer

In [2]:
rawData = load_breast_cancer(as_frame = True)["data"]
rawData = pd.concat([rawData, load_breast_cancer(as_frame = True)["target"]], axis = 1)
rawData = rawData.rename(
    columns = {
        i: i.replace(" ", "_") for i in rawData.columns
    }
)

In [3]:
trainData, testData = train_test_split(rawData, test_size = 0.2, shuffle = True) 
trainData, valiData = train_test_split(trainData, test_size = 0.25, shuffle = True) 
trainData, valiData, testData = trainData.reset_index(drop = True), valiData.reset_index(drop = True), testData.reset_index(drop = True) 

In [4]:
totalResult = modelTrainingFlow(trainData = trainData,
                                                 valiData = valiData,
                                                 testData = testData,
                                                inputFeatures = trainData.drop(columns = ["target"]).columns.tolist(), 
                                                target = "target", 
                                                targetType = "classification",
                                                mainMetric = "accuracy", 
                                                featureSelection = None, 
                                                featureImportance = None,
                                                modelFileName = "BreastCancer")
result = totalResult.fit(permutationImportance = True)

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

In [5]:
pd.DataFrame(result[1])

Unnamed: 0,Feature,Mean-Importance,Std-Importance,p-value for t-test,originalMetric,Metric_for_Each_Data,Importance_for_Each_Data
0,mean_radius,0.004106,0.002346,0.024896,0.958944,"[0.9648093841642229, 0.9648093841642229, 0.964...","[0.0058651026392961825, 0.0058651026392961825,..."
1,mean_texture,-0.003519,0.002195,0.032678,0.958944,"[0.9530791788856305, 0.9560117302052786, 0.953...","[-0.0058651026392961825, -0.002932551319648091..."
2,mean_perimeter,0.004692,0.002346,0.016130,0.958944,"[0.967741935483871, 0.9618768328445748, 0.9618...","[0.008797653958944274, 0.0029325513196480912, ..."
3,mean_area,0.001173,0.003978,0.587050,0.958944,"[0.9618768328445748, 0.9589442815249267, 0.961...","[0.0029325513196480912, 0.0, 0.002932551319648..."
4,mean_smoothness,-0.004106,0.001437,0.004636,0.958944,"[0.9560117302052786, 0.9560117302052786, 0.953...","[-0.0029325513196480912, -0.002932551319648091..."
...,...,...,...,...,...,...,...
625,worst_compactness,0.000000,0.000000,,0.973684,"[0.9736842105263158, 0.9736842105263158, 0.973...","[0.0, 0.0, 0.0, 0.0, 0.0]"
626,worst_concavity,0.005263,0.004297,0.070484,0.973684,"[0.9736842105263158, 0.9824561403508771, 0.982...","[0.0, 0.00877192982456132, 0.00877192982456132..."
627,worst_concave_points,0.003509,0.004297,0.177808,0.973684,"[0.9736842105263158, 0.9736842105263158, 0.982...","[0.0, 0.0, 0.00877192982456132, 0.008771929824..."
628,worst_symmetry,0.010526,0.006564,0.032678,0.973684,"[0.9912280701754386, 0.9912280701754386, 0.982...","[0.01754385964912275, 0.01754385964912275, 0.0..."


# 實作範例 - Iris Datasets

In [2]:
rawData = load_iris(as_frame = True)["data"]
rawData = pd.concat([rawData, load_iris(as_frame = True)["target"]], axis = 1)
trainData, testData = train_test_split(rawData, test_size = 0.2, shuffle = True) 
trainData, valiData = train_test_split(trainData, test_size = 0.25, shuffle = True) 
trainData, valiData, testData = trainData.reset_index(drop = True), valiData.reset_index(drop = True), testData.reset_index(drop = True) 

In [3]:
ml = ML_Pipeline(ml_methods = ["SMOTE", "standardization", "PCA"], inputFeatures = trainData.columns.tolist()[:-1], target = "target")
ml.fit_Pipeline(fit_data = trainData)
trainData, valiData, testData = [
    ml.transform_Pipeline(transform_data = j, mode = i) for i, j in zip(["train", "vali", "test"], [trainData, valiData, testData])
] 

In [4]:
totalResult = modelTrainingFlow(trainData = trainData,
                                                 valiData = valiData,
                                                 testData = testData,
                                                inputFeatures = trainData.drop(columns = ["target"]).columns.tolist(), 
                                                target = "target", 
                                                targetType = "classification",
                                                mainMetric = "accuracy", 
                                                featureSelection = None, 
                                                featureImportance = None,
                                                modelFileName = None)
result = totalResult.fit()


In [5]:
pd.DataFrame(result)

Unnamed: 0,Model,Features,Set,Number_of_Data,0_F1-Score_for_1,0_F1-Score_for_0,0_Macro F1-Score,0_Micro F1-Score,0_prc_auc_1,0_prc_auc_0,...,2_Recall_for_0,2_Macro Recall,2_Micro Recall,2_Accuracy,2_ROC-AUC,2_fpr,2_tpr,2_True_value,2_Predict_value,2_Predict_prob_value
0,Random Forest with Entropy,"[sepal length (cm), sepal width (cm), petal le...",train,"{0: 32, 2: 32, 1: 32}",1.0,1.0,1.0,1.0,1.0,0.426129,...,0.5,0.75,0.666667,0.666667,0.963867,"[0.0, 0.015625, 0.0625, 0.09375, 0.109375, 0.1...","[0.0, 0.625, 0.84375, 0.875, 0.9375, 0.9375, 0...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, ...","[0.1665374977395472, 0.4257362140478936, 0.166..."
1,Random Forest with Gini,"[sepal length (cm), sepal width (cm), petal le...",train,"{0: 32, 2: 32, 1: 32}",0.5,0.0,0.25,0.333333,0.666667,0.833333,...,1.0,0.5,0.666667,0.666667,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.333629156312859, 0.333629156312859, 0.33362..."
2,ExtraTree with Entropy,"[sepal length (cm), sepal width (cm), petal le...",train,"{0: 32, 2: 32, 1: 32}",0.5,0.0,0.25,0.333333,0.666667,0.833333,...,1.0,0.5,0.666667,0.666667,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3333333333333333, 0.3333333333333333, 0.333..."
3,ExtraTree with Gini,"[sepal length (cm), sepal width (cm), petal le...",train,"{0: 32, 2: 32, 1: 32}",0.5,0.0,0.25,0.333333,0.666667,0.833333,...,1.0,0.5,0.666667,0.666667,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3333333333333333, 0.3333333333333333, 0.333..."
4,XGBoost,"[sepal length (cm), sepal width (cm), petal le...",train,"{0: 32, 2: 32, 1: 32}",1.0,1.0,1.0,1.0,1.0,0.450658,...,1.0,0.984375,0.989583,0.989583,0.999512,"[0.0, 0.0, 0.0, 0.015625, 0.015625, 1.0]","[0.0, 0.03125, 0.96875, 0.96875, 1.0, 1.0]","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0.0060221911408007145, 0.9949139356613159, 0...."
5,LightGBM,"[sepal length (cm), sepal width (cm), petal le...",train,"{0: 32, 2: 32, 1: 32}",1.0,1.0,1.0,1.0,1.0,0.450652,...,0.984375,0.976562,0.979167,0.979167,0.996582,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.109...","[0.0, 0.0625, 0.125, 0.3125, 0.4375, 0.75, 0.8...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0.0004927959954134865, 0.9913617742133328, 0...."
6,LightGBM with ExtraTrees,"[sepal length (cm), sepal width (cm), petal le...",train,"{0: 32, 2: 32, 1: 32}",1.0,1.0,1.0,1.0,1.0,0.450658,...,0.96875,0.953125,0.958333,0.958333,0.996094,"[0.0, 0.0, 0.0, 0.03125, 0.03125, 0.09375, 0.0...","[0.0, 0.03125, 0.9375, 0.9375, 0.96875, 0.9687...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0.0033421354816008655, 0.993394055091279, 0.0..."
7,Random Forest with Entropy,"[sepal length (cm), sepal width (cm), petal le...",vali,"{0: 12, 2: 10, 1: 8}",1.0,1.0,1.0,1.0,1.0,0.366667,...,0.6,0.8,0.733333,0.733333,1.0,"[0.0, 0.0, 0.4, 1.0]","[0.0, 1.0, 1.0, 1.0]","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...","[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, ...","[0.16653749773954724, 0.16653749773954724, 0.4..."
8,Random Forest with Gini,"[sepal length (cm), sepal width (cm), petal le...",vali,"{0: 12, 2: 10, 1: 8}",0.571429,0.0,0.285714,0.4,0.7,0.8,...,1.0,0.5,0.666667,0.666667,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.333629156312859, 0.333629156312859, 0.33362..."
9,ExtraTree with Entropy,"[sepal length (cm), sepal width (cm), petal le...",vali,"{0: 12, 2: 10, 1: 8}",0.571429,0.0,0.285714,0.4,0.7,0.8,...,1.0,0.5,0.666667,0.666667,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3333333333333333, 0.3333333333333333, 0.333..."


# 實作範例 - Diabete Datasets

In [2]:
rawData = load_diabetes(as_frame = True)["data"]
rawData = pd.concat([rawData, load_diabetes(as_frame = True)["target"]], axis = 1)
trainData, testData = train_test_split(rawData, test_size = 0.2, shuffle = True) 
trainData, valiData = train_test_split(trainData, test_size = 0.25, shuffle = True) 
trainData, valiData, testData = trainData.reset_index(drop = True), valiData.reset_index(drop = True), testData.reset_index(drop = True) 

## 特徵轉換、降維或核函數

In [3]:
ml = ML_Pipeline(ml_methods = ["standardization", "PCA"], inputFeatures = trainData.columns.tolist()[:-1], target = "target")
ml.fit_Pipeline(fit_data = trainData)
trainData, valiData, testData = [
    ml.transform_Pipeline(transform_data = j, mode = i) for i, j in zip(["train", "vali", "test"], [trainData, valiData, testData])
] 

## 模型訓練

In [4]:
totalResult = modelTrainingFlow(trainData = trainData,
                                                 valiData = valiData,
                                                 testData = testData,
                                                inputFeatures = trainData.drop(columns = ["target"]).columns.tolist(), 
                                                target = "target", 
                                                targetType = "regression",
                                                mainMetric = "mse", 
                                                featureSelection = None, 
                                                featureImportance = None,
                                                modelFileName = None)
totalResult.fit()


[{'Model': 'Random Forest with squared_error',
  'Features': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'],
  'Set': 'train',
  'Number_of_Data': 264,
  'MAE': 49.18586289161202,
  'MSE': 3373.817728613522,
  'RMSE': 58.08457392986129,
  'R2': 0.43314810540735227},
 {'Model': 'Random Forest with absolute_error',
  'Features': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'],
  'Set': 'train',
  'Number_of_Data': 264,
  'MAE': 43.77231600346468,
  'MSE': 2868.8873245337227,
  'RMSE': 53.561995150794395,
  'R2': 0.5179839736176033},
 {'Model': 'Random Forest with friedman_mse',
  'Features': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'],
  'Set': 'train',
  'Number_of_Data': 264,
  'MAE': 53.2675601994167,
  'MSE': 3926.5675964220313,
  'RMSE': 62.66232996324052,
  'R2': 0.34027785128966703},
 {'Model': 'ExtraTree with squared_error',
  'Features': ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'],
  'Set': 'train',
  'Numb

In [None]:
# 模型評估結果
totalResult[0][3]

{'Model': 'LightGBM with ExtraTrees',
 'Features': ['age_degree_2',
  'sex_degree_2',
  'bmi_degree_2',
  'bp_degree_2',
  's1_degree_2',
  's2_degree_2',
  's3_degree_2',
  's4_degree_2',
  's5_degree_2',
  's6_degree_2',
  'age_sex',
  'age_bmi',
  'age_bp',
  'age_s1',
  'age_s2',
  'age_s3',
  'age_s4',
  'age_s5',
  'age_s6',
  'sex_bmi',
  'sex_bp',
  'sex_s1',
  'sex_s2',
  'sex_s3',
  'sex_s4',
  'sex_s5',
  'sex_s6',
  'bmi_bp',
  'bmi_s1',
  'bmi_s2',
  'bmi_s3',
  'bmi_s4',
  'bmi_s5',
  'bmi_s6',
  'bp_s1',
  'bp_s2',
  'bp_s3',
  'bp_s4',
  'bp_s5',
  'bp_s6',
  's1_s2',
  's1_s3',
  's1_s4',
  's1_s5',
  's1_s6',
  's2_s3',
  's2_s4',
  's2_s5',
  's2_s6',
  's3_s4',
  's3_s5',
  's3_s6',
  's4_s5',
  's4_s6',
  's5_s6'],
 'Set': 'train',
 'Number_of_Data': 264,
 'MAE': 19.99369348797126,
 'MSE': 590.3199250115812,
 'RMSE': 24.296500262621798,
 'R2': 0.9016171230378656}