In [2]:
# Requires lightgbm; if it is missing, install it once with: %pip install lightgbm

In [16]:
import pandas as pd
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, precision_score, f1_score
from sklearn.preprocessing import StandardScaler

In [9]:
def get_metrics(y_true, y_predict):
    """Score a set of predictions against the ground-truth labels.

    Args:
        y_true: Ground-truth labels.
        y_predict: Labels predicted by a model for the same samples.

    Returns:
        A three-item list ``[confusion_matrix, precision, f1]``, each computed
        by the corresponding scikit-learn metric with its default settings.
    """
    # Every metric takes the same two keyword arguments, so evaluate them in
    # the order in which the results are reported.
    scorers = (confusion_matrix, precision_score, f1_score)
    return [scorer(y_true=y_true, y_pred=y_predict) for scorer in scorers]

In [4]:
# Load the pre-processed training table (the path is relative to this notebook).
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a row index written out with the CSV. Only "response" is dropped later, so this
# column is passed to the models as a feature — confirm whether that is intended
# (the saved PCA/kernel transformers may expect it to be present).
df_data = pd.read_csv("../../process_dataset/examples_1009/data_for_training.csv")
# Preview the first five rows.
df_data.head(5)

Unnamed: 0,age,bp_systolic,bp_diastolic,bp_pam,bp_pp,weight kg,height cm,our_imc,sex_1,sex_2,response
0,75.0,108,70,82.666667,38,70.0,160,27.3437,1,0,0
1,66.0,122,72,88.666667,50,52.0,149,23.4224,1,0,0
2,77.0,126,72,90.0,54,85.0,160,33.2031,1,0,0
3,73.0,130,80,96.666667,50,69.0,151,30.2618,1,0,0
4,65.0,129,73,91.666667,56,58.0,149,26.1249,1,0,0


In [5]:
# Load transformer objects persisted under results/rr_models; in this notebook
# they are only used through their .transform method.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
# pca_instance is applied directly to the raw feature matrix further down.
pca_instance = load("../../results/rr_models/pca_instance.joblib")

# These two are chained further down: kernel_instance_1.transform runs first and
# pca_instance_2.transform is applied to its output.
# NOTE(review): presumably a kernel-based transformer followed by a PCA, judging
# by the file names — confirm against the code that produced them.
pca_instance_2 = load("../../results/rr_models/2_pca.joblib")
kernel_instance_1 = load("../../results/rr_models/1_kernel.joblib")

In [6]:
# Split the target column off the feature table.
# The guard keeps this cell idempotent: df_data is overwritten here, so without
# it a second run of the cell would fail with KeyError on "response".
if "response" in df_data.columns:
    response = df_data["response"]
    df_data = df_data.drop(columns=["response"])

In [17]:
# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
# X_val and y_test are the features and labels of the same held-out rows.
X_train, X_val, y_train, y_test = train_test_split(
    df_data,
    response,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then reuse it for the held-out rows.
scaler_instance = StandardScaler().fit(X_train)
X_train_transform = scaler_instance.transform(X_train)

rf_model = RandomForestClassifier(random_state=42).fit(X_train_transform, y_train)

# Score the random forest on the scaled held-out rows.
metrics_rf_scale = get_metrics(
    y_true=y_test,
    y_predict=rf_model.predict(scaler_instance.transform(X_val)),
)
metrics_rf_scale

[array([[27, 24],
        [18, 37]]),
 np.float64(0.6065573770491803),
 np.float64(0.6379310344827587)]

In [10]:
# Baseline: random forest on the raw (unscaled) features, using the same
# 80/20 split parameters and seed as the scaled experiment.
X_train, X_val, y_train, y_test = train_test_split(
    df_data, response, test_size=0.2, random_state=42
)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

metrics_rf_not_scale = get_metrics(y_true=y_test, y_predict=rf_model.predict(X_val))
metrics_rf_not_scale

[array([[28, 23],
        [18, 37]]),
 np.float64(0.6166666666666667),
 np.float64(0.6434782608695652)]

In [11]:
# LightGBM with default settings on the raw features. This cell reuses the
# split (X_train / X_val / y_train / y_test) left in scope by the previous cell.
lgb_model = LGBMClassifier().fit(X_train, y_train)

metrics_lgb_not_scale = get_metrics(y_true=y_test, y_predict=lgb_model.predict(X_val))
metrics_lgb_not_scale

[LightGBM] [Info] Number of positive: 230, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 549
[LightGBM] [Info] Number of data points in the train set: 423, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.543735 -> initscore=0.175389
[LightGBM] [Info] Start training from score 0.175389


[array([[23, 28],
        [12, 43]]),
 np.float64(0.6056338028169014),
 np.float64(0.6825396825396826)]

In [12]:
# Project the raw feature matrix with the pre-loaded PCA transformer.
# NOTE(review): the projection is applied before the train/validation split; if
# the PCA was fitted on rows that end up in the held-out set, the scores below
# are optimistic — confirm how pca_instance was fitted.
df_data_transform = pca_instance.transform(df_data.values)

# Same split parameters and seed as the raw-feature experiments.
X_train, X_val, y_train, y_test = train_test_split(df_data_transform, response, test_size=.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Stored under its own name so the raw-feature baseline kept in
# metrics_rf_not_scale is not overwritten by the PCA experiment.
metrics_rf_pca = get_metrics(y_test, rf_model.predict(X_val))
metrics_rf_pca

[array([[30, 21],
        [19, 36]]),
 np.float64(0.631578947368421),
 np.float64(0.6428571428571429)]

In [13]:
# LightGBM with default settings on the PCA-projected features. This cell reuses
# the split (X_train / X_val / y_train / y_test) left in scope by the previous
# cell, so it must run right after it.
lgb_model = LGBMClassifier()
lgb_model.fit(X_train, y_train)

# Stored under its own name so the raw-feature LightGBM result kept in
# metrics_lgb_not_scale is not overwritten by the PCA experiment.
metrics_lgb_pca = get_metrics(y_test, lgb_model.predict(X_val))
metrics_lgb_pca

[LightGBM] [Info] Number of positive: 230, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1412
[LightGBM] [Info] Number of data points in the train set: 423, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.543735 -> initscore=0.175389
[LightGBM] [Info] Start training from score 0.175389


[array([[30, 21],
        [14, 41]]),
 np.float64(0.6612903225806451),
 np.float64(0.7008547008547008)]

In [14]:
# Two-stage projection with the pre-loaded transformers: kernel_instance_1 first,
# then pca_instance_2 on its output.
# NOTE(review): both transforms are applied before the train/validation split; if
# they were fitted on rows that end up in the held-out set, the scores below are
# optimistic — confirm how the transformers were fitted.
df_data_transform_p1 = kernel_instance_1.transform(df_data.values)
df_data_transform_p2 = pca_instance_2.transform(df_data_transform_p1)

# Same split parameters and seed as the other experiments.
X_train, X_val, y_train, y_test = train_test_split(df_data_transform_p2, response, test_size=.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Stored under its own name so earlier results are not overwritten and the
# variable name matches the features that were actually used.
metrics_rf_kernel_pca = get_metrics(y_test, rf_model.predict(X_val))
metrics_rf_kernel_pca

[array([[25, 26],
        [25, 30]]),
 np.float64(0.5357142857142857),
 np.float64(0.5405405405405406)]

In [15]:
# LightGBM with default settings on the kernel -> PCA features. This cell reuses
# the split (X_train / X_val / y_train / y_test) left in scope by the previous
# cell, so it must run right after it.
lgb_model = LGBMClassifier()
lgb_model.fit(X_train, y_train)

# Stored under its own name so earlier LightGBM results are not overwritten and
# the variable name matches the features that were actually used.
metrics_lgb_kernel_pca = get_metrics(y_test, lgb_model.predict(X_val))
metrics_lgb_kernel_pca

[LightGBM] [Info] Number of positive: 230, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 423, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.543735 -> initscore=0.175389
[LightGBM] [Info] Start training from score 0.175389


[array([[24, 27],
        [23, 32]]),
 np.float64(0.5423728813559322),
 np.float64(0.5614035087719298)]