In [1]:
import numpy as np
import polars as pl

In [2]:
# 1 select case
# 2 cross validation (train only no hyper opt)
# 3 calculate mean prediction of case level 
# - accuracy 
# - precision
# - recall
# - F1

In [3]:
df = pl.read_csv("/home/surayuth/her2/extracted_features/mth_feat|level_4.csv")

In [4]:
min_count = 10
max_count = 30

selected_df = (
        df \
        .with_columns(
            pl.len().over("case")
            .alias("count")
        ) 
        .filter(
            pl.col("count") >= min_count
        ) 
        .with_columns(
            pl.min_horizontal(max_count, pl.col("count"))
            .alias("cap_max")
        ) 
        .with_columns(
            pl.arange(1, pl.len() + 1).over("case")
            .alias("case_idx")
        ) 
        .filter(
            pl.col("case_idx") <= pl.col("cap_max")
        )
    )


case_df = selected_df.group_by("case").agg(pl.col("label").min())

In [53]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

level = 4
mth = [f"mth{i+1}" for i in range(level)]
ratio_feat = ["strong_ratio", "med_ratio", "weak_ratio"]
color_feat = ["color_feat"]
lbp_feat = [f"lbp{i}" for i in range(10)]
hara_feat = [
        "contrast", "dissim", "homo", "asm",
        "energy", "corrs", "entropy"
    ] 

selected_feat = ["mth"]
features = []
if "color" in selected_feat:
    features += color_feat
if "lbp" in selected_feat:
    features += lbp_feat
if "hara" in selected_feat:
    features += hara_feat
if "mth" in selected_feat:
    features += mth
if "ratio" in selected_feat:
    features += ratio_feat

for k in range(10):
    print(f"state: {k}")
    print("=" * 30)
    skf = StratifiedKFold(n_splits=4, random_state=k, shuffle=True)
    acc = []
    f1 = []
    auc = []
    for i, (train_index, test_index) in enumerate(skf.split(case_df.select("case"), case_df.select("label"))):
        train_case = case_df[train_index].select("case")
        test_case = case_df[test_index].select("case")
        train_df = selected_df.filter(pl.col("case").is_in(train_case)).select(*features, "label")
        test_df = selected_df.filter(pl.col("case").is_in(test_case)).select(*features, "label", "case")

        X_train = train_df.drop("label").to_numpy()
        y_train = train_df.select("label").to_numpy().reshape(-1)

        X_test = test_df.drop("label", "case").to_numpy()
        y_test = test_df.select("label").to_numpy().reshape(-1)

        #model = RandomForestClassifier(n_estimators=10)
        model = GradientBoostingClassifier(random_state=0, n_estimators=50)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        agg_pred = test_df \
            .with_columns(
                pl.Series(y_pred)
                .alias("pred")
            ) \
            .group_by("case", "label") \
            .agg(
                pl.col("pred").mean()
            ) \
            .with_columns(
                pl.when(pl.col("pred") > 0.5).then(pl.lit(1))
                .otherwise(0)
                .alias("classify")
            ) 
        
        case_pred = agg_pred.select("classify").to_numpy().reshape(-1)
        case_label = agg_pred.select("label").to_numpy().reshape(-1)

        acc.append(accuracy_score(case_label, case_pred))
        f1.append(f1_score(case_label, case_pred))
        auc.append(roc_auc_score(case_label, case_pred))

    stats = np.round([np.mean(acc), np.mean(f1), np.mean(auc)], 4)
    print(*stats)

state: 0
0.8401 0.8612 0.8351
state: 1
0.788 0.8052 0.7873
state: 2
0.8306 0.8525 0.8248
state: 3
0.7871 0.8135 0.7811
state: 4
0.8197 0.8375 0.8152
state: 5
0.851 0.872 0.8455
state: 6
0.8401 0.8599 0.836
state: 7
0.8297 0.8468 0.8264
state: 8
0.8614 0.8837 0.855
state: 9
0.7976 0.8191 0.7925
