In [136]:
import numpy as np
import polars as pl

In [2]:
# 1 select case
# 2 cross validation (train only no hyper opt)
# 3 calculate mean prediction of case level 
# - accuracy 
# - precision
# - recall
# - F1

In [82]:
df = pl.read_csv("/home/surayuth/her2/extracted_features/linefeat_v4|scale_0.5|minhole_10|n_cell_10|n_cell_30|min_cont_0.1|min_area_hole_25.csv")
df = df \
    .with_columns(
        (pl.col("h_max") / (pl.col("n_cell_10") + 1e-8))
        .alias("hole_ratio")
    )

In [83]:
min_count = 10
max_count = 30

selected_df = (
        df \
        .with_columns(
            pl.len().over("case")
            .alias("count")
        ) 
        .filter(
            pl.col("count") >= min_count
        ) 
        .with_columns(
            pl.min_horizontal(max_count, pl.col("count"))
            .alias("cap_max")
        ) 
        .with_columns(
            pl.arange(1, pl.len() + 1).over("case")
            .alias("case_idx")
        ) 
        .filter(
            pl.col("case_idx") <= pl.col("cap_max")
        )
    )


case_df = selected_df.group_by("case").agg(pl.col("label").min())

In [142]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

hole_feat = ["hole_ratio"]
lbp_feat = [f"lbp{i}" for i in range(10)]
hara_feat = [
        "contrast", "dissim", "homo", "asm",
        "energy", "corrs", "entropy"
    ] 

selected_feat = ["hole", "lbp", "hara"]
features = []
if "hole" in selected_feat:
    features += hole_feat
if "lbp" in selected_feat:
    features += lbp_feat
if "hara" in selected_feat:
    features += hara_feat

for k in range(10):
    print(f"state: {k}")
    print("=" * 30)
    skf = StratifiedKFold(n_splits=4, random_state=k, shuffle=True)
    acc = []
    f1 = []
    auc = []
    for i, (train_index, test_index) in enumerate(skf.split(case_df.select("case"), case_df.select("label"))):
        train_case = case_df[train_index].select("case")
        test_case = case_df[test_index].select("case")
        train_df = selected_df.filter(pl.col("case").is_in(train_case)).select(*features, "label")
        test_df = selected_df.filter(pl.col("case").is_in(test_case)).select(*features, "label")

        X_train = train_df.drop("label").to_numpy()
        y_train = train_df.select("label").to_numpy().reshape(-1)

        X_test = test_df.drop("label").to_numpy()
        y_test = test_df.select("label").to_numpy().reshape(-1)

        #model = RandomForestClassifier(n_estimators=10)
        model = GradientBoostingClassifier()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc.append(accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        auc.append(roc_auc_score(y_test, y_pred))
    print(np.mean(acc), np.mean(f1), np.mean(auc))

state: 0
0.6812166329693291 0.7016356507415937 0.6795752202240168
state: 1
0.6902676141679636 0.7147843648168181 0.6889009581405421
state: 2
0.6654979716680263 0.6901657493539798 0.6644043416827845
state: 3
0.7052793353346558 0.7277035979991023 0.6975625895419123
state: 4
0.6879453631680925 0.7123185809253778 0.6867715840706015
state: 5
0.6963230068397375 0.7143805354350852 0.6954516810814662
state: 6
0.7051929083148017 0.7228709208069648 0.7026279152325983
state: 7
0.6968038587272511 0.7262816501499727 0.6935817179853527
state: 8
0.6992755407702819 0.7252143669179144 0.6992580576758353
state: 9
0.6699305645410356 0.6780668311991911 0.6756751153587593
