In [81]:
import sys
from pathlib import Path

import numpy as np
import polars as pl
import lightgbm as lgb

work_path = "/home/haselab/Documents/tat/Research/"
sys.path.append(f"{work_path}app/torch_libs/")

from run_manager_new import RunManager, RunsManager, RunViewer

In [82]:
path_total = Path("/home/haselab/Documents/tat/Research/app/sakana_ml/level7_ml/Test_dat2_v2.csv")

df = pl.read_csv(path_total, infer_schema_length=1000)
# display(df)

# x, yともに適用する処理
# nullが10以上ある行をdrop
# df = df.with_columns(pl.Series([row.count(None) for row in df.iter_rows()]).alias("null_count")).filter(pl.col("null_count") < 10).drop("null_count")

# x, yそれぞれに適用する処理
x = df.select(df.columns[:-4])
x = x.select(~ pl.selectors.ends_with("records"))
x = x.select(~ pl.selectors.ends_with("ID"))
x = x.select(~ pl.selectors.ends_with("Scientific name"))
# x = x.select(~ pl.selectors.contains("max size"))
# nullがk行を以下のカラムのみ採用
# x = x.select([col_name for col_name in x.columns if df[col_name].null_count() <=100])
x = x.select([col_name for col_name in x.columns if df[col_name].null_count() <=100 or "associate" in col_name.lower()])

# df = df.select([col_name for col_name in df.columns if df[col_name].null_count() <= 100 or (df.get_column_index(col_name) - df.width) >= -4])


ys = df.select(df.columns[-3:])

# # BlankをNに置き換え
# ys = ys.with_columns(
#     pl.when(pl.all().is_null())
#     .then(pl.lit("N"))
#     .otherwise(pl.all())
#     .name.keep()
# )

# NとYを0, 1に置き換え
ys = ys.with_columns(
    pl.when(pl.all() == "N")
    .then(pl.lit(2))
    .when(pl.all() == "Y")
    .then(pl.lit(1))
    .otherwise(pl.lit(0))
    .name.keep()
)

# display(x)
# display(ys)

# display(x.columns)
# display(ys.columns)



In [83]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss

# テストデータの評価
class Result:
    def __init__(self, val, proba):
        self.val = val
        self.proba = proba
        self.pred = proba.argmax(axis=1)

    def acc(self):
        return accuracy_score(self.val, self.pred)

    def f1(self):
        return f1_score(self.val, self.pred, average="macro") 

    def auc(self):
        return roc_auc_score(self.val, self.pred)
    
    def logloss(self):
        return log_loss(self.val, self.proba)

    def cm(self):
        return confusion_matrix(self.val, self.pred)

                # r = Result(y_val, y_proba)
                # s.add_met("acc", r.acc())
                # s.add_met("f1", r.f1())
                # s.add_met("auc", r.auc())
                # s.add_met("logloss", r.logloss())
                # s.add_met("cm", r.cm(), cm=True)

    # def acc(self):
    #     return getattr(self, "acc_tmp", self.acc_tmp := accuracy_score(self.val, self.pred))

    # def acc(self):
    #     attrname = "acc_tmp"
    #     try:
    #         return getattr(self, attrname)
    #     except:

    #         score = accuracy_score(self.val, self.pred)

    #         setattr(self, attrname, score)
    #         return getattr(self, attrname)


class Scores:
    def __init__(self):
        self.score_dict = dict()
        
    def __getitem__(self, index):
        return self.score_dict.setdefault(index, [])
    
    def __setitem__(self, key, value):
        self.score_dict[key] = value
        return self
        
    def __attr__(self, attr):
        return self.score_dict.attr

    def __or__(self, other):
        for k, v in other.score_dict.items():
            self.score_dict[k] = v
        return self
    
    def __str__(self):
        return str(self.score_dict)
    
    def fold(self, val_index_key, cm_key, proba_key):
        s = Scores()
        for k, v in self.score_dict.items():
            if k == val_index_key:
                pass
            elif k == cm_key:
                s[k].append(np.sum(v, axis=0))
            elif k == proba_key:
                num_datas = sum(len(sublist) for sublist in self.score_dict[val_index_key])
                proba = [None for _ in range(num_datas)]
                for val_part, proba_part in zip(self.score_dict[val_index_key], self.score_dict[proba_key]): 
                    for i, idx in enumerate(val_part):
                        proba[idx] = proba_part[i]
                s[k].append(proba)
            else:
                s[k].append(np.mean(v, axis=0))
        return s

    def ave(self):
        s = Scores()
        for k, v in self.score_dict.items():
            s[k].append(np.mean(v, axis=0))
            # if isinstance(v[0], np.ndarray):
            #     s[k].append(np.mean(v, axis=0))
            # else:
            #     s[k].append(sum(v) / len(v))
        return s

                    
            

        

    

In [86]:
import optuna
# Using scikit-learn API
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss

import matplotlib.pyplot as plt
import seaborn as sns


y = ys.select(ys.columns[0]).to_series()

def objective(trial):
    params = {
        "boosting_type": "gbdt",
        'objective': 'binary',
        "n_estimators": 100,
        "learning_rate": 0.1,

        "max_depth": trial.suggest_int("max_depth", 1, 7), # 10
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 25, 60), # 31
        "subsample": trial.suggest_float("subsample", 0.4, 1.0), # 1.0 bagging_fracation

        "task": "train",
        'metric':'binary_logloss',
        'seed': 0,
        'verbosity': -1,
    }

    f = Scores()
    for ri in range(5):
        kf = KFold(n_splits=4, shuffle=True, random_state=ri)
        
        s = Scores()
        for fold, (train_indices, val_indices) in enumerate(kf.split(x)):
            x_train, x_val = x[train_indices], x[val_indices]
            y_train, y_val = y[train_indices], y[val_indices]
            
            model = lgb.LGBMClassifier(**params)
            model.fit(x_train, y_train)

            y_proba = model.predict_proba(x_val)

            r = Result(y_val, y_proba)
            s["acc"].append(r.acc())
            s["f1"].append(r.f1())
            s["auc"].append(r.auc())
            s["logloss"].append(r.logloss())
            s["cm"].append(r.cm())
            s["proba"].append(r.proba)
            s["ind"].append(val_indices)
            s["feat_imp"].append(model.feature_importances_)

        f |= s.fold("ind", "cm", "proba")

    a = f.ave()


    return a["logloss"][0]








In [87]:
study = optuna.create_study(
    storage="sqlite:///db.sqlite3",
    study_name="Nyaaaaaaaaaaaa",
    direction='minimize',
    load_if_exists=True,
)
study.optimize(objective, n_trials=10)


params = {
    "boosting_type": "gbdt",
    'objective': 'binary',
    "n_estimators": 100,
    "learning_rate": 0.1,

    "task": "train",
    'metric':'binary_logloss',
    'seed': 0,
    'verbosity': -1,
}

params |= study.best_params

df_res_prob = None

f = Scores()
for ri in range(5):
    kf = KFold(n_splits=4, shuffle=True, random_state=ri)
    
    s = Scores()
    for fold, (train_indices, val_indices) in enumerate(kf.split(x)):
        x_train, x_val = x[train_indices], x[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(x_train, y_train)

        y_proba = model.predict_proba(x_val)

        r = Result(y_val, y_proba)
        s["acc"].append(r.acc())
        s["f1"].append(r.f1())
        s["auc"].append(r.auc())
        s["logloss"].append(r.logloss())
        s["cm"].append(r.cm())
        s["proba"].append(r.proba)
        s["ind"].append(val_indices)
        s["feat_imp"].append(model.feature_importances_)

    f |= s.fold("ind", "cm", "proba")

a = f.ave()


se_proba = pl.Series(a["proba"][0][:, 1]).rename(y.name + "_prob")
if df_res_prob is None:
    df_res_prob = se_proba.to_frame()
else:
    df_res_prob = df_res_prob.with_columns(se_proba)
    

df_res = df_res_prob.with_columns(
    pl.when(pl.all() < 0.5)
    .then(pl.lit("n"))
    .otherwise(pl.lit("y"))
    .name
    .map(lambda x: x[:-5] + "_pred")
)

cf = {c: f for c, f in zip(x.columns, model.feature_importances_)}
cf = {k: v for k, v in sorted(cf.items(), key=lambda item: item[1], reverse=True)}

for c, f in cf.items():
    print(f"{c:30}:{f}")

print(a["feat_imp"][0])
print(a["logloss"][0])
print(a["cm"][0])

fig, ax = plt.subplots()
fig.set_figwidth(3)
fig.set_figheight(2.25)

ax = sns.heatmap(cm, annot=True, cbar=True, square=True, fmt=".0f", cmap="Blues_r", xticklabels=list(range(cm.shape[0])), yticklabels=list(range(cm.shape[1])))
ax.set_xlabel("pred_label")
ax.set_ylabel("true_label")

df_pred = pl.concat([df, df_res], how="horizontal")

display(df_pred)
df_pred.write_csv("./prediction.csv")



[I 2024-04-08 13:48:18,253] Using an existing study with name 'Nyaaaaaaaaaaaa' instead of creating a new one.
[I 2024-04-08 13:48:18,758] Trial 40 finished with value: 0.3241787851813847 and parameters: {'max_depth': 3, 'colsample_bytree': 0.4304791130783576, 'num_leaves': 35, 'subsample': 0.8024424248763481}. Best is trial 17 with value: 0.30298630449461167.


0.3241787851813847
[[239.  29.]
 [ 27. 116.]]


[I 2024-04-08 13:48:19,193] Trial 41 finished with value: 0.3056947764665697 and parameters: {'max_depth': 2, 'colsample_bytree': 0.48814848835033114, 'num_leaves': 39, 'subsample': 0.9742872103826378}. Best is trial 17 with value: 0.30298630449461167.


0.3056947764665697
[[240.  28.]
 [ 27. 116.]]


[I 2024-04-08 13:48:19,645] Trial 42 finished with value: 0.30298630449461167 and parameters: {'max_depth': 2, 'colsample_bytree': 0.47332751737846307, 'num_leaves': 43, 'subsample': 0.9977069953098606}. Best is trial 17 with value: 0.30298630449461167.


0.30298630449461167
[[240.  28.]
 [ 28. 115.]]


[I 2024-04-08 13:48:20,059] Trial 43 finished with value: 0.31021838588152256 and parameters: {'max_depth': 1, 'colsample_bytree': 0.5578269696460478, 'num_leaves': 43, 'subsample': 0.9238662210958977}. Best is trial 17 with value: 0.30298630449461167.


0.31021838588152256
[[233.  35.]
 [ 23. 120.]]


[I 2024-04-08 13:48:20,501] Trial 44 finished with value: 0.30298630449461167 and parameters: {'max_depth': 2, 'colsample_bytree': 0.4663470190359627, 'num_leaves': 49, 'subsample': 0.9535917979064324}. Best is trial 17 with value: 0.30298630449461167.


0.30298630449461167
[[240.  28.]
 [ 28. 115.]]


[I 2024-04-08 13:48:20,960] Trial 45 finished with value: 0.30298630449461167 and parameters: {'max_depth': 2, 'colsample_bytree': 0.4619182318651223, 'num_leaves': 49, 'subsample': 0.9558777264210945}. Best is trial 17 with value: 0.30298630449461167.


0.30298630449461167
[[240.  28.]
 [ 28. 115.]]


[I 2024-04-08 13:48:21,454] Trial 46 finished with value: 0.3241787851813847 and parameters: {'max_depth': 3, 'colsample_bytree': 0.42118340617568495, 'num_leaves': 49, 'subsample': 0.4802139371958915}. Best is trial 17 with value: 0.30298630449461167.


0.3241787851813847
[[239.  29.]
 [ 27. 116.]]


[I 2024-04-08 13:48:21,876] Trial 47 finished with value: 0.31074410172956535 and parameters: {'max_depth': 1, 'colsample_bytree': 0.5088581125178572, 'num_leaves': 45, 'subsample': 0.915792451611389}. Best is trial 17 with value: 0.30298630449461167.


0.31074410172956535
[[234.  34.]
 [ 22. 121.]]


[I 2024-04-08 13:48:22,338] Trial 48 finished with value: 0.30298630449461167 and parameters: {'max_depth': 2, 'colsample_bytree': 0.4711794507390672, 'num_leaves': 55, 'subsample': 0.9412053032044172}. Best is trial 17 with value: 0.30298630449461167.


0.30298630449461167
[[240.  28.]
 [ 28. 115.]]


[I 2024-04-08 13:48:22,767] Trial 49 finished with value: 0.3089670927960361 and parameters: {'max_depth': 1, 'colsample_bytree': 0.54015312179383, 'num_leaves': 47, 'subsample': 0.8628655101509297}. Best is trial 17 with value: 0.30298630449461167.


0.3089670927960361
[[233.  35.]
 [ 23. 120.]]
max size0_midPoint            :57
NDepth_Bottom                 :24
Habitat0_Average              :21
Lat0_Top                      :16
Behavior0_Average             :14
Habitat0_Top                  :14
DDepth_Top                    :13
Behavior0_Bottom              :13
DDepth_Bottom                 :12
DDepth_midPoint               :12
Lon0_Top                      :12
Lat0_Bottom                   :10
Associate0_Bottom             :10
NDepth_midPoint               :9
Lon0_Bottom                   :9
Associate0_Top                :9
Associate0_Average            :5
Behavior0_Top                 :4
TemperatureT0_Average         :4
Habitat0_Bottom               :3
NDepth_Top                    :2
Salinity0_Bottom              :2
TemperatureT0_Bottom          :2
Salinity0_Average             :1
TemperatureT0_Top             :1
Salinity0_Top                 :0
Associate_Floating object     :0
Associate_Large pelagics      :0
[ 8.25 13.25  9.7

acceptedNameUsageID,Scientific name,Family_ID,Genus_ID,DDepth_Top,DDepth_Bottom,DDepth_midPoint,NDepth_Top,NDepth_Bottom,NDepth_midPoint,Lat0_Top,Lat0_Bottom,Lat0_midPoint,Lon0_Top,Lon0_Bottom,Lon0_midPoint,common size0_Top,common size0_Bottom,common size0_midPoint,max size0_Top,max size0_Bottom,max size0_midPoint,Temperature0_Top,Temperature0_Bottom,Temperature0_midPoint,Lon1_Top,Lon1_Bottom,Lon1_midPoint,max size1_Top,max size1_Bottom,max size1_midPoint,Lat1_Top,Lat1_Bottom,Lat1_midPoint,Depth2_Top,Depth2_Bottom,Depth2_midPoint,…,TemperatureT0_Average,TemperatureT0_# records,Aggregation0_Bottom,Aggregation0_Top,Aggregation0_Average,Aggregation0_# records,Associate0_Bottom,Associate0_Top,Associate0_Average,Associate_Floating object,Associate_Large pelagics,Behavior1_Bottom,Behavior1_Top,Behavior1_Average,Behavior1_# records,Food1_Bottom,Food1_Top,Food1_Average,Food1_# records,Habitat1_Bottom,Habitat1_Top,Habitat1_Average,Habitat1_# records,Salinity1_Bottom,Salinity1_Top,Salinity1_Average,Salinity1_# records,TemperatureT1_Bottom,TemperatureT1_Top,TemperatureT1_Average,TemperatureT1_# records,ICCAT Fisheries,LL,PS,Others,LL_prob,LL_pred
i64,str,i64,i64,i64,i64,f64,i64,i64,f64,f64,f64,str,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,str,str,f64,i64,i64,str,i64,i64,str,…,f64,i64,i64,i64,f64,i64,f64,f64,f64,i64,i64,i64,f64,f64,i64,i64,f64,f64,i64,f64,i64,f64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str,str,str,str,f64,str
105787,"""Carcharhinus a…",105689,105719,0,810,150.0,0,810,150.0,-31.0,46.0,,-180.0,180.0,,,,250.0,,,300.0,,,,,,,,,,,,,,,,…,2.5,4,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""",,,0.922554,"""y"""
105788,"""Carcharhinus b…",105689,105719,0,200,50.0,0,200,50.0,-38.0,40.0,,-100.0,155.0,,,,250.0,,,300.0,,,,,,,,,,,,,,,,…,2.5,4,1,4,2.5,2,,,0.0,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""",,"""Y""",0.817731,"""y"""
105789,"""Carcharhinus f…",105689,105719,0,500,259.0,0,500,259.0,-43.0,42.0,,-180.0,180.0,,,,250.0,,,350.0,,,23.0,,,,,,,,,,,,,…,2.6,1,,,,,2.0,3.1,2.55,1,1,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""","""Y""","""Y""",0.796956,"""y"""
105790,"""Carcharhinus g…",105689,105719,0,285,105.0,0,285,105.0,-34.0,36.0,,-180.0,180.0,,,,300.0,,,370.0,,,,,,,,,,,,,,,,…,2.7,2,,,,,1.5,1.5,1.5,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""",,,0.915067,"""y"""
105791,"""Carcharhinus i…",105689,105719,0,20,10.0,0,20,10.0,-38.0,42.0,,-100.0,-12.0,,,,,,,,,,,,,,,,,,,,,,,…,2.6,1,4,4,4.0,1,,,0.0,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""",,,0.838951,"""y"""
105792,"""Carcharhinus l…",105689,105719,0,164,15.5,0,164,15.5,-39.0,42.0,,-180.0,180.0,,,,260.0,,,360.0,,,,,,,,,,,,,,,,…,2.666667,3,,,,,1.5,1.5,1.5,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""","""Y""","""Y""",0.811839,"""y"""
105793,"""Carcharhinus l…",105689,105719,0,140,15.0,0,140,15.0,-38.0,45.0,,-180.0,180.0,,,,150.0,,,286.0,,,,,,,,,,,,,,,,…,2.666667,3,5,5,5.0,1,,,0.0,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""",,"""Y""",0.758603,"""y"""
105794,"""Carcharhinus l…",105689,105719,0,1082,76.0,0,1082,76.0,-43.0,46.0,,-180.0,180.0,,,,270.0,,,400.0,18.0,28.0,,,,,,,,,,,,,,…,2.533333,3,,,,,2.3,2.8,2.55,1,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""","""Y""","""Y""",0.831139,"""y"""
105795,"""Carcharhinus m…",105689,105719,0,75,37.5,0,75,37.5,-25.0,35.0,,-180.0,-134.0,,,,,,,,,,,7,180,,,,,,,,,,,…,2.8,2,1,3,2.0,2,1.2,1.2,1.2,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""",,,0.465701,"""n"""
105796,"""Carcharhinus o…",105689,105719,0,500,300.0,0,500,300.0,-46.0,45.0,,-120.0,156.0,,,,250.0,,,420.0,,,,,,,,,,,,,,,,…,2.666667,3,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,"""Y""","""Y""",,"""Y""",0.924845,"""y"""


In [None]:
# Using scikit-learn API
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss

import matplotlib.pyplot as plt
import seaborn as sns

params_list = []

# LL
params = {
    "boosting_type": "gbdt",
    "num_leaves": trial.suggest_int("num_leaves", 25, 60), # 31
    "max_depth": trial.suggest_int("max_depth", 1, 7), # 10
    "subsample": trial.suggest_float("subsample", 0.5, 1.0), # 1.0
    'objective': 'binary',
    "n_estimators": 100,
    "learning_rate": 0.1,

    "task": "train",
    'metric':'binary_logloss',
    'seed': 0,
    'verbosity': -1,
}
params_list.append(params)

# PS
params = {
    "boosting_type": "gbdt",
    "num_leaves": trial.suggest_int("num_leaves", 25, 60), # 31
    "max_depth": trial.suggest_int("max_depth", 1, 7), # 10
    "subsample": trial.suggest_float("subsample", 0.5, 1.0), # 1.0
    'objective': 'binary',
    "n_estimators": 100,
    "learning_rate": 0.1,

    "task": "train",
    'metric':'binary_logloss',
    'seed': 0,
    'verbosity': -1,
}
params_list.append(params)

# Others
params = {
    "boosting_type": "gbdt",
    "num_leaves": trial.suggest_int("num_leaves", 25, 60), # 31
    "max_depth": trial.suggest_int("max_depth", 1, 7), # 10
    "subsample": trial.suggest_float("subsample", 0.5, 1.0), # 1.0
    'objective': 'binary',
    "n_estimators": 100,
    "learning_rate": 0.1,

    "task": "train",
    'metric':'binary_logloss',
    'seed': 0,
    'verbosity': -1,
}
params_list.append(params)


df_res_prob = None
for y, params in zip(ys, params_list):

    f = Scores()
    for ri in range(5):
        kf = KFold(n_splits=4, shuffle=True, random_state=ri)
        
        s = Scores()
        for fold, (train_indices, val_indices) in enumerate(kf.split(x)):
            x_train, x_val = x[train_indices], x[val_indices]
            y_train, y_val = y[train_indices], y[val_indices]
            
            model = lgb.LGBMClassifier(**params)
            model.fit(x_train, y_train)

            y_proba = model.predict_proba(x_val)

            r = Result(y_val, y_proba)
            s["acc"].append(r.acc())
            s["f1"].append(r.f1())
            s["auc"].append(r.auc())
            s["logloss"].append(r.logloss())
            s["cm"].append(r.cm())
            s["proba"].append(r.proba)
            s["ind"].append(val_indices)
            s["feat_imp"].append(model.feature_importances_)

        f |= s.fold("ind", "cm", "proba")

    a = f.ave()

    fig, ax = plt.subplots()
    fig.set_figwidth(3)
    fig.set_figheight(2.25)

    ax = sns.heatmap(cm, annot=True, cbar=True, square=True, fmt=".0f", cmap="Blues_r", xticklabels=list(range(cm.shape[0])), yticklabels=list(range(cm.shape[1])))
    ax.set_xlabel("pred_label")
    ax.set_ylabel("true_label")

    se_proba = pl.Series(a["proba"][0][:, 1]).rename(y.name + "_prob")
    if df_res_prob is None:
        df_res_prob = se_proba.to_frame()
    else:
        df_res_prob = df_res_prob.with_columns(se_proba)
        
    # display(df_res_prob)

    df_res = df_res_prob.with_columns(
        pl.when(pl.all() < 0.5)
        .then(pl.lit("n"))
        .otherwise(pl.lit("y"))
        .name
        .map(lambda x: x[:-5] + "_pred")
    )

    cf = {c: f for c, f in zip(x.columns, model.feature_importances_)}
    cf = {k: v for k, v in sorted(cf.items(), key=lambda item: item[1], reverse=True)}

    for c, f in cf.items():
        print(f"{c:30}:{f}")

    print(a["feat_imp"][0])
    print(a["logloss"][0])
    print(a["cm"][0])



NameError: name 'trial' is not defined