In [1]:
from pathlib import Path

import numpy as np
import polars as pl
import lightgbm as lgb

In [2]:
# Load the raw table and derive the feature frame `x` and the target frame `ys`.
# NOTE(review): the path below is an absolute, machine-specific location.
path_total = Path("/home/haselab/Documents/tat/Research/app/sakana_ml/level7_ml/Test_dat2_v2.csv")

df = pl.read_csv(path_total, infer_schema_length=1000)
# display(df)

# Processing applied to both x and y
# (disabled) drop rows that contain 10 or more nulls
# df = df.with_columns(pl.Series([row.count(None) for row in df.iter_rows()]).alias("null_count")).filter(pl.col("null_count") < 10).drop("null_count")

# Processing applied to x and y separately
# Features: every column except the last four, minus the columns whose names
# end with "records", "ID" or "Scientific name".
x = df.select(df.columns[:-4])
x = x.select(~ pl.selectors.ends_with("records"))
x = x.select(~ pl.selectors.ends_with("ID"))
x = x.select(~ pl.selectors.ends_with("Scientific name"))
# x = x.select(~ pl.selectors.contains("max size"))
# Keep only columns with at most 100 nulls; columns whose name contains
# "associate" (case-insensitive) are kept regardless of their null count.
# x = x.select([col_name for col_name in x.columns if df[col_name].null_count() <=100])
x = x.select([col_name for col_name in x.columns if df[col_name].null_count() <=100 or "associate" in col_name.lower()])

# df = df.select([col_name for col_name in df.columns if df[col_name].null_count() <= 100 or (df.get_column_index(col_name) - df.width) >= -4])


# Targets: take the last four columns, then the last three of those, then the
# first of those -- i.e. only the third-from-last column of `df` remains.
ys = df.select(df.columns[-4:])
ys = ys.select(ys.columns[-3:])
ys = ys.select(ys.columns[0])


# # (disabled) replace blanks with "N"
# ys = ys.with_columns(
#     pl.when(pl.all().is_null())
#     .then(pl.lit("N"))
#     .otherwise(pl.all())
#     .name.keep()
# )

# Encode the labels as integers: "N" -> 2, "Y" -> 1, anything else -> 0.
# NOTE(review): the original comment said N/Y become 0/1, which does not match
# the code; rows that are neither "N" nor "Y" (presumably the blank/null ones --
# confirm) end up as a third class 0.
ys = ys.with_columns(
    pl.when(pl.all() == "N")
    .then(pl.lit(2))
    .when(pl.all() == "Y")
    .then(pl.lit(1))
    .otherwise(pl.lit(0))
    .name.keep()
)

# display(x)
# display(ys)

# Show which feature columns survived the filtering above.
display(x.columns)
# display(ys.columns)

# ys.write_csv("./tmp.csv")



['DDepth_Top',
 'DDepth_Bottom',
 'DDepth_midPoint',
 'NDepth_Top',
 'NDepth_Bottom',
 'NDepth_midPoint',
 'Lat0_Top',
 'Lat0_Bottom',
 'Lon0_Top',
 'Lon0_Bottom',
 'max size0_midPoint',
 'Behavior0_Bottom',
 'Behavior0_Top',
 'Behavior0_Average',
 'Habitat0_Bottom',
 'Habitat0_Top',
 'Habitat0_Average',
 'Salinity0_Bottom',
 'Salinity0_Top',
 'Salinity0_Average',
 'TemperatureT0_Bottom',
 'TemperatureT0_Top',
 'TemperatureT0_Average',
 'Associate0_Bottom',
 'Associate0_Top',
 'Associate0_Average',
 'Associate_Floating object',
 'Associate_Large pelagics']

In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss

# テストデータの評価
class Result:
    """Metrics for one validation split, computed from predicted probabilities.

    Parameters
    ----------
    val : array-like
        True labels of the validation samples.
    proba : numpy.ndarray
        2-D array of class probabilities, one row per sample and one column
        per class (``argmax(axis=1)`` is taken over the columns).

    Attributes
    ----------
    pred : numpy.ndarray
        Column index of the most probable class for every sample.
        NOTE(review): this index is compared directly against ``val``, so the
        label values are assumed to equal the column positions -- confirm.
    """

    def __init__(self, val, proba):
        self.val = val
        self.proba = proba
        self.pred = proba.argmax(axis=1)

    def acc(self):
        """Return the accuracy of the hard predictions."""
        return accuracy_score(self.val, self.pred)

    def f1(self):
        """Return the macro-averaged F1 score of the hard predictions."""
        return f1_score(self.val, self.pred, average="macro") 

    def auc(self):
        """Return the ROC AUC computed from the predicted probabilities.

        ROC AUC is a ranking metric and needs continuous scores; feeding it
        the argmax labels reduces the curve to a single operating point and
        fails outright for more than two classes. With two probability
        columns the score of the second (greater-label) class is used;
        otherwise the full matrix is scored one-vs-rest.
        """
        if self.proba.shape[1] == 2:
            return roc_auc_score(self.val, self.proba[:, 1])
        return roc_auc_score(self.val, self.proba, multi_class="ovr")
    
    def logloss(self):
        """Return the log loss of the predicted probabilities."""
        return log_loss(self.val, self.proba)

    def cm(self):
        """Return the confusion matrix of the hard predictions."""
        return confusion_matrix(self.val, self.pred)

                # r = Result(y_val, y_proba)
                # s.add_met("acc", r.acc())
                # s.add_met("f1", r.f1())
                # s.add_met("auc", r.auc())
                # s.add_met("logloss", r.logloss())
                # s.add_met("cm", r.cm(), cm=True)

    # def acc(self):
    #     return getattr(self, "acc_tmp", self.acc_tmp := accuracy_score(self.val, self.pred))

    # def acc(self):
    #     attrname = "acc_tmp"
    #     try:
    #         return getattr(self, attrname)
    #     except:

    #         score = accuracy_score(self.val, self.pred)

    #         setattr(self, attrname, score)
    #         return getattr(self, attrname)


class Scores:
    """Accumulator for named metrics collected over folds / repetitions.

    Three independent ``name -> list of values`` dictionaries are kept:

    * ``score_dict`` -- scalar metrics (accuracy, F1, ...),
    * ``cm_dict``    -- confusion matrices,
    * ``proba_dict`` -- probability arrays.
    """

    def __init__(self):
        self.score_dict = {}
        self.cm_dict = {}
        self.proba_dict = {}
        
    def add_met(self, met_name, met_val, cm=False, proba=False):
        """Append ``met_val`` under ``met_name``.

        ``cm=True`` stores into ``cm_dict``, otherwise ``proba=True`` stores
        into ``proba_dict``, otherwise the value goes to ``score_dict``.
        """
        if cm:
            target = self.cm_dict
        elif proba:
            target = self.proba_dict
        else:
            target = self.score_dict
        self.add_dict(target, met_name, met_val)

    def add_dict(self, dict_, met_name, met_val):
        """Append ``met_val`` to ``dict_[met_name]``, creating the list if needed."""
        if dict_.get(met_name) is None:
            dict_[met_name] = [met_val]
        else:
            dict_[met_name].append(met_val)

    def _reduced(self, reduce_cm):
        """Return a new ``Scores`` holding one aggregated value per metric.

        Scalars are averaged, probability arrays are stacked and averaged
        along the first axis, confusion matrices are combined by ``reduce_cm``.
        """
        out = Scores()
        for name, values in self.score_dict.items():
            out.add_met(name, sum(values) / len(values))
        for name, values in self.cm_dict.items():
            out.add_met(name, reduce_cm(values), cm=True)
        for name, values in self.proba_dict.items():
            out.add_met(name, np.stack(values).mean(axis=0), proba=True)
        return out

    def ave_mets(self):
        """Aggregate every metric by its mean (confusion matrices included)."""
        return self._reduced(lambda mats: sum(mats) / len(mats))

    def fold_mets(self):
        """Aggregate like ``ave_mets`` but *sum* the confusion matrices."""
        return self._reduced(sum)

    def print_mets(self):
        """Print the first stored value of every metric."""
        met_dict = self.score_dict | self.cm_dict | self.proba_dict
        for k, v in met_dict.items():
            print(f"{k:8} = {v[0]}")
            
    def __or__(self, other):
        """Merge all values of ``other`` into ``self`` and return ``self``.

        The merge happens in place (``a | b`` mutates ``a``), which is what
        ``a |= b`` relies on. Values are appended one at a time: unpacking the
        list into ``add_met`` would push the second value into the ``cm``
        parameter and lose or reject everything after the first one.
        """
        if not isinstance(other, Scores):
            return NotImplemented
        # Iterate over copies of the lists so that merging an object with
        # itself cannot grow the list that is being iterated.
        for name, values in other.score_dict.items():
            for value in list(values):
                self.add_met(name, value)
        for name, values in other.cm_dict.items():
            for value in list(values):
                self.add_met(name, value, cm=True)
        for name, values in other.proba_dict.items():
            for value in list(values):
                self.add_met(name, value, proba=True)
            
        return self
    
    def __getitem__(self, index):
        """Return the value list stored under ``index`` in any of the three dicts.

        On a name clash ``proba_dict`` wins over ``cm_dict``, which wins over
        ``score_dict``.
        """
        met_dict = self.score_dict | self.cm_dict | self.proba_dict
        
        return met_dict[index]
        


    

In [4]:
import optuna
# Using scikit-learn API
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss

import matplotlib.pyplot as plt
import seaborn as sns


def objective(trial):
    """Optuna objective: repeated K-fold macro-F1 of a LightGBM classifier.

    Reads the notebook-level feature frame ``x`` and target frame ``ys``.
    For every target column the model is evaluated with 5 repetitions of a
    shuffled 4-fold split (``random_state`` 0..4); fold metrics are averaged
    per repetition and then over the repetitions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``num_leaves``, ``max_depth`` and ``bagging_fraction``.

    Returns
    -------
    float
        Macro-F1 averaged over folds, repetitions and target columns.
        Higher is better, so the study has to use ``direction="maximize"``.

    Raises
    ------
    ValueError
        If ``ys`` contains no target column.
    """
    params = {
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 25, 60), # 31
        "max_depth": trial.suggest_int("max_depth", 1, 7), # 10
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0), # 1.0
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
        # without this the tuned fraction would have no effect on the model.
        "bagging_freq": 1,
        # NOTE(review): the targets are encoded with three values (0/1/2)
        # earlier in the notebook while the objective is "binary" -- confirm
        # how LGBMClassifier resolves this.
        'objective': 'binary',
        "n_estimators": 100,
        "learning_rate": 0.1,

        "task": "train",
        'metric':'binary_logloss',
        'seed': 0,
        'verbosity': -1,
    }

    n_repeats = 5
    n_splits = 4

    target_f1 = []
    for y in ys:
        repeat_scores = Scores()
        for ri in range(n_repeats):
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=ri)

            fold_scores = Scores()
            for train_indices, val_indices in kf.split(x):
                x_train, x_val = x[train_indices], x[val_indices]
                y_train, y_val = y[train_indices], y[val_indices]

                model = lgb.LGBMClassifier(**params)
                model.fit(x_train, y_train)

                r = Result(y_val, model.predict_proba(x_val))
                fold_scores.add_met("acc", r.acc())
                fold_scores.add_met("f1", r.f1())
                fold_scores.add_met("auc", r.auc())
                fold_scores.add_met("logloss", r.logloss())
                fold_scores.add_met("cm", r.cm(), cm=True)

            # One aggregated entry per repetition.
            repeat_scores |= fold_scores.fold_mets()

        a = repeat_scores.ave_mets()
        print(a.score_dict["logloss"])

        # ave_mets() stores every aggregate as a one-element list.
        target_f1.append(a.score_dict["f1"][0])

    if not target_f1:
        raise ValueError("ys has no target column to evaluate")

    # Plain float for Optuna; with a single target this is that target's F1.
    return sum(target_f1) / len(target_f1)

if __name__ == '__main__':
    # The objective returns macro-F1 (higher is better), so the study must
    # maximize it; minimizing would steer the search towards the worst models.
    # NOTE(review): with load_if_exists=True a study of this name that already
    # exists in db.sqlite3 is presumably reused with its stored direction --
    # confirm, and use a fresh study name if it was created as "minimize".
    study = optuna.create_study(
        storage="sqlite:///db.sqlite3",
        study_name="Nyaaaaaaaaaaaa",
        direction='maximize',
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=100)






ModuleNotFoundError: No module named 'optuna'

In [None]:
# # Using scikit-learn API
# from sklearn.model_selection import train_test_split, KFold
# from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss

# import matplotlib.pyplot as plt
# import seaborn as sns


# params = {
#     "boosting_type": "gbdt",
#     "num_leaves": 31,
#     "max_depth": 10,
#     'objective': 'binary',
#     "n_estimators": 100,
#     "learning_rate": 0.1,

#     "task": "train",
#     'metric':'binary_logloss',
#     'seed': 0,
#     'verbosity': -1,
# }

# df_res_prob = None

# for y in ys:

#     f = Scores()
#     for ri in range(5):
#         kf = KFold(n_splits=4, shuffle=True, random_state=ri)
#         proba = [None for i in range(len(x))]
        
#         # temporary container used to keep track of proba
#         met_dict = dict()

#         s = Scores()
#         for fold, (train_indices, val_indices) in enumerate(kf.split(x)):
#             x_train, x_val = x[train_indices], x[val_indices]
#             y_train, y_val = y[train_indices], y[val_indices]
            
#             # x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=0.75, random_state=0)
#             # print(f"{len(x_train)=}, {len(x_val)=}")

#             model = lgb.LGBMClassifier(**params)
#             model.fit(x_train, y_train)

#             # y_pred_tmp = model.predict(x_val)
#             y_proba = model.predict_proba(x_val)
#             y_pred = y_proba.argmax(axis=1)

#             r = Result(y_val, y_proba)
#             s.add_met("acc", r.acc())
#             s.add_met("f1", r.f1())
#             s.add_met("auc", r.auc())
#             s.add_met("logloss", r.logloss())
#             s.add_met("cm", r.cm(), cm=True)
#             # s.add_met("proba", r.proba, proba=True)

#             for i, idx in enumerate(val_indices):
#                 proba[idx] = y_proba[i]
#         proba = np.stack(proba)

#         if met_dict.get("proba") is None:
#             met_dict["proba"] = [proba]
#         else:
#             met_dict["proba"].append(proba)

#         f |= s.fold_mets()
#     # print(model.feature_importances_)

#     a = f.ave_mets()
#     print(a.score_dict["logloss"])

#     cm = a.cm_dict["cm"][0]
#     # a.print_mets()

#     fig, ax = plt.subplots()
#     fig.set_figwidth(3)
#     fig.set_figheight(2.25)

#     ax = sns.heatmap(cm, annot=True, cbar=True, square=True, fmt=".0f", cmap="Blues_r", xticklabels=list(range(cm.shape[0])), yticklabels=list(range(cm.shape[1])))
#     ax.set_xlabel("pred_label")
#     ax.set_ylabel("true_label")
    
#     se_proba = pl.Series(proba[:, 1]).rename(y.name + "_prob")
#     if df_res_prob is None:
#         df_res_prob = se_proba.to_frame()
#     else:
#         df_res_prob = df_res_prob.with_columns(se_proba)
        
        
# # display(df_res_prob)

# df_res = df_res_prob.with_columns(
#     pl.when(pl.all() < 0.5)
#     .then(pl.lit("N"))
#     .otherwise(pl.lit("Y"))
#     .name
#     .map(lambda x: x[:-5] + "_pred")
# )

# # display(df_pred)

# # print(x.columns)
# # print(model.feature_importances_)

# cf = {c: f for c, f in zip(x.columns, model.feature_importances_)}
# cf = {k: v for k, v in sorted(cf.items(), key=lambda item: item[1], reverse=True)}

# pass

# # for c, f in cf.items():
#     # print(f"{c:30}:{f}")




