In [None]:
import pandas as pd
import InsurAutoML
from InsurAutoML import load_data, AutoTabular
from InsurAutoML.utils import train_test_split

# Experiment configuration shared by the cells below.
seed = 42  # single seed reused for the library, the data split and the model
n_trials = 128  # search budget; passed to AutoTabular as max_evals
N_ESTIMATORS = 4  # passed to AutoTabular as n_estimators

# Time budget scales with the trial count: 450 per group of 4 trials
# (presumably seconds -- confirm against the AutoTabular `timeout` docs).
TIMEOUT = n_trials / 4 * 450

# Seed the library before any data handling or model search takes place.
InsurAutoML.set_seed(seed)

In [None]:
# Load every .csv dataset found under the given path (empty string here).
csv_loader = load_data(data_type=".csv")
database = csv_loader.load(path="")

# Iterating the loaded collection yields the dataset names; show them.
database_names = list(database)
database_names

In [None]:
# Peek at the first five rows of the ausprivauto dataset.
database["ausprivauto"].head(n=5)

In [None]:
# Target column for the classification task.
response = "ClaimOcc"

# Every column except the three claim-outcome columns is a feature;
# sorting gives a deterministic column order regardless of set iteration order.
claim_columns = {"ClaimOcc", "ClaimNb", "ClaimAmount"}
features = sorted(set(database["ausprivauto"].columns) - claim_columns)

In [None]:
# train/test split
# first time running: draw a seeded split (test_perc = 0.1) and persist the
# training index so later runs/models can reuse exactly the same partition
train_X, test_X, train_y, test_y = train_test_split(
    database['ausprivauto'][features], database['ausprivauto'][[response]], test_perc = 0.1, seed = seed
)
# header=False: the file must contain index values only. With the default
# header, the column label "0" is written as the first line, and the reload
# below (read_csv(..., header=None)) would parse it as an extra row index 0,
# silently moving/duplicating row 0 into the training set.
pd.DataFrame(train_X.index.sort_values()).to_csv("train_index.csv", index=False, header=False)
# Use the same train/test split across all models for 2+ runs
# (targets are selected with [response] so they stay DataFrames, matching the
# first-run path above)
# train_idx = pd.read_csv("train_index.csv", header=None).values.flatten()
# test_idx = database["ausprivauto"].index.difference(train_idx)
# train_X, test_X, train_y, test_y = (
#     database["ausprivauto"].loc[train_idx, features],
#     database["ausprivauto"].loc[test_idx, features],
#     database["ausprivauto"].loc[train_idx, [response]],
#     database["ausprivauto"].loc[test_idx, [response]],
# )

In [None]:
# Configure the AutoML search; budget-related values come from the config cell.
automl_settings = dict(
    model_name="ausprivauto_occ_{}".format(n_trials),  # trial count tagged in the name
    max_evals=n_trials,
    n_estimators=N_ESTIMATORS,
    timeout=TIMEOUT,
    validation="KFold",
    valid_size=0.25,
    search_algo="Optuna",
    objective="AUC",
    cpu_threads=12,
    seed=seed,
)
mol = AutoTabular(**automl_settings)

# Run the search/fit on the training partition only.
mol.fit(train_X, train_y)

In [None]:
from sklearn.metrics import roc_auc_score

# Predicted probabilities for both partitions; the "class_1" column is scored.
y_train_pred = mol.predict_proba(train_X)
y_test_pred = mol.predict_proba(test_X)

# ROC AUC on train vs. held-out test, displayed as a (train, test) pair.
train_auc = roc_auc_score(train_y.values, y_train_pred["class_1"].values)
test_auc = roc_auc_score(test_y.values, y_test_pred["class_1"].values)
train_auc, test_auc