In [1]:
from src.Models import MyModel

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import os
import optuna
import xgboost as xgb
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_values = pd.read_csv("./data/raw/train_values.csv")
train_labels = pd.read_csv("./data/raw/train_labels.csv")
test_values = pd.read_csv("./data/raw/test_values.csv")

# !!! DROP building_id !!!
train_values.drop(columns="building_id", inplace=True)
train_labels.drop(columns="building_id", inplace=True)
test_wo_id = test_values.drop(columns="building_id")

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    train_values, train_labels, random_state=42, test_size=0.2
)

In [4]:
y_train_xgb = y_train - 1
y_test_xgb = y_test - 1

In [5]:
columns_to_target_encode = ["geo_level_1_id", "geo_level_2_id", "geo_level_3_id"]
columns_to_label_encode = [
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

In [6]:
encoder = ColumnTransformer(
    transformers=[
        ("label_encode", OrdinalEncoder(), columns_to_label_encode),
        (
            "target",
            TargetEncoder(random_state=0, target_type="continuous"),
            columns_to_target_encode,
        ),
    ],
    remainder="passthrough",
)

In [7]:
X_train_encode = pd.DataFrame(encoder.fit_transform(X_train, y_train))
X_test_encode = pd.DataFrame(encoder.transform(X_test))

  y = column_or_1d(y, warn=True)


In [10]:
col_list = X_train.columns.to_list()
print(col_list)

['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'legal_ownership_status', 'count_families', 'has_secondary_use', 'has_secondary_use_agriculture', 'has_secondary_use_hotel', 'has_secondary_use_rental', 'has_secondary_use_institution', 'has_secondary_use_school', 'has_secondary_use_industry', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 'has_seconda

In [11]:
X_train_encode = X_train_encode.set_axis([col_list], axis=1)
X_test_encode = X_test_encode.set_axis([col_list], axis=1)

In [12]:
X_train_encode.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,0.0,2.0,0.0,0.0,3.0,2.0,7.0,2.0,1.943512,1.659799,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2.0,0.0,0.0,1.0,3.0,2.0,2.0,2.168229,2.460637,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2.0,0.0,0.0,1.0,2.0,2.0,3.0,2.296534,2.60535,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,2.0,0.0,2.0,1.0,2.0,2.0,2.0,2.300086,2.138924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,2.0,1.0,0.0,1.0,2.0,2.0,2.0,2.793108,2.945335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X_test_encode.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,2.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.342467,2.4713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,2.0,0.0,0.0,1.0,2.0,2.0,2.0,1.997616,1.617221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2.0,0.0,2.0,1.0,2.0,2.0,2.0,1.7324,1.52867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,2.0,1.0,0.0,1.0,2.0,2.0,2.0,2.47977,2.150456,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,2.0,0.0,0.0,3.0,2.0,2.0,2.0,2.165629,2.117845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
def objective(trial):
    dtrain = xgb.DMatrix(X_train_encode, label=y_train_xgb)
    dvalid = xgb.DMatrix(X_test_encode, label=y_test)

    param = {
        "verbosity": 0,
        "objective": "multi:softmax",
        "num_class": 3,
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        )

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical(
            "sample_type", ["uniform", "weighted"]
        )
        param["normalize_type"] = trial.suggest_categorical(
            "normalize_type", ["tree", "forest"]
        )
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    accuracy = f1_score(y_true=y_test_xgb, y_pred=pred_labels, average="micro")
    return accuracy

In [34]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2023-10-07 11:54:35,334] A new study created in memory with name: no-name-a28f0ddf-3f9d-4cd5-b0a9-e0cdd636a20e


[I 2023-10-07 11:54:37,036] Trial 0 finished with value: 0.7249860900596689 and parameters: {'booster': 'gbtree', 'lambda': 0.3815424664767532, 'alpha': 0.004461137972679115, 'subsample': 0.4790212544700338, 'colsample_bytree': 0.27929236819329434, 'max_depth': 5, 'min_child_weight': 2, 'eta': 0.2840156458321894, 'gamma': 0.049471177408348006, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.7249860900596689.
[I 2023-10-07 11:54:40,163] Trial 1 finished with value: 0.09919226415456342 and parameters: {'booster': 'dart', 'lambda': 5.795329891550321e-06, 'alpha': 2.5559166616905457e-05, 'subsample': 0.26418418426886814, 'colsample_bytree': 0.25050116556089724, 'max_depth': 5, 'min_child_weight': 6, 'eta': 1.490963028611285e-08, 'gamma': 0.0001152339610106158, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 2.4749780121332627e-06, 'skip_drop': 2.875195267518833e-07}. Best is trial 1 with value: 0.09919226415456342.
[I 2023-10-07 11:54

KeyboardInterrupt: 