In [2]:
import warnings
import pprint

import pandas as pd
import numpy as np

import lightgbm
import optuna
import optuna.visualization

from sklearn import set_config

from commons import (
    get_HPO_cross_validation_score,
    get_train_test_data,
    generate_cross_validation
)

# Notebook-wide settings (the three calls are independent of each other).
warnings.filterwarnings("ignore")  # hide every Python warning in cell output
set_config(transform_output="pandas")  # scikit-learn transformers return DataFrames
optuna.logging.set_verbosity(optuna.logging.ERROR)  # Optuna logs errors only

In [3]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# The connection string carries a username and password, so it must come from
# the environment rather than from the notebook source.
# NOTE(review): the credential that was previously committed in this cell
# should be treated as leaked and rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection. Everything below needs a
# working connection, so the error is shown and then re-raised instead of
# being swallowed.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)
    raise

# Exactly one config document is expected for this tag.
matching_configs = list(client['HPO']['config'].find({'tag': 'baseline_v1.0'}))
assert len(matching_configs) == 1, (
    f"expected exactly one config tagged 'baseline_v1.0', found {len(matching_configs)}"
)
config = matching_configs[0]
print("----> Config <------")
pprint.pprint(config)
model_collection = client['HPO']['model_outputs']

Pinged your deployment. You successfully connected to MongoDB!
----> Config <------
{'_id': ObjectId('66634faa730f244ce139e1e0'),
 'hpo': {'n_splits': 6, 'seed': 56, 'split_type': 'StratifiedKFold'},
 'model': {'n_repeats': 3,
           'n_splits': 10,
           'seed': 645671,
           'split_type': 'RepeatedStratifiedKFold'},
 'tag': 'baseline_v1.0'}


In [4]:
# Read the competition train/test CSVs through the project helper from
# `commons`; include_original=True is forwarded to it unchanged.
train, test = get_train_test_data("../train.csv", "../test.csv", include_original=True)

Train shape: (76518, 37)
Original shape: (4424, 37)
Test shape: (51012, 37)


In [5]:
# Merge selected raw values of several columns into a single replacement
# value. Every mapping is applied identically to train and test, so it is
# declared once and applied in a loop instead of being copy-pasted per frame.

# map for application mode
application_mode1 = [57, 3, 9, 4, 26, 27]
application_mode2 = [35, 12]

# map for curricular 1st sem
curricular_1st_sem_approved = [26, 20]
curricular_1st_sem_credited = [18, 19, 20]
curricular_1st_sem_enrolled = [22, 23, 26]
curricular_1st_sem_without_eval = [9, 10, 12]

# map for curricular 2nd sem
curricular_2nd_sem_enrolled = [18, 19, 21, 23]

# column name -> {original value: replacement value}
value_merges = {
    'application mode': {
        **dict.fromkeys(application_mode1, 2),
        **dict.fromkeys(application_mode2, 10),
    },
    # map for application order 0 to 1 and 9 to 6
    'application order': {0: 1, 9: 6},
    # map for course
    'course': {39: 33, 979: 8014},
    'curricular units 1st sem (approved)': dict.fromkeys(curricular_1st_sem_approved, 20),
    'curricular units 1st sem (credited)': dict.fromkeys(curricular_1st_sem_credited, 19),
    'curricular units 1st sem (enrolled)': dict.fromkeys(curricular_1st_sem_enrolled, 20),
    'curricular units 1st sem (without evaluations)': dict.fromkeys(curricular_1st_sem_without_eval, 9),
    'curricular units 2nd sem (enrolled)': dict.fromkeys(curricular_2nd_sem_enrolled, 18),
}

for frame in (train, test):
    for column, mapping in value_merges.items():
        # Vectorized replacement; values that are not keys of `mapping` are
        # left untouched. No replacement value is itself remapped to a
        # different value, so one pass equals the original sequential passes.
        frame[column] = frame[column].replace(mapping)

In [6]:
# Binary indicator columns: 1 where the semester grade is below 10.0, else 0.
grade_flags = {
    '1st sem failed': 'curricular units 1st sem (grade)',
    '2nd sem failed': 'curricular units 2nd sem (grade)',
}
for frame in (train, test):
    for flag_column, grade_column in grade_flags.items():
        frame[flag_column] = (frame[grade_column] < 10.0).astype(int)

# Nationality code -> language-group label. Codes missing from this table
# become NaN when mapped.
language_groups = {
    1: 'Portuguese-speaking',
    41: 'Portuguese-speaking',
    21: 'Portuguese-speaking',
    22: 'Portuguese-speaking',
    24: 'Portuguese-speaking',
    25: 'Portuguese-speaking',
    26: 'Portuguese-speaking',
    6: 'Spanish-speaking',
    101: 'Spanish-speaking',
    109: 'Spanish-speaking',
    108: 'Spanish-speaking',
    2: 'German-speaking',
    11: 'Italian-speaking',
    13: 'Dutch-speaking',
    14: 'English-speaking',
    17: 'Lithuanian-speaking',
    32: 'Turkish-speaking',
    62: 'Romanian-speaking',
    100: 'Moldovan',
    103: 'Ukrainian',
    105: 'Russian-speaking',
}

# Step 1: label both frames with their language group.
for frame in (train, test):
    frame['language_group'] = frame['nacionality'].map(language_groups)

# Step 2: relative frequency of each label, measured on train only.
language_group_freq = train['language_group'].value_counts(normalize=True)

# Step 3: replace the label with that train frequency in both frames.
for frame in (train, test):
    frame['language_group'] = frame['language_group'].map(language_group_freq)

In [7]:
# Count encoding: occurrence counts are measured on train only.
course_freq = train['course'].value_counts()
martial_status_freq = train['marital status'].value_counts()
unemp_rate_freq = train['unemployment rate'].value_counts()

# Apply the train counts to BOTH frames so train and test keep the same
# columns and encoding (previously only train was transformed). Values that
# never occur in train map to NaN.
# NOTE: 'marital status' and 'unemployment rate' are overwritten in place, so
# this cell is not safe to run twice without reloading the data.
for frame in (train, test):
    frame['course_count'] = frame['course'].map(course_freq)
    frame['marital status'] = frame['marital status'].map(martial_status_freq)
    frame['unemployment rate'] = frame['unemployment rate'].map(unemp_rate_freq)

In [10]:
# Split the training frame into the label column and everything else.
y = train["target"]
X = train.drop(columns="target")


def lgbm_tree_HPO(trial: optuna.Trial, fixed_params: dict) -> float:
    """Optuna objective for the tree-structure / regularisation parameters.

    Args:
        trial: Optuna trial used to sample the search parameters.
        fixed_params: LightGBM parameters held constant for every trial.

    Returns:
        The value returned by ``get_HPO_cross_validation_score`` for an
        ``LGBMClassifier`` built from the fixed and sampled parameters,
        evaluated on the notebook-level ``X``, ``y`` and ``config``.
    """
    # (low, high) bounds, grouped by how they are sampled. The groups are
    # visited in this order so parameters are suggested in a fixed sequence.
    integer_bounds = {
        "num_leaves": (100, 800),
        "min_data_in_leaf": (50, 400),
    }
    linear_bounds = {
        "min_child_weight": (1e-3, 20.0),
        "colsample_bytree": (0.1, 0.6),
    }
    log_bounds = {
        "max_delta_step": (1e-3, 20.0),
        "min_gain_to_split": (1e-8, 1e-3),
        "path_smooth": (1e-5, 5.0),
        "reg_lambda": (1e-3, 10.0),
        "reg_alpha": (1e-3, 10.0),
    }

    sampled = {}
    for name, (low, high) in integer_bounds.items():
        sampled[name] = trial.suggest_int(name, low, high)
    for name, (low, high) in linear_bounds.items():
        sampled[name] = trial.suggest_float(name, low, high)
    for name, (low, high) in log_bounds.items():
        sampled[name] = trial.suggest_float(name, low, high, log=True)

    model = lightgbm.LGBMClassifier(**fixed_params, **sampled)
    score = get_HPO_cross_validation_score(config=config, estimator=model, X=X, y=y)
    return score

In [11]:
# LightGBM parameters shared by every trial of the tree-parameter search.
lgbm_fixed_params = {
    "objective": "multiclass",
    "num_class": 3,
    "boosting_type": "gbdt",
    "data_sample_strategy": "bagging",
    "device": "cpu",
    "verbosity": -1,
    "random_state": 909,
    "feature_pre_filter": True,
    "force_row_wise": True,
}

# Seed the sampler from the HPO config so the sequence of suggested
# parameters is reproducible across kernel restarts (it was unseeded before).
tree_sampler = optuna.samplers.TPESampler(
    n_ei_candidates=35,
    consider_endpoints=True,
    n_startup_trials=200,
    multivariate=True,
    seed=config["hpo"]["seed"],
)
tree_study = optuna.create_study(direction="maximize", sampler=tree_sampler)

# Stops at whichever comes first: 1000 trials or 10800 s (3 hours).
tree_study.optimize(
    lambda trial: lgbm_tree_HPO(trial, lgbm_fixed_params),
    n_trials=1000,
    timeout=10800,
    show_progress_bar=True,
)

print("---> LGBM TREE HPO done <---")
fig1 = optuna.visualization.plot_param_importances(tree_study)
fig2 = optuna.visualization.plot_parallel_coordinate(tree_study)

fig1.show()
fig2.show()

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Split the training frame into the label column and everything else.
y = train["target"]
X = train.drop(columns="target")


def lgbm_boosting_HPO(trial: optuna.Trial, fixed_params: dict) -> float:
    """Optuna objective for the boosting / bagging parameters.

    Args:
        trial: Optuna trial used to sample the search parameters.
        fixed_params: LightGBM parameters held constant for every trial.

    Returns:
        The value returned by ``get_HPO_cross_validation_score`` for an
        ``LGBMClassifier`` built from the fixed and sampled parameters,
        evaluated on the notebook-level ``X``, ``y`` and ``config``.
    """
    search_params = {
        "learning_rate": trial.suggest_float("learning_rate", 3e-5, 5e-2, log=True),
        "subsample": trial.suggest_float("subsample", 0.80, 1.0),
        # bagging_freq is an integer parameter in LightGBM, so it must be
        # sampled with suggest_int rather than suggest_float.
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
    }
    estimator = lightgbm.LGBMClassifier(**fixed_params, **search_params)
    return get_HPO_cross_validation_score(config=config, estimator=estimator, X=X, y=y)

In [None]:
# Start from the shared fixed parameters, add the best tree parameters found
# by `tree_study`, and fix the number of boosting rounds for this search.
lgbm_params = {
    **lgbm_fixed_params,
    **tree_study.best_params,
    "n_estimators": 1800,
}

# Seed the sampler from the HPO config so the study is reproducible
# (it was unseeded before).
boosting_sampler = optuna.samplers.TPESampler(
    multivariate=True,
    consider_endpoints=True,
    n_startup_trials=20,
    seed=config["hpo"]["seed"],
)
boosting_study = optuna.create_study(direction="maximize", sampler=boosting_sampler)

# Stops at whichever comes first: 100 trials or 18000 s (5 hours).
boosting_study.optimize(
    lambda trial: lgbm_boosting_HPO(trial, lgbm_params),
    n_trials=100,
    timeout=18000,
    show_progress_bar=True
)