In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PowerTransformer

In [6]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [8]:
data = pd.read_csv('/content/drive/MyDrive/bank.csv', sep=';')

In [9]:
y = data[['y']].apply(lambda x: 1 if x.y == 'yes' else 0, axis=1)
X = data.drop('y', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
!pip install mlflow
!pip install optuna
!pip install xgboost

Collecting mlflow
  Downloading mlflow-2.13.1-py3-none-any.whl (25.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.0/25.0 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [22]:
import optuna
import mlflow.sklearn
from xgboost import XGBClassifier
from mlflow.tracking import MlflowClient


In [23]:
categorical_features = data.select_dtypes(include='object').columns
numeric_features = data.select_dtypes(exclude='object').columns

In [24]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [26]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                          ('clf', XGBClassifier())])

In [27]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                          ('clf', XGBClassifier())])

In [28]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 150)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 7)
    max_features = trial.suggest_categorical('max_features', [1, 'sqrt', 'log2'])

    model = XGBClassifier(n_estimators=n_estimators,
                          learning_rate=learning_rate,
                          max_depth=max_depth,
                          max_features=max_features,
                          random_state=42)
    model.fit(X_train, y_train)
    return -model.score(X_test, y_test)

In [31]:
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
numeric_features = [col for col in X.columns if col not in categorical_features]

onehot_encoder = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

In [32]:
X_train_encoded = onehot_encoder.fit_transform(X_train)
X_test_encoded = onehot_encoder.transform(X_test)

In [33]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 150, 350)
    learning_rate = trial.suggest_float('learning_rate', 0.03, 1.0, log=True)
    max_depth = trial.suggest_int('max_depth', 4, 8)
    max_features = trial.suggest_categorical('max_features', [1, 'sqrt', 'log2'])

    model = XGBClassifier(n_estimators=n_estimators,
                          learning_rate=learning_rate,
                          max_depth=max_depth,
                          max_features=max_features,
                          random_state=42)
    model.fit(X_train_encoded, y_train)
    return -model.score(X_test_encoded, y_test)

In [34]:
with mlflow.start_run():
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=20)

    best_params = study.best_params
    best_model = XGBClassifier(**best_params)
    best_model.fit(X_train_encoded, y_train)

    mlflow.log_params(best_params)
    mlflow.log_metric('accuracy', best_model.score(X_test_encoded, y_test))
    mlflow.sklearn.log_model(best_model, 'model')

[I 2024-06-03 22:09:31,723] A new study created in memory with name: no-name-65b6bb52-17d4-4499-aea5-2fb7475ce48a
Parameters: { "max_features" } are not used.

[I 2024-06-03 22:09:40,355] Trial 0 finished with value: -0.9048312697256615 and parameters: {'n_estimators': 191, 'learning_rate': 0.7889835793977172, 'max_depth': 6, 'max_features': 1}. Best is trial 0 with value: -0.9048312697256615.
Parameters: { "max_features" } are not used.

[I 2024-06-03 22:09:42,366] Trial 1 finished with value: -0.9159990288905074 and parameters: {'n_estimators': 279, 'learning_rate': 0.052506638012149116, 'max_depth': 7, 'max_features': 'sqrt'}. Best is trial 1 with value: -0.9159990288905074.
Parameters: { "max_features" } are not used.

[I 2024-06-03 22:09:44,251] Trial 2 finished with value: -0.9189123573682932 and parameters: {'n_estimators': 214, 'learning_rate': 0.035479668064152946, 'max_depth': 8, 'max_features': 'log2'}. Best is trial 2 with value: -0.9189123573682932.
Parameters: { "max_feat