In [None]:
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes

import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
import mlflow

In [None]:
# Workaround: if the next cell fails because config.json cannot be found,
# changing the working directory may fix it. The <alias> and <dir> segments
# in the path are placeholders and must be replaced with real values first.
import os
project_dir = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/shuitcpueastus01/code/Users/<alias>/<dir>"
os.chdir(project_dir)

In [None]:
# Prerequisite: prepare config.json and place it at the repository root
# before running this cell; MLClient.from_config is used to build the client.
credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
ml_client = MLClient.from_config(credential=credential, logging_enable=True)

In [None]:
# Look up the workspace that ml_client is bound to and use its MLflow
# tracking URI as the tracking destination for this notebook.
workspace = ml_client.workspaces.get(ml_client.workspace_name)
azureml_mlflow_uri = workspace.mlflow_tracking_uri
mlflow.set_tracking_uri(azureml_mlflow_uri)

In [None]:
# Name of the MLflow experiment set as active for this notebook.
experiment_name = "nyc_taxi_regression_notebook"
mlflow.set_experiment(experiment_name)

In [None]:
# Load the pre-split train/test CSV files, then separate the label column
# ("totalAmount") from the remaining columns, which are used as features.
target_column = "totalAmount"

train = pd.read_csv("data/nyc_taxi_train_dataset.csv")
test = pd.read_csv("data/nyc_taxi_test_dataset.csv")

# Boolean mask over the column index keeps every column except the label.
x_train = train.loc[:, train.columns != target_column]
y_train = train[target_column]

x_test = test.loc[:, test.columns != target_column]
y_test = test[target_column]

In [None]:
# Hyperparameters handed to lightgbm.train in the training cell.
params = dict(
    boosting_type="gbdt",
    metric="rmse",
    learning_rate=0.1,
    num_leaves=10,
    min_data_in_leaf=2,
    num_iteration=100,
    task="train",
    objective="regression",
)

In [None]:
# Train a LightGBM regressor inside an MLflow run. Autologging is enabled with
# registered_model_name="nyc_taxi_regressor_lightgbm" so the trained model is
# registered under that name.
with mlflow.start_run() as run:
    mlflow.lightgbm.autolog(registered_model_name="nyc_taxi_regressor_lightgbm")
    train_dataset = lightgbm.Dataset(x_train, y_train)
    # reference=train_dataset ties the validation data to the training Dataset.
    test_dataset = lightgbm.Dataset(x_test, y_test, reference=train_dataset)
    # The number of boosting rounds is taken from params["num_iteration"]:
    # LightGBM gives that entry precedence over num_boost_round, so the former
    # num_boost_round=50 argument had no effect and has been dropped.
    # Early stopping is requested through a callback because the
    # early_stopping_rounds keyword of lightgbm.train was removed in LightGBM 4.0.
    gbm = lightgbm.train(
        params,
        train_dataset,
        valid_sets=[test_dataset],
        callbacks=[lightgbm.early_stopping(stopping_rounds=10)],
    )