In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.impute  import SimpleImputer
import numpy as np
from sklearn.model_selection import train_test_split
from typing import Dict, List, Tuple
from sklearn.preprocessing import OrdinalEncoder
import json
from argparse import Namespace
from typing import Dict

import mlflow
import numpy as np
import optuna
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
import json
from argparse import Namespace
from typing import Dict
# import matplotlib.pyplot as plt
# mlflow.set_tracking_uri("file:///tmp/my_tracking")
mlflow.set_tracking_uri("http://localhost:8003")

In [2]:
def elapsed_years(df, var):
    # capture difference between year variable and year the house was sold
    df[var] = df['YrSold'] - df[var]
    return df


def get_data_splits(X: pd.Series, y: np.ndarray, train_size: float = 0.7) -> Tuple:
    """Generate balanced data splits.
    Args:
        X (pd.Series): input features.
        y (np.ndarray): encoded labels.
        train_size (float, optional): proportion of data to use for training. Defaults to 0.7.
    Returns:
        Tuple: data splits as Numpy arrays.
    """
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [3]:
valid_columns = [
            "LotArea",
            "OverallQual",
            "YearRemodAdd",
            "BsmtQual",
            "BsmtFinSF1",
            "TotalBsmtSF",
            "1stFlrSF",
            "2ndFlrSF",
            "GrLivArea",
            "GarageCars",
            "SalePrice",
            "YrSold"
        ]
def get_data():
    df = pd.read_csv('../data/train.csv')
    df =df[valid_columns]
    df = df[df.SalePrice.notnull()]  # drop rows w/ no tag

    # get columns as date, categorical and numerical types
    vars_dates = ['YearRemodAdd']
    vars_cat = [var for var in df.columns if df[var].dtypes == "O"]
    vars_num = [var for var in df.columns if df[var].dtypes != "O" and var not in ["Id"]]


    imputer = SimpleImputer(strategy='most_frequent')
    df[vars_num] = imputer.fit_transform(df[vars_num])

    imputer = SimpleImputer(strategy='constant', fill_value='missing')
    df[vars_cat] = imputer.fit_transform(df[vars_cat])

    for var in ['YearRemodAdd']:
        df = elapsed_years(df, var)

    ordinal_enc = OrdinalEncoder()
    df[vars_cat] = ordinal_enc.fit_transform(df[vars_cat])
    df.drop(columns=["YrSold"],inplace=True)


    X_train, X_val, X_test, y_train, y_val, y_test = get_data_splits(
            X=df[df.columns[~df.columns.isin(["SalePrice"])]].to_numpy(),
            y=df.SalePrice.to_numpy() / 1,
        )
    return X_train, X_val, X_test, y_train, y_val, y_test,ordinal_enc

In [4]:
def load_dict(filepath: str) -> Dict:
    """Load a dictionary from a JSON's filepath.
    Args:
        filepath (str): location of file.
    Returns:
        Dict: loaded JSON data.
    """
    with open(filepath, "r") as fp:
        d = json.load(fp)
    return d
args = Namespace(**load_dict(filepath='args_par.json'))

In [5]:
import argparse
def namespace_to_dict(namespace):
    return {
        k: namespace_to_dict(v) if isinstance(v, argparse.Namespace) else v
        for k, v in vars(namespace).items()
    }

In [6]:

# gbm = lgb.train(namespace_to_dict(args),
#                 lgb_train,
#                 num_boost_round=200,
#                 valid_sets=lgb_eval,
#                 callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [7]:
# gbm.predict(X_test, num_iteration=gbm.best_iteration)*100000

# x_ax = range(len(y_test))
# plt.figure(figsize=(12, 6))
# plt.plot(x_ax, y_test*1, label="original")
# plt.plot(x_ax, gbm.predict(X_test, num_iteration=gbm.best_iteration)*1, label="predicted")
# plt.title("Boston dataset test and predicted data")
# plt.xlabel('X')
# plt.ylabel('Price')
# plt.legend(loc='best',fancybox=True, shadow=True)
# plt.grid(True)
# plt.show()  

In [8]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error,median_absolute_error

In [9]:
# y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

def get_metrics(y_true,y_pred):
#     median_absolute_error=median_absolute_error(y_test, y_pred)
#     mean_absolute_percentage_error=mean_absolute_percentage_error(y_test, y_pred)
    mean_squared_error_=mean_squared_error(y_true, y_pred)
#     mean_absolute_error=mean_absolute_error(y_test, y_pred)
    r2_score_=r2_score(y_true, y_pred, multioutput='variance_weighted')
    return (mean_squared_error_,r2_score_)

def get_eval_metric(y_true,y_pred):
    metrics = {"overall": {}}

    # Overall metrics
    mse,r2 = get_metrics(y_true, y_pred)

    metrics["overall"]["MSE"] = mse
    metrics["overall"]["R2"] = r2
    metrics["overall"]["num_samples"] = np.float64(len(y_true))

    return metrics

In [10]:

def train(args: Namespace, trial: optuna.trial._trial.Trial = None) -> Dict:
    """Train model on data.
    Args:
        args (Namespace): arguments to use for training.
        df (pd.DataFrame): data for training.
        trial (optuna.trial._trial.Trial, optional): optimization trial. Defaults to None.
    Raises:
        optuna.TrialPruned: early stopping of trial if it's performing poorly.
    Returns:
        Dict: artifacts from the run.
    """

    X_train, X_val, X_test, y_train, y_val, y_test,ordinal_enc = get_data()
        
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # Training
    model = lgb.train(namespace_to_dict(args),
                lgb_train,
                num_boost_round=200,
                valid_sets=lgb_eval,
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
    
  
    train_mse,train_r2=get_metrics(y_train,model.predict(X_train, num_iteration=model.best_iteration))
    val_mse,val_r2=get_metrics(y_val,model.predict(X_val, num_iteration=model.best_iteration))
    
   
    print(f"train_mse: {train_mse:.5f}, " f"train_r2: {train_r2:.5f}," f"val_mse: {val_mse:.5f}, " f"val_r2: {val_r2:.5f}")

    # Log
    if not trial:
        mlflow.log_metrics({"train_mse": train_mse, "val_mse": val_mse,"train_r2": train_r2, "val_r2":val_r2})

    # Pruning (for optimization in next section)
    if trial:  # pragma: no cover, optuna pruning
        trial.report(val_mse, val_r2)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Threshold
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    performance = get_eval_metric(y_true=y_test, y_pred=y_pred)

    return {
        "args": args,
        "model": model,
        "performance": performance,
        "ordinal_enc": ordinal_enc,
    }


def objective(args: Namespace, trial: optuna.trial._trial.Trial) -> float:
    """Objective function for optimization trials.
    Args:
        args (Namespace): arguments to use for training.
        df (pd.DataFrame): data for training.
        trial (optuna.trial._trial.Trial, optional): optimization trial.
    Returns:
        float: metric value to be used for optimization.
    """
    # Parameters to tune
    args.learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 1e0)
    args.power_t = trial.suggest_uniform("power_t", 0.1, 0.5)

    # Train & evaluate
    artifacts = train(args=args, trial=trial)

    # Set additional attributes
    overall_performance = artifacts["performance"]["overall"]
#     logger.info(json.dumps(overall_performance, indent=2))
    trial.set_user_attr("R2", overall_performance["R2"])

    return overall_performance["R2"]

In [11]:
#   "boosting_type": "gbdt",
#     "objective": "regression",
#     "metric": [
#         "l2",
#         "l1"
#     ],
#     "num_leaves": 50,
#     "learning_rate": 0.05,
#     "feature_fraction": 0.9,
#     "bagging_fraction": 0.8,
#     "bagging_freq": 10,
#     "verbose": -1
def save_dict(d: Dict, filepath: str, cls=None, sortkeys: bool = False) -> None:
    """Save a dictionary to a specific location.
    Args:
        d (Dict): data to save.
        filepath (str): location of where to save the data.
        cls (optional): encoder to use on dict data. Defaults to None.
        sortkeys (bool, optional): whether to sort keys alphabetically. Defaults to False.
    """
    with open(filepath, "w") as fp:
        json.dump(d, indent=2, fp=fp, cls=cls, sort_keys=sortkeys)
        fp.write("\n")       


In [101]:
# train(Namespace(**load_dict(filepath='args_par.json')))

In [102]:
# mlflow.end_run()

In [12]:
# Logger
import logging,sys
from pathlib import Path
from rich.logging import RichHandler
LOGS_DIR = Path("./", "logs")
LOGS_DIR.mkdir(parents=True, exist_ok=True)

logging_config = {
    "version": 1,
    "disabel_existing_loggers": False,
    "formatters": {
        "minimal": {"format": "%(message)s"},
        "detailed": {
            "format": "%(levelname)s %(asctime)s [%(name)s:%(filename)s:%(funcName)s:%(lineno)d]\n%(message)s\n"
        },
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "stream": sys.stdout,
            "formatter": "minimal",
            "level": logging.DEBUG,
        },
        "info": {
            "class": "logging.handlers.RotatingFileHandler",
            "filename": Path(LOGS_DIR, "info.log"),
            "maxBytes": 10485760,  # 1 MB
            "backupCount": 10,
            "formatter": "detailed",
            "level": logging.INFO,
        },
        "error": {
            "class": "logging.handlers.RotatingFileHandler",
            "filename": Path(LOGS_DIR, "error.log"),
            "maxBytes": 10485760,  # 1 MB
            "backupCount": 10,
            "formatter": "detailed",
            "level": logging.ERROR,
        },
    },
    "root": {
        "handlers": ["console", "info", "error"],
        "level": logging.INFO,
        "propagate": True,
    },
}

logging.config.dictConfig(logging_config)
logger = logging.getLogger()
logger.handlers[0] = RichHandler(markup=True)

In [13]:
from pathlib import Path
import tempfile
from numpyencoder import NumpyEncoder
from optuna.integration.mlflow import MLflowCallback
import joblib
def train_model(
    args_fp: str = "./args_par.json",
    experiment_name: str = "baselines_1",
    run_name: str = "gbr",
    test_run: bool = False,
) -> None:
    """Train a model given arguments.
    Args:
        args_fp (str): location of args.
        experiment_name (str): name of experiment.
        run_name (str): name of specific run in experiment.
        test_run (bool, optional): If True, artifacts will not be saved. Defaults to False.
    """
    # Load labeled data
#     projects_fp = Path(config.DATA_DIR, "train_cleaned.json")
#     projects = util.load_dict(filepath=projects_fp)
#     df = pd.DataFrame(projects)

    # Train
    args = Namespace(**load_dict(filepath='args_par.json'))
#     namespace_to_dict(args)
    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        run_id = mlflow.active_run().info.run_id
        logger.info(f"Run ID: {run_id}")
        artifacts = train(args=args)
        performance = artifacts["performance"]
        logger.info(json.dumps(performance, indent=2))

        # Log metrics and parameters
        performance = artifacts["performance"]
#         print(performance)
        mlflow.log_metrics({"MSE": performance["overall"]["MSE"]})
        mlflow.log_metrics({"R2": performance["overall"]["R2"]})
        mlflow.log_params(vars(artifacts["args"]))

        # Log artifacts
        with tempfile.TemporaryDirectory() as dp:
            save_dict(vars(artifacts["args"]), Path(dp, "args_par.json"), cls=NumpyEncoder)
            joblib.dump(artifacts["model"], Path(dp, "model.pkl"))
            save_dict(performance, Path(dp, "performance.json"))
            joblib.dump(artifacts["ordinal_enc"], Path(dp, "ordinal_enc.pkl"))
            mlflow.log_artifacts(dp)

    # Save to config
    if not test_run:  # pragma: no cover, actual run
        open(Path("./", "run_id.txt"), "w").write(run_id)
        save_dict(performance, Path("./", "performance.json"))


def optimize(
    args_fp: str = "./args_par.json",
    study_name: str = "optimization",
    num_trials: int = 20,
) -> None:
    """Optimize hyperparameters.
    Args:
        args_fp (str): location of args.
        study_name (str): name of optimization study.
        num_trials (int): number of trials to run in study.
    """
    # Load labeled data
#     projects_fp = Path(config.DATA_DIR, "train_cleaned.json")
#     projects = util.load_dict(filepath=projects_fp)
#     df = pd.DataFrame(projects)

    # Optimize
    args = Namespace(**load_dict(filepath=args_fp))
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name=study_name, direction="maximize", pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="R2")
    study.optimize(
        lambda trial: objective(args, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # Best trial
    trials_df = study.trials_dataframe()
    # print(trials_df.head())
    trials_df = trials_df.sort_values(["user_attrs_R2"], ascending=False)
    args = {**args.__dict__, **study.best_trial.params}
    save_dict(d=args, filepath=args_fp, cls=NumpyEncoder)
    logger.info(f"\nBest value (R2): {study.best_trial.value}")
    logger.info(f"Best hyperparameters: {json.dumps(study.best_trial.params, indent=2)}")


def load_artifacts(run_id: str = None) -> Dict:
    """Load artifacts for a given run_id.
    Args:
        run_id (str): id of run to load artifacts from.
    Returns:
        Dict: run's artifacts.
    """
    if not run_id:
        run_id = open(Path("./", "run_id.txt")).read()

    # Locate specifics artifacts directory
    experiment_id = mlflow.get_run(run_id=run_id).info.experiment_id
    artifacts_dir = Path("./", experiment_id, run_id, "artifacts")

    # Load objects from run
    args = Namespace(**load_dict(filepath=Path(artifacts_dir, "args.json")))
    model = joblib.load(Path(artifacts_dir, "model.pkl"))
    ordinal_enc = joblib.load(Path(artifacts_dir, "ordinal_enc.pkl"))
    performance = load_dict(filepath=Path(artifacts_dir, "performance.json"))

    return {
        "args": args,
        "model": model,
        "performance": performance,
        "ordinal_enc": ordinal_enc,
    }


In [106]:
# train_model()
# optimize('./args_par.json', study_name="optimization", num_trials=20)

  mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="R2")
  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[11]	valid_0's l2: 6.08336e+08	valid_0's l1: 17750.1
train_mse: 678835539.86193, train_r2: 0.89874,val_mse: 608336157.41514, val_r2: 0.89266


  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[11]	valid_0's l2: 1.30938e+09	valid_0's l1: 21316.4
train_mse: 670296052.33126, train_r2: 0.89591,val_mse: 1309380474.42838, val_r2: 0.78946


  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[8]	valid_0's l2: 1.03335e+09	valid_0's l1: 19692.5
train_mse: 870320833.33549, train_r2: 0.85998,val_mse: 1033348640.74970, val_r2: 0.87414


  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[42]	valid_0's l2: 1.02837e+09	valid_0's l1: 18989.3
train_mse: 612708835.95189, train_r2: 0.90419,val_mse: 1028372980.91414, val_r2: 0.83631


  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[30]	valid_0's l2: 8.82562e+08	valid_0's l1: 18874.4
train_mse: 624644980.46993, train_r2: 0.90317,val_mse: 882562431.68685, val_r2: 0.86728


  mode = stats.mode(array)
  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[20]	valid_0's l2: 4.92324e+08	valid_0's l1: 16132.6
train_mse: 654839294.91838, train_r2: 0.89865,val_mse: 492324269.68615, val_r2: 0.90394
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[66]	valid_0's l2: 1.08312e+09	valid_0's l1: 17376.2
train_mse: 568014459.03928, train_r2: 0.90572,val_mse: 1083115974.97625, val_r2: 0.86750
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[5]	valid_0's l2: 1.42837e+09	valid_0's l1: 21943.9
train_mse: 573630599.05524, train_r2: 0.89927,val_mse: 1428371758.63743, val_r2: 0.79082


  mode = stats.mode(array)
  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[4]	valid_0's l2: 9.4875e+08	valid_0's l1: 20879.7
train_mse: 697411224.11578, train_r2: 0.88298,val_mse: 948749670.53462, val_r2: 0.81703
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Early stopping, best iteration is:
[9]	valid_0's l2: 1.44426e+09	valid_0's l1: 22707.4
train_mse: 435924797.38536, train_r2: 0.92676,val_mse: 1444261069.90996, val_r2: 0.80741
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Early stopping, best iteration is:
[180]	valid_0's l2: 7.38553e+08	valid_0's l1: 17758.1
train_mse: 711354862.80805, train_r2: 0.88973,val_mse: 738553165.06609, val_r2: 0.89525
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Did not meet early stopping. Best iteration is:
[200]	valid_0's l2: 6.85033e+08	valid_0's l1: 18158.8
train_mse: 722557472.73892, train_r2: 0.89079,val_mse: 685033016.75769, val_r2: 0.87753
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Early stopping, best iteration is:
[140]	valid_0's l2: 9.77465e+08	valid_0's l1: 18894.7
train_mse: 600827273.53761, train_r2: 0.90425,val_mse: 977464807.91228, val_r2: 0.86594
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Early stopping, best iteration is:
[93]	valid_0's l2: 5.80588e+08	valid_0's l1: 15978.2
train_mse: 601642385.63611, train_r2: 0.91240,val_mse: 580587827.50114, val_r2: 0.88554
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Did not meet early stopping. Best iteration is:
[200]	valid_0's l2: 1.75494e+09	valid_0's l1: 21410.3
train_mse: 589209642.07984, train_r2: 0.89822,val_mse: 1754938184.38561, val_r2: 0.77564
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Early stopping, best iteration is:
[38]	valid_0's l2: 1.16804e+09	valid_0's l1: 19232.6
train_mse: 479486925.64499, train_r2: 0.92021,val_mse: 1168036712.36769, val_r2: 0.85561
Training until validation scores don't improve for 5 rounds


  mode = stats.mode(array)


Early stopping, best iteration is:
[133]	valid_0's l2: 7.00143e+08	valid_0's l1: 18773.4
train_mse: 623822566.51740, train_r2: 0.90887,val_mse: 700142528.73687, val_r2: 0.86065
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[17]	valid_0's l2: 1.10306e+09	valid_0's l1: 18894.5
train_mse: 715673905.82405, train_r2: 0.89183,val_mse: 1103057796.40636, val_r2: 0.77390


  mode = stats.mode(array)
  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[24]	valid_0's l2: 6.59571e+08	valid_0's l1: 17725.2
train_mse: 650023560.68658, train_r2: 0.89648,val_mse: 659571087.28820, val_r2: 0.87056


  mode = stats.mode(array)


Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[92]	valid_0's l2: 1.48507e+09	valid_0's l1: 21288.5
train_mse: 439579269.16477, train_r2: 0.92795,val_mse: 1485073362.05660, val_r2: 0.80967


In [14]:
from typing import Dict, List
from pathlib import Path
import mlflow, joblib
# CONFIG_DIR = Path(BASE_DIR, "config")



def load_artifacts(run_id: str = None) -> Dict:
    """Load artifacts for a given run_id.
    Args:
        run_id (str): id of run to load artifacts from.
    Returns:
        Dict: run's artifacts.
    """
    if not run_id:
        run_id = open(Path('../config', "run_id.txt")).read()

    # Locate specifics artifacts directory
    experiment_id = mlflow.get_run(run_id=run_id).info.experiment_id
    artifacts_dir = Path('../stores/model', experiment_id, run_id, "artifacts")

    # Load objects from run
    args = Namespace(**load_dict(filepath=Path(artifacts_dir, "args.json")))
    model = joblib.load(Path(artifacts_dir, "model.pkl"))
    ordinal_enc = joblib.load(Path(artifacts_dir, "ordinal_enc.pkl"))
    performance = load_dict(filepath=Path(artifacts_dir, "performance.json"))

    return {
        "args": args,
        "model": model,
        "performance": performance,
        "ordinal_enc": ordinal_enc,
    }


In [16]:
artifacts = load_artifacts()
# !ls ../

In [31]:
def predict(texts: List, artifacts: Dict) -> List:
    """Predict tags for given texts.
    Args:
        texts (List): raw input texts to classify.
        artifacts (Dict): artifacts from a run.
    Returns:
        List: predictions for input texts.
    """
    # x = texts
    # ordinal_encode = Path(config.DATA_DIR, "encode_cat.pkl")
    # enc = utilload_ordinal_encoding(ordinal_encode)

    # # artifacts["vectorizer"].transform(texts)
    # predict(x)
    print(texts)
#     for x in texts:
    print(artifacts["ordinal_enc"].transform([[texts[3]]])[0][0])
#         x[3] = artifacts["ordinal_enc"].transform([[x[3]]])[0][0]
#         x[3] = artifacts["ordinal_enc"].transform([[x[3]]])
    print(texts)
    y_pred = custom_predict(
        y_pred=artifacts["model"].predict(texts),
    )
    # tags = artifacts["label_encoder"].decode(y_pred)
    predictions = [
        {
            "input_text": texts[i],
            "predicted_tag": y_pred[i],
        }
        for i in range(len(y_pred))
    ]
    return predictions

In [32]:
predict([11250,7,6,"TA",496,920,920,866,1786,2], artifacts)

[11250, 7, 6, 'TA', 496, 920, 920, 866, 1786, 2]
3.0
[11250, 7, 6, 'TA', 496, 920, 920, 866, 1786, 2]


NameError: name 'custom_predict' is not defined

In [9]:
################

In [10]:
def get_data_splits(X: pd.Series, y: np.ndarray, train_size: float = 0.7) -> Tuple:
    """Generate balanced data splits.
    Args:
        X (pd.Series): input features.
        y (np.ndarray): encoded labels.
        train_size (float, optional): proportion of data to use for training. Defaults to 0.7.
    Returns:
        Tuple: data splits as Numpy arrays.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size)
    
    return X_train, X_val, y_train, y_val

In [11]:
import optuna
from optuna import Trial
def objective(trial: Trial, fast_check=False, target_meter=0, return_info=False):
    folds = 5
    seed = 142
    shuffle = False
    kf = KFold(n_splits=folds, shuffle=shuffle, random_state=seed)
    X_train, X_val, X_test, y_train, y_val, y_test = get_data()
    y_valid_pred_total = np.zeros(X_train.shape[0])
    gc.collect()
#     print('target_meter', target_meter, X_train.shape)
#     L = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
#     categorical_features = L
#     print('cat_features', categorical_features)
    models = []
    valid_score = 0
    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx,:], y_train[train_idx]
        valid_data = X_train.iloc[valid_idx,:], y_train[valid_idx]
        print('train', len(train_idx), 'valid', len(valid_idx))
        a, b, c = fit_lgbm(trial, train_data, valid_data, cat_features=category_cols,
                                            num_rounds=1000)
        model, y_pred_valid, log = a, b, c
        y_valid_pred_total[valid_idx] = y_pred_valid
        models.append(model)
        gc.collect()
        valid_score += log["valid/l2"]
        if fast_check:
            break
    valid_score /= len(models)
    if return_info:
        return valid_score, models, y_pred_valid, y_train
    else:
        return valid_score

In [12]:
# Categorical parameter
optimizer = trial.suggest_categorical('optimizer', ['MomentumSGD', 'Adam'])

# Int parameter
num_layers = trial.suggest_int('num_layers', 1, 3)

# Uniform parameter
dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 1.0)

# Loguniform parameter
learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)

# Discrete-uniform parameter
drop_path_rate = trial.suggest_discrete_uniform('drop_path_rate', 0.0, 1.0, 0.1)

NameError: name 'trial' is not defined

In [13]:
def fit_lgbm(trial, train, val, devices=(-1,), seed=None, cat_features=None, num_rounds=1500):
    """Train Light GBM model"""
    X_train, y_train = train
    X_valid, y_valid = val
    metric = 'l2'
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'objective': 'regression',
        'max_depth': -1,
        'learning_rate': 0.1,
        "boosting": "gbdt",
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        "bagging_freq": 5,
        "bagging_fraction": trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        "metric": metric,
        "verbosity": -1,
    }
    device = devices[0]
    if device == -1:
        # use cpu
        pass
    else:
        # use gpu
        print(f'using gpu device_id {device}...')
        params.update({'device': 'gpu', 'gpu_device_id': device})

    params['seed'] = seed

    early_stop = 20
    verbose_eval = 20

    d_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
    d_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)
    watchlist = [d_train, d_valid]

    print('training LGB:')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)

    # predictions
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    
    print('best_score', model.best_score)
    log = {'train/l2': model.best_score['training']['l2'],
           'valid/l2': model.best_score['valid_1']['l2']}
    return model, y_pred_valid, log

In [None]:
###########################

In [19]:
X_train,  X_test, y_train,  y_test = train_test_split(
        df[df.columns[~df.columns.isin(["SalePrice"])]].to_numpy(),
        df.SalePrice.to_numpy()
    )

In [None]:
X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [20]:
X_train,  X_test, y_train,  y_test = train_test_split(
        X_test,y_test
        
    )

In [41]:
### get ordinal encode dict
from sklearn.preprocessing import OrdinalEncoder

In [16]:
df['BsmtQual'].unique()

array(['Gd', 'TA', 'Ex', nan, 'Fa'], dtype=object)

In [44]:
enc = OrdinalEncoder()
df['tr']=enc.fit_transform(df[['BsmtQual']])



In [45]:
df.head()

Unnamed: 0,LotArea,OverallQual,YearRemodAdd,BsmtQual,BsmtFinSF1,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageCars,SalePrice,YrSold,tr
0,8450,7,5,Gd,706,856,856,854,1710,2,208500,2008,2.0
1,9600,6,31,Gd,978,1262,1262,0,1262,2,181500,2007,2.0
2,11250,7,6,Gd,486,920,920,866,1786,2,223500,2008,2.0
3,9550,7,36,TA,216,756,961,756,1717,3,140000,2006,3.0
4,14260,8,8,Gd,655,1145,1145,1053,2198,3,250000,2008,2.0


In [46]:
dir(enc)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_X',
 '_check_n_features',
 '_fit',
 '_get_feature',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_transform',
 '_validate_data',
 'categories',
 'categories_',
 'dtype',
 'fit',
 'fit_transform',
 'get_params',
 'handle_unknown',
 'inverse_transform',
 'set_params',
 'transform',
 'unknown_value']

In [52]:
enc._get_tags()

{'non_deterministic': False,
 'requires_positive_X': False,
 'requires_positive_y': False,
 'X_types': ['categorical'],
 'poor_score': False,
 'no_validation': False,
 'multioutput': False,
 'allow_nan': False,
 'stateless': False,
 'multilabel': False,
 '_skip_test': False,
 '_xfail_checks': False,
 'multioutput_only': False,
 'binary_only': False,
 'requires_fit': True,
 'preserves_dtype': [numpy.float64],
 'requires_y': False,
 'pairwise': False}