In [1]:
import hsfs
from hops import pandas_helper as pd_helper
import pandas as pd
from hops import hdfs
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
14,application_1605687609616_0002,pyspark,idle,Link,Link


SparkSession available as 'spark'.


## Connect to Feature Store And Get Reference to Training Dataset

In [2]:
import hsfs
# Open a feature store connection; no arguments are passed, so hsfs falls back to its defaults.
connection = hsfs.connection()
# No feature store name is given here; presumably this returns the project's own store (TODO confirm).
fs = connection.get_feature_store()
# Reference to version 1 of the "real_estate_price" training dataset, used by the cells below.
td = fs.get_training_dataset("real_estate_price", version=1)

Connected. Call `.close()` to terminate connection gracefully.

### Inspect Training Dataset Metadata

In [3]:
# Path under which the training dataset is stored; the split directories are listed relative to it below.
td.location

'hopsfs://10.0.0.247:8020/Projects/dataai/dataai_Training_Datasets/real_estate_price_1'

In [4]:
# Split names mapped to their fractions; "train", "test" and "validate" are read from sub-directories below.
td.splits

{'test': 0.2, 'train': 0.7, 'validate': 0.1}

In [5]:
# Label column name(s), returned as a list; used below to separate the features from the target.
td.label

['sale_price']

In [6]:
# Data format of the training dataset files.
td.data_format

'csv'

## Define Training Function And Tune Model

In [None]:
# Paths of the train/test split files: every entry of the split directory whose path contains ".csv".
# Both lists are notebook-level names that `train` reads when it runs.
train_files = [path for path in hdfs.ls(td.location + "/train") if ".csv" in path]
test_files = [path for path in hdfs.ls(td.location + "/test") if ".csv" in path]

def train(max_depth, lr, n_estimators):
    """Fit and evaluate one gradient-boosting regressor for a single trial.

    Reads the CSV files listed in the notebook-level ``train_files`` and
    ``test_files``, fits a ``GradientBoostingRegressor`` with the given
    hyperparameters, prints the test-set metrics and writes the fitted model
    as ``model.pkl`` into the directory returned by ``tensorboard.logdir()``.

    Args:
        max_depth: Passed to ``GradientBoostingRegressor(max_depth=...)``.
        lr: Passed to ``GradientBoostingRegressor(learning_rate=...)``.
        n_estimators: Passed to ``GradientBoostingRegressor(n_estimators=...)``.

    Returns:
        dict: Test-set metrics under the keys ``"mse"``, ``"rmse"`` and ``"r2"``.
    """
    # Imports are kept inside the function, as in the original cell
    # (presumably because maggy runs it outside the notebook kernel — TODO confirm).
    from hops import pandas_helper as pd_helper
    import pandas as pd
    from hops import hdfs
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_squared_error
    from maggy import tensorboard
    import joblib

    def load_split(files):
        """Read all CSV files of one split and return (features, label) frames."""
        split_df = pd.concat((pd_helper.read_csv(f) for f in files))
        return split_df.drop(td.label, axis=1), split_df[td.label]

    X_train, y_train = load_split(train_files)
    X_test, y_test = load_split(test_files)

    # random_state is fixed so trials differ only by their hyperparameters.
    reg = GradientBoostingRegressor(
        max_depth=max_depth,
        learning_rate=lr,
        n_estimators=n_estimators,
        random_state=0,
    )
    # td.label is a list, so y_train is a one-column DataFrame; fit() expects a 1-D target.
    reg.fit(X_train, y_train.values.ravel())

    predictions = reg.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    # Derive RMSE from the MSE instead of calling mean_squared_error(..., squared=False):
    # that argument is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mse ** 0.5
    r2 = reg.score(X_test, y_test)

    print("MSE: {}".format(mse))
    print("RMSE: {}".format(rmse))
    print("R2: {}".format(r2))

    # Persist the fitted model next to the trial's other artifacts.
    h = hdfs.get_fs()
    with h.open_file(tensorboard.logdir() + "/model.pkl", "w") as f:
        joblib.dump(reg, f)

    return {"mse": mse, "rmse": rmse, "r2": r2}

### Define Searchspace

In [None]:
from maggy import Searchspace

# Hyperparameter search space; the keyword names match the parameters of `train`.
# Each entry is a (type, values) pair as expected by maggy's Searchspace.
sp = Searchspace(
    max_depth=('INTEGER', [2, 10]),
    lr=('DISCRETE', [0.1, 0.01, 0.001, 0.0001]),
    n_estimators=('INTEGER', [50, 500]),
)

### Launch Experiment

In [None]:
from maggy import experiment
# Run `train` for 10 trials drawn from `sp` with the 'randomsearch' optimizer, minimizing the
# "mse" entry of the dict that `train` returns. es_policy='none' presumably disables early
# stopping (TODO confirm against the maggy documentation).
# `result` is used further down when the model is exported (result["best_val"]).
result = experiment.lagom(train, 
                           searchspace=sp, 
                           optimizer='randomsearch', 
                           direction='min',
                           num_trials=10, 
                           name='real_estate_price',
                           es_policy='none',
                           optimization_key="mse"
                          )

In [10]:
from hops import model, serving
# Name under which the model is exported and later looked up.
MODEL_NAME = "real_estate_pricing"
# Metric key attached to the exported model and used to select the best version.
EVALUATION_METRIC = "mse"
# NOTE(review): hard-coded experiment directory. Its application id (application_1605687609616_0001_2)
# differs from the one reported for this session (application_1605687609616_0002), so it presumably
# points at an earlier run — confirm it holds the model.pkl that should be exported, or derive the
# path from the experiment result instead.
model_path = "Experiments/application_1605687609616_0001_2/754f9bd9e2eadbc4"

In [None]:
# Copy the serving script into the model directory (overwriting any existing copy); the serving
# cell below expects to find it inside the exported model version.
hdfs.cp("Jupyter/real_estate_pricing_serving.py", model_path + "/real_estate_pricing_serving.py", overwrite=True)

In [None]:
# Export the contents of model_path under MODEL_NAME, attaching the experiment's best value as
# the EVALUATION_METRIC. Requires `result` from the experiment.lagom cell.
model.export(model_path, MODEL_NAME, metrics={EVALUATION_METRIC: result["best_val"]})

In [None]:
# Look up the exported version of MODEL_NAME with the minimal EVALUATION_METRIC.
real_estate_model = model.get_best_model(MODEL_NAME, EVALUATION_METRIC, model.Metric.MIN)
# Location of the serving script inside that model version's directory.
script_path = "Models/" + MODEL_NAME + "/" + str(real_estate_model["version"]) + "/real_estate_pricing_serving.py"
# Create the "RealEstateServing" deployment for that version, or update it if it already exists
# (as the function name suggests — TODO confirm).
serving.create_or_update(script_path, "RealEstateServing", model_version=real_estate_model['version'], serving_type="SKLEARN")

In [8]:
# Load the "validate" split: keep the directory entries whose path contains ".csv",
# read each one and stack the frames into a single DataFrame.
val_files = [entry for entry in hdfs.ls(td.location + "/validate") if ".csv" in entry]
val_df = pd.concat([pd_helper.read_csv(csv_path) for csv_path in val_files])

# Separate the label column(s) from the feature columns.
y_val = val_df[td.label]
X_val = val_df.drop(columns=td.label)