In [None]:
import io
import json
import pickle

import catboost
import pandas as pd
from sklearn.model_selection import train_test_split

from mlModules import control

In [None]:
def cutData(data_to_cut):
    """Return the input data unchanged.

    This is a hook for an optional row filter that is currently disabled.
    The expression it used is kept here for reference:

        data_to_cut[(data_to_cut["LT"] > 0) & (data_to_cut["LT"] < 20)]

    Args:
        data_to_cut: The dataset to (not) filter.

    Returns:
        The very same object that was passed in.
    """
    untouched = data_to_cut
    return untouched

In [None]:
def split(data_to_split, random_state=42):
    """Split a dataset into test, train and validation subsets.

    The data is first divided into a fitting part and a test part, using
    ``control.fit_test_split`` as ``train_size``. The fitting part is then
    divided into train and validation parts, using
    ``control.train_valid_split`` as ``train_size``.

    Args:
        data_to_split: Dataset accepted by ``train_test_split``.
        random_state: Seed handed to both splits. Defaults to 42, which
            reproduces the previously hard-coded behaviour.

    Returns:
        Tuple ``(test, train, valid)`` -- note that the test subset comes
        first.
    """
    fit, test = train_test_split(
        data_to_split,
        train_size=control.fit_test_split,
        random_state=random_state,
    )
    train, valid = train_test_split(
        fit,
        train_size=control.train_valid_split,
        random_state=random_state,
    )
    return test, train, valid

In [None]:
def createModel(train_set, valid_set):
    """Build and fit a CatBoost regressor.

    The hyperparameters ``learning_rate``, ``iterations``,
    ``early_stopping_rounds`` and ``depth`` are read from
    ``control.model_params``. Feature columns, the target column and the list
    of categorical features come from ``control.x``, ``control.y`` and
    ``control.categorical``.

    Args:
        train_set: Data indexable by ``control.x`` / ``control.y``; used for
            fitting.
        valid_set: Data indexable the same way; passed to ``fit`` as the
            evaluation set.

    Returns:
        The fitted ``catboost.CatBoostRegressor``.
    """
    hyperparameter_names = (
        "learning_rate",
        "iterations",
        "early_stopping_rounds",
        "depth",
    )
    hyperparameters = {
        name: control.model_params[name] for name in hyperparameter_names
    }
    regressor = catboost.CatBoostRegressor(**hyperparameters)

    regressor.fit(
        train_set[control.x],
        train_set[control.y],
        cat_features=control.categorical,
        verbose=50,
        eval_set=(valid_set[control.x], valid_set[control.y]),
    )
    return regressor

In [None]:
def createMetrics(model_to_use):
    """Return the best validation and learn RMSE reported by a model.

    Args:
        model_to_use: Object exposing ``get_best_score()``, which must return
            a mapping with ``'validation'`` and ``'learn'`` entries, each
            holding an ``'RMSE'`` value.

    Returns:
        Tuple ``(vrmse, lrmse)``: validation RMSE first, learn RMSE second.

    Raises:
        KeyError: If the ``'validation'`` / ``'learn'`` entry or its
            ``'RMSE'`` value is missing from the reported scores.
    """
    # Query the model once and reuse the result for both metrics instead of
    # calling get_best_score() separately for each lookup.
    best_scores = model_to_use.get_best_score()
    vrmse = best_scores['validation']['RMSE']
    lrmse = best_scores['learn']['RMSE']
    return vrmse, lrmse

In [None]:
def createMetadata(data_to_use):
    """Average the retained columns per (LOCID, LOCTO, PRDID) combination.

    The columns MONTH, WEEKDAY, WEEKNUMBER, LT and IDATETIME are excluded
    from the result. They are removed *before* aggregating so that a
    non-numeric one (for example a categorical column that was cast to str)
    cannot make ``mean()`` fail, and so that no means are computed only to be
    discarded.

    Args:
        data_to_use: pandas DataFrame containing the three key columns and
            the five excluded columns.

    Returns:
        DataFrame with one row per key combination: the key columns first,
        followed by the mean of every other remaining column.

    Raises:
        KeyError: If a key column or one of the excluded columns is missing.
    """
    excluded_columns = ['MONTH', 'WEEKDAY', 'WEEKNUMBER', "LT", "IDATETIME"]
    key_columns = ['LOCID', 'LOCTO', 'PRDID']
    trimmed = data_to_use.drop(excluded_columns, axis = 1)
    metadata = trimmed.groupby(key_columns).mean().reset_index()
    return metadata

In [None]:
def serializeAndOutput(lrmse, vrmse, model, metadata):
    """Serialize the training results and send them to the output ports.

    Three messages are sent through ``api.send``:

    * ``"out"``      -- JSON text with the learn and validation RMSE, both
      rendered as strings.
    * ``"modelOut"`` -- the pickled model.
    * ``"testOut"``  -- the pickled metadata.

    All payloads are built before the first message is sent.

    Args:
        lrmse: Learn RMSE.
        vrmse: Validation RMSE.
        model: Picklable model object.
        metadata: Picklable metadata object.
    """
    rmse_report = {"Learn RMSE" : str(lrmse), "Validation RMSE" : str(vrmse)}
    outgoing = (
        ("out", json.dumps(rmse_report)),
        ("modelOut", pickle.dumps(model)),
        ("testOut", pickle.dumps(metadata)),
    )
    for port_name, payload in outgoing:
        api.send(port_name, payload)

In [None]:
def recieve(data):
    """Train a model on incoming JSON data and emit the results.

    Steps: parse ``data`` into a DataFrame, cast the columns listed in
    ``control.categorical`` to str, print the first rows, run the data
    through ``cutData`` and ``split``, fit a model, collect its RMSE metrics,
    build the metadata table and send everything out through
    ``serializeAndOutput``.

    Side effect: the parsed, type-cast frame is stored in the module-level
    global ``dataframe``.

    Args:
        data: JSON input for ``pandas.read_json``. Literal JSON text is
            wrapped in a text buffer; any other value (buffer, path, ...) is
            handed to ``read_json`` unchanged.
    """
    global dataframe
    # Newer pandas releases no longer accept literal JSON text in read_json;
    # it has to be wrapped in a file-like object. Only strings that look like
    # a JSON document are wrapped so that a path/URL string keeps working.
    if isinstance(data, str) and data.lstrip()[:1] in ("{", "["):
        json_source = io.StringIO(data)
    else:
        json_source = data
    dataframe = pd.read_json(json_source)
    cat = {key: "str" for key in control.categorical}
    dataframe = dataframe.astype(cat)
    print(dataframe.head())
    processed_data = cutData(dataframe)
    test, train, valid = split(processed_data)
    model = createModel(train, valid)
    vrmse, lrmse = createMetrics(model)
    metadata = createMetadata(processed_data)
    serializeAndOutput(lrmse, vrmse, model, metadata)
    
# Register recieve() as the handler for the 'in' port.
# NOTE(review): `api` is neither defined nor imported in this file --
# presumably it is injected by the host runtime; confirm.
api.set_port_callback('in', recieve)