In [1]:
from sklearn.model_selection import train_test_split
from mlModules import control
import pandas as pd
import catboost
import pickle
import json

ModuleNotFoundError: No module named 'sklearn'

In [2]:
def cutData(data_to_cut, lower=0, upper=20):
    """Keep only the rows whose "LT" value lies strictly between two bounds.

    Parameters
    ----------
    data_to_cut : pandas.DataFrame
        Frame containing an "LT" column.
    lower : number, optional
        Exclusive lower bound for "LT". Defaults to 0.
    upper : number, optional
        Exclusive upper bound for "LT". Defaults to 20.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data_to_cut`` with ``lower < LT < upper``. The original
        index labels are kept and the input frame is not modified.
    """
    lt_values = data_to_cut["LT"]
    # Both comparisons are strict, so rows sitting exactly on a bound are dropped.
    in_range = (lt_values > lower) & (lt_values < upper)
    return data_to_cut[in_range]

In [3]:
def split(data_to_split):
    """Partition a dataset into test, train and validation subsets.

    The data is first divided into a fitting part and a test part using
    ``control.fit_test_split`` as the train size; the fitting part is then
    divided into train and validation using ``control.train_valid_split``.
    Both splits use the same fixed random state (42).

    Returns
    -------
    tuple
        ``(test, train, valid)`` -- note that the test subset comes first.
    """
    seed = 42
    fit_part, test_part = train_test_split(
        data_to_split,
        train_size=control.fit_test_split,
        random_state=seed,
    )
    train_part, valid_part = train_test_split(
        fit_part,
        train_size=control.train_valid_split,
        random_state=seed,
    )
    return test_part, train_part, valid_part

In [4]:
def createModel(train_set, valid_set):
    """Build a CatBoostRegressor from ``control.model_params`` and fit it.

    Parameters
    ----------
    train_set
        Data indexed with ``control.x`` (features) and ``control.y`` (target)
        for training.
    valid_set
        Data indexed the same way and passed to ``fit`` as the ``eval_set``.

    Returns
    -------
    catboost.CatBoostRegressor
        The fitted model.
    """
    # Only these four entries of control.model_params are forwarded to CatBoost.
    hyperparams = {
        name: control.model_params[name]
        for name in ("learning_rate", "iterations", "early_stopping_rounds", "depth")
    }
    regressor = catboost.CatBoostRegressor(**hyperparams)

    train_features = train_set[control.x]
    train_target = train_set[control.y]
    categorical_columns = control.categorical
    validation_pair = (valid_set[control.x], valid_set[control.y])

    regressor.fit(
        train_features,
        train_target,
        cat_features=categorical_columns,
        verbose=50,
        eval_set=validation_pair,
    )
    return regressor

In [5]:
def createMetrics(model_to_use):
    """Read the best RMSE scores recorded by a fitted model.

    Parameters
    ----------
    model_to_use
        Object exposing ``get_best_score()`` that returns a nested mapping
        with ``['validation']['RMSE']`` and ``['learn']['RMSE']`` entries.

    Returns
    -------
    tuple
        ``(validation_rmse, learn_rmse)`` -- validation first, learn second.

    Raises
    ------
    KeyError
        If either the 'validation' or the 'learn' RMSE entry is missing.
    """
    # Query the model once so both metrics are taken from the same result.
    best_scores = model_to_use.get_best_score()
    validation_rmse = best_scores['validation']['RMSE']
    learn_rmse = best_scores['learn']['RMSE']
    return validation_rmse, learn_rmse

In [6]:
def createMetadata(data_to_use):
    """Average the numeric columns per (LOCID, LOCTO, PRDID) combination.

    Parameters
    ----------
    data_to_use : pandas.DataFrame
        Frame containing the grouping columns 'LOCID', 'LOCTO', 'PRDID' and
        the numeric columns 'MONTH', 'WEEKDAY', 'WEEKNUMBER', 'LT' and
        'IDATETIME'.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping keys as regular columns and the
        mean of every remaining numeric column except 'MONTH', 'WEEKDAY',
        'WEEKNUMBER', 'LT' and 'IDATETIME'.

    Raises
    ------
    KeyError
        If a grouping column or one of the columns to drop is absent.
    """
    group_keys = ['LOCID', 'LOCTO', 'PRDID']
    unwanted = ['MONTH', 'WEEKDAY', 'WEEKNUMBER', "LT", "IDATETIME"]
    # numeric_only=True: since pandas 2.0 the default mean() raises TypeError
    # on non-numeric columns instead of dropping them as pandas 1.x did.
    averaged = data_to_use.groupby(group_keys).mean(numeric_only=True).reset_index()
    return averaged.drop(columns=unwanted)

In [7]:
def serializeAndOutput(lrmse, vrmse, model, metadata):
    """Serialize the training results and send them through ``api``.

    Three messages are sent, in this order:

    * port "out"      -- JSON string with "Learn RMSE" and "Validation RMSE",
                         both values converted to strings;
    * port "modelOut" -- the pickled model;
    * port "testOut"  -- the pickled metadata.

    Everything is serialized before the first message is sent.
    """
    report = {"Learn RMSE": str(lrmse), "Validation RMSE": str(vrmse)}
    outgoing = [
        ("out", json.dumps(report)),
        ("modelOut", pickle.dumps(model)),
        ("testOut", pickle.dumps(metadata)),
    ]
    for port, payload in outgoing:
        api.send(port, payload)

In [8]:
def recieve(data):
    """Run the whole training pipeline on one incoming JSON payload.

    Steps: parse ``data`` with ``pd.read_json``, cast the columns listed in
    ``control.categorical`` to ``str``, print a preview, filter the rows with
    ``cutData``, split them, fit a model, collect its RMSE metrics, build the
    metadata frame and hand everything to ``serializeAndOutput``.

    Side effect: the parsed (and then the cast) frame is stored in the
    module-level global ``dataframe``.
    """
    global dataframe
    dataframe = pd.read_json(data)
    # Every categorical column is cast to the same "str" dtype.
    dtype_map = dict.fromkeys(control.categorical, "str")
    dataframe = dataframe.astype(dtype_map)
    print(dataframe.head())

    usable_rows = cutData(dataframe)
    # The test subset is returned by split() but is not used further here.
    _, train_part, valid_part = split(usable_rows)
    fitted_model = createModel(train_part, valid_part)
    validation_rmse, learn_rmse = createMetrics(fitted_model)
    # Metadata is computed from all filtered rows, not only the training part.
    group_metadata = createMetadata(usable_rows)
    serializeAndOutput(learn_rmse, validation_rmse, fitted_model, group_metadata)
    
# Register recieve as the callback for the 'in' port.
# NOTE(review): `api` is not defined or imported anywhere in the visible
# cells -- presumably it is injected by the hosting runtime; TODO confirm.
api.set_port_callback('in', recieve)

           ROTIME   QTY        LT    PRDID LOCTO LOCID      IDATETIME  \
0  20181015002218  1728  0.028241  1000389  1034  1001  1539560498000   
1  20181015002220  1728  0.027512  1000389  1034  1001  1539560563000   
2  20181015002219  1728  0.025081  1000389  1034  1001  1539560772000   
3  20181015002216  1728  0.024259  1000389  1034  1001  1539560840000   
4  20181015002218  1728  0.022801  1000389  1034  1001  1539560968000   

   WEEKDAY  WEEKNUMBER  MONTH  
0        6          41     10  
1        6          41     10  
2        6          41     10  
3        6          41     10  
4        6          41     10  
0:	learn: 0.6439081	test: 0.6466806	best: 0.6466806 (0)	total: 225ms	remaining: 3m 44s
50:	learn: 0.5518522	test: 0.5659085	best: 0.5659085 (50)	total: 5.28s	remaining: 1m 38s
100:	learn: 0.5325071	test: 0.5489516	best: 0.5489516 (100)	total: 10.6s	remaining: 1m 34s
150:	learn: 0.5270910	test: 0.5439126	best: 0.5439126 (150)	total: 16.1s	remaining: 1m 30s
200:	learn: