In [4]:
def prepare_data_for_modelling(df, target, predictors, train_ratio=0.7, scaler=StandardScaler()):

    ''' Prepare the dataset for the modelling - we do the train/test split (the first x% of observations go to train dataset, the rest to test dataset), 
        standardize the predictors and return objects ready for the modelling.

    Input:
    df (DataFrame): Dataframe with all features to be considered in model creation
    target (string): Name of the target variable 
    train_ratio: the proportion of observations to be assigned to the train dataset (first x observations). Default - 70/30 split, as in the original work in the case of CNN. 
    scaler: the scaler used to scale the predictors. Default - standard scaler used in the original solution.

    Output:
    Objects ready to be used in the modelling (X/y train/test).

    Notes:
    We use the chronological assignment to the train/test datasets, not the random sampling, as we deal with time series data.
    '''
    
    # divide data into train and test - we do chronologically as in the case of CNN in previous analysis
    train = df[:int(train_ratio * df.shape[0])]
    test = df[int(train_ratio * df.shape[0]):]
    
    # scale the predictors - train and test separately!
    X_train = scaler.fit_transform(train[predictors])
    X_test = scaler.fit_transform(test[predictors])

    # assign the target to separate objects - according to standards used in Python DS
    y_train = train[target]
    y_test = test[target]
    
    return X_train, y_train, X_test, y_test 

NameError: name 'StandardScaler' is not defined

In [None]:
def create_model_evaluate( X_train, y_train, X_test, y_test, 
                           eval_metric = mean_absolute_percentage_error,
                           model=XGBRegressor, 
                           model_params={'objective' : "reg:squarederror",
                                         'reg_lambda' : 0.8,
                                         'alpha' :0.9,
                                         'n_estimators' : 100,
                                         'colsample_bytree' : 0.6,
                                         'gamma':0.85 , 
                                         'eta' : 0.036,
                                         'max_depth' : 25, 
                                         'min_child_weight' : 1, 
                                         'subsample':0.8, 
                                         'num_parallel_tree' : 4, 
                                         #'early_stopping_rounds' : 50,
                                         'random_state' : 123}):

    ''' Create and evaluate a chosen ML model. 

    Input:
    X_train, y_train, X_test, y_test (DataFrame/Matrix/Array): objects used in the modelling as the input (calculated using e.g. prepare_data_for_modelling() function).
    eval_metric: evaluation metric used for the model evaluation. Default - mean absolute percentage error as in the original work.
    model: the model to be calculated. Default - XGBoost Regressor, as in the original work (the first model).
    model_params: a dictionary containing information about the hyperparameters of the model to be trained. The default hyperparameters set as in the XGBoost in the original work
                  (apart from the early_stopping_rounds - we exclude it to make the function more universal).
    
    Output:
    No exact output, we just print the information about the evaluation metric value for the train and test dataset.
    '''
                                             
    model_to_fit = model(**model_params) # attach the hyperaparameters for our model
    
    model_to_fit.fit(X_train, y_train) # fit/train the chosen model
    
    # Make predictions on train and test data
    predictions_train = model_to_fit.predict(X_train) 
    predictions_test = model_to_fit.predict(X_test)
    
    # Calculate the evaluation metric value on train and test data
    eval_train = eval_metric(y_train, predictions_train)
    eval_test = eval_metric(y_test, predictions_test)

    # print the information about the evaluation metric value for train/test
    print("Model evaluation: \n Result for train data: {}. \n Result for test data: {}.".format(round(eval_train,3), round(eval_test,3)))