**Table of contents**<a id='toc0_'></a>    
- [Imports](#toc1_)    
- [Load data](#toc2_)    
- [Run PyCaret](#toc3_)    
- [Evaluate model](#toc4_)    
- [Finalize model](#toc5_)    
- [Save model](#toc6_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Imports](#toc0_)

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import r2_score

import pycaret.regression as pr

from controller import Controller

In [3]:
c = Controller('i02')

COLS_TO_DROP = [
    'PROJECT_ID',
    'TRAIN_VAL_TEST_SPLIT',
    'DEADLINE',
    'STATE_CHANGED_AT',
    'CREATED_AT',
    'LAUNCHED_AT',
    'KEYWORDS',
    
    'DESC',
    'NAME',
    'FINAL_STATUS',
    'BACKERS_COUNT',
]

PLOT_TYPES = [
    'pipeline',
    'residuals',
    'error',
    'cooks',
    'vc',
    'feature',
    'feature_all',
    'parameter',
    'tree',
]

# <a id='toc2_'></a>[Load data](#toc0_)

In [4]:
filepath = fr'{c.get_path_data_prepared()}/01_df_development.pkl'
df_development = pd.read_pickle(filepath)
df_development['BACKERS_COUNT_LOG'] = np.log1p(df_development['BACKERS_COUNT'])
df_development = df_development.drop(COLS_TO_DROP, axis=1)
df_development

Unnamed: 0,GOAL,DISABLE_COMMUNICATION,COUNTRY,CURRENCY,CREATE_LAUNCH_HOURS,CREATE_LAUNCH_HOURS_LOG,CREATE_DEADLINE_HOURS,CREATE_DEADLINE_HOURS_LOG,LAUNCHED_DEADLINE_HOURS,BACKERS_COUNT_LOG
0,60000.0,False,US,USD,2375.831389,7.773524,3095.831389,8.038135,720.000000,1.609438
1,800.0,False,US,USD,119.452500,4.791255,1400.657500,7.245411,1281.205000,3.737670
2,10000.0,False,US,USD,6345.470556,8.755654,7185.470556,8.879955,840.000000,0.693147
3,270.0,False,GB,GBP,0.920833,0.652759,192.920833,5.267450,192.000000,0.000000
4,5.0,False,GB,GBP,823.233611,6.714454,1544.233611,7.342930,721.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
108124,250.0,False,US,USD,835.094444,6.728742,1373.173056,7.225607,538.078611,3.218876
108125,5000.0,False,US,USD,285.817778,5.658847,1005.817778,6.914550,720.000000,0.000000
108126,45000.0,False,CA,CAD,310.938611,5.742806,1030.938611,6.939194,720.000000,3.610918
108127,3000.0,False,US,USD,152.635278,5.034581,873.635278,6.773807,721.000000,0.000000


In [5]:
path_stack_model = fr'{c.get_path_iteration()}/saved_best_overall_model_stack'
stack_model = pr.load_model(path_stack_model)
stack_model

Transformation Pipeline and Model Successfully Loaded


# <a id='toc3_'></a>[Run PyCaret](#toc0_)

In [6]:
def r2_adjusted(y_true, y_pred, **kwargs):
    n = y_true.shape[0]
    p = kwargs['num_predictors'] if 'num_predictors' in kwargs else 1
    r2 = r2_score(y_true, y_pred)
    r2_adj = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
    return r2_adj


s1 = pr.setup(data=df_development, train_size=0.8, target='BACKERS_COUNT_LOG', session_id=42)
pr.add_metric('r2_adj', 'R2 Adjusted', r2_adjusted)
pr.get_metrics()

Unnamed: 0,Description,Value
0,Session id,42
1,Target,BACKERS_COUNT_LOG
2,Target type,Regression
3,Original data shape,"(108129, 10)"
4,Transformed data shape,"(108129, 28)"
5,Transformed train set shape,"(86503, 28)"
6,Transformed test set shape,"(21626, 28)"
7,Numeric features,6
8,Categorical features,2
9,Preprocess,True


Unnamed: 0_level_0,Name,Display Name,Score Function,Scorer,Target,Args,Greater is Better,Custom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mae,MAE,MAE,<function mean_absolute_error at 0x7f5b4c9a9090>,neg_mean_absolute_error,pred,{},False,False
mse,MSE,MSE,<function mean_squared_error at 0x7f5b4c9a9240>,neg_mean_squared_error,pred,{},False,False
rmse,RMSE,RMSE,<function mean_squared_error at 0x7f5b4c9a9240>,neg_root_mean_squared_error,pred,{'squared': False},False,False
r2,R2,R2,<function r2_score at 0x7f5b4c9a9510>,r2,pred,{},True,False
rmsle,RMSLE,RMSLE,<function RMSLEMetricContainer.__init__.<local...,"make_scorer(root_mean_squared_log_error, great...",pred,{},False,False
mape,MAPE,MAPE,<function MAPEMetricContainer.__init__.<locals...,"make_scorer(mean_absolute_percentage_error, gr...",pred,{},False,False
r2_adj,R2 Adjusted,R2 Adjusted,<function r2_adjusted at 0x7f5b4c9e0160>,make_scorer(r2_adjusted),pred,{},True,True


# <a id='toc4_'></a>[Evaluate model](#toc0_)

In [7]:
def save_model_plots(location, plots, model):
    len_plots = len(plots)
    destination_dir = Path(location)
    destination_dir.mkdir(parents=True, exist_ok=True)

    for i, plot_type in enumerate(plots):
        print(f'Creating [{i+1}/{len_plots}] {plot_type}')
        
        try:
            plot_path = pr.plot_model(
                model,
                plot=plot_type,
                plot_kwargs={'title': plot_type},
                save=True
            )

            # Move plot to "location".
            if plot_path is not None:
                source_file = Path(plot_path)
                new_filename = f'{plot_type}.png'
                destination_file = destination_dir / new_filename
                source_file.rename(destination_file)
        except Exception as e:
            print(str(e))
            print(f'ERROR: {plot_type} could not be produced!')

In [8]:
dir_save = f'{c.get_path_iteration()}/plots_stack_model'
save_model_plots(dir_save, PLOT_TYPES, stack_model)

Creating [2/9] residuals


Creating [3/9] error


Creating [4/9] cooks


Cannot cast ufunc 'svd_n_s' input from dtype('O') to dtype('float64') with casting rule 'same_kind'
ERROR: cooks could not be produced!
Creating [5/9] vc


Plot not supported for this estimator. Try different estimator.
ERROR: vc could not be produced!
Creating [6/9] feature
Feature Importance and RFE plots not available for estimators that doesnt support coef_ or feature_importances_ attribute.
ERROR: feature could not be produced!
Creating [7/9] feature_all
Feature Importance and RFE plots not available for estimators that doesnt support coef_ or feature_importances_ attribute.
ERROR: feature_all could not be produced!
Creating [8/9] parameter


Unnamed: 0,Parameters
cv,5
estimators,"[('Light Gradient Boosting Machine', LGBMRegre..."
final_estimator,LinearRegression(n_jobs=-1)
n_jobs,-1
passthrough,True
verbose,0


[Errno 2] No such file or directory: 'Hyperparameters.png' -> '/home/work/Dev/kickstarter/02_Outputs/i02/plots_stack_model/parameter.png'
ERROR: parameter could not be produced!
Creating [9/9] tree
Decision Tree plot is only available for scikit-learn Decision Trees and Forests, Ensemble models using those or Stacked models using those as meta (final) estimators.
ERROR: tree could not be produced!


# <a id='toc5_'></a>[Finalize model](#toc0_)

In [9]:
final_stack_model = pr.finalize_model(stack_model)
final_stack_model

# <a id='toc6_'></a>[Save model](#toc0_)

In [10]:
model_name = 'final_stack_model.pkl'
_, model_path = pr.save_model(final_stack_model, model_name)

if model_path is not None:
    destination_dir = Path(c.get_path_iteration())
    destination_dir.mkdir(parents=True, exist_ok=True)
    
    source_file = Path(model_path)
    destination_file = destination_dir / model_name
    source_file.rename(destination_file)

Transformation Pipeline and Model Successfully Saved
