**Table of contents**<a id='toc0_'></a>    
- [Imports](#toc1_)    
- [Load data](#toc2_)    
- [Run PyCaret](#toc3_)    
- [Create predictions](#toc4_)    
- [Save to files](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Imports](#toc0_)

In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import r2_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import pycaret.regression as pr

/kaggle/input/glove-embeddings/glove.6B.200d.txt
/kaggle/input/glove-embeddings/glove.6B.50d.txt
/kaggle/input/glove-embeddings/glove.6B.300d.txt
/kaggle/input/glove-embeddings/glove.6B.100d.txt
/kaggle/input/kickstarter/01_df_development.pkl


In [19]:
cols_to_drop = [
    'PROJECT_ID',
    'TRAIN_VAL_TEST_SPLIT',
    'DEADLINE',
    'STATE_CHANGED_AT',
    'CREATED_AT',
    'LAUNCHED_AT',
    'KEYWORDS',
    
    'DESC',
    'NAME',
    'FINAL_STATUS',
    'BACKERS_COUNT',
]

# <a id='toc2_'></a>[Load data](#toc0_)

In [20]:
filepath = fr'/kaggle/input/kickstarter/01_df_development.pkl'
df_development = pd.read_pickle(filepath)
df_development['BACKERS_COUNT_LOG'] = np.log1p(df_development['BACKERS_COUNT'])
df_development = df_development.drop(cols_to_drop, axis=1)
df_development

Unnamed: 0,GOAL,DISABLE_COMMUNICATION,COUNTRY,CURRENCY,CREATE_LAUNCH_HOURS,CREATE_LAUNCH_HOURS_LOG,CREATE_DEADLINE_HOURS,CREATE_DEADLINE_HOURS_LOG,LAUNCHED_DEADLINE_HOURS,BACKERS_COUNT_LOG
0,60000.0,False,US,USD,2375.831389,7.773524,3095.831389,8.038135,720.000000,1.609438
1,800.0,False,US,USD,119.452500,4.791255,1400.657500,7.245411,1281.205000,3.737670
2,10000.0,False,US,USD,6345.470556,8.755654,7185.470556,8.879955,840.000000,0.693147
3,270.0,False,GB,GBP,0.920833,0.652759,192.920833,5.267450,192.000000,0.000000
4,5.0,False,GB,GBP,823.233611,6.714454,1544.233611,7.342930,721.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
108124,250.0,False,US,USD,835.094444,6.728742,1373.173056,7.225607,538.078611,3.218876
108125,5000.0,False,US,USD,285.817778,5.658847,1005.817778,6.914550,720.000000,0.000000
108126,45000.0,False,CA,CAD,310.938611,5.742806,1030.938611,6.939194,720.000000,3.610918
108127,3000.0,False,US,USD,152.635278,5.034581,873.635278,6.773807,721.000000,0.000000


In [21]:
df_development.isna().sum()

GOAL                         0
DISABLE_COMMUNICATION        0
COUNTRY                      0
CURRENCY                     0
CREATE_LAUNCH_HOURS          0
CREATE_LAUNCH_HOURS_LOG      0
CREATE_DEADLINE_HOURS        0
CREATE_DEADLINE_HOURS_LOG    0
LAUNCHED_DEADLINE_HOURS      0
BACKERS_COUNT_LOG            0
dtype: int64

# <a id='toc3_'></a>[Run PyCaret](#toc0_)

In [22]:
s1 = pr.setup(data=df_development, train_size=0.8, target='BACKERS_COUNT_LOG', session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,BACKERS_COUNT_LOG
2,Target type,Regression
3,Original data shape,"(108129, 10)"
4,Transformed data shape,"(108129, 28)"
5,Transformed train set shape,"(86503, 28)"
6,Transformed test set shape,"(21626, 28)"
7,Numeric features,6
8,Categorical features,2
9,Preprocess,True


In [23]:
def r2_adjusted(y_true, y_pred, **kwargs):
    n = y_true.shape[0]
    p = kwargs['num_predictors'] if 'num_predictors' in kwargs else 1
    r2 = r2_score(y_true, y_pred)
    r2_adj = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
    return r2_adj

pr.add_metric('r2_adj', 'R2 Adjusted', r2_adjusted)
pr.get_metrics()

Unnamed: 0_level_0,Name,Display Name,Score Function,Scorer,Target,Args,Greater is Better,Custom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mae,MAE,MAE,<function mean_absolute_error at 0x787bd1c71fc0>,neg_mean_absolute_error,pred,{},False,False
mse,MSE,MSE,<function mean_squared_error at 0x787bd1c72170>,neg_mean_squared_error,pred,{},False,False
rmse,RMSE,RMSE,<function mean_squared_error at 0x787bd1c72170>,neg_root_mean_squared_error,pred,{'squared': False},False,False
r2,R2,R2,<function r2_score at 0x787bd1c72440>,r2,pred,{},True,False
rmsle,RMSLE,RMSLE,<function RMSLEMetricContainer.__init__.<local...,"make_scorer(root_mean_squared_log_error, great...",pred,{},False,False
mape,MAPE,MAPE,<function MAPEMetricContainer.__init__.<locals...,"make_scorer(mean_absolute_percentage_error, gr...",pred,{},False,False
r2_adj,R2 Adjusted,R2 Adjusted,<function r2_adjusted at 0x787b8df5add0>,make_scorer(r2_adjusted),pred,{},True,True


In [24]:
best = pr.compare_models(sort='R2 Adjusted', n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2 Adjusted,TT (Sec)
lightgbm,Light Gradient Boosting Machine,1.393,2.9036,1.704,0.1849,0.5611,0.6037,0.1848,1.411
catboost,CatBoost Regressor,1.3909,2.9041,1.7041,0.1848,0.5606,0.604,0.1847,9.024
gbr,Gradient Boosting Regressor,1.4053,2.9252,1.7103,0.1788,0.5641,0.6098,0.1787,22.529
xgboost,Extreme Gradient Boosting,1.398,2.943,1.7155,0.1738,0.5632,0.6068,0.1737,19.808
ridge,Ridge Regression,1.4831,3.2158,1.7933,0.0973,0.5859,0.6436,0.0972,0.466
br,Bayesian Ridge,1.4832,3.2158,1.7933,0.0973,0.5859,0.6435,0.0972,0.783
lr,Linear Regression,1.4831,3.2159,1.7933,0.0973,0.5859,0.6436,0.0972,0.62
ada,AdaBoost Regressor,1.512,3.2593,1.8053,0.085,0.6114,0.7053,0.0849,5.186
en,Elastic Net,1.5336,3.3707,1.8359,0.0538,0.602,0.6655,0.0537,0.622
rf,Random Forest Regressor,1.4751,3.3943,1.8424,0.0471,0.5912,0.6403,0.047,76.605


Processing:   0%|          | 0/87 [00:00<?, ?it/s]

In [25]:
best_tuned = [pr.tune_model(i, optimize='R2 Adjusted') for i in best]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2 Adjusted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.3915,2.8947,1.7014,0.1985,0.5592,0.5923,0.1984
1,1.3917,2.9011,1.7033,0.1899,0.5608,0.6019,0.1898
2,1.3871,2.8807,1.6973,0.1777,0.559,0.6078,0.1776
3,1.389,2.8784,1.6966,0.1915,0.5651,0.6048,0.1914
4,1.3992,2.9118,1.7064,0.1989,0.5633,0.6091,0.1988
5,1.4047,2.9584,1.72,0.1737,0.5588,0.6122,0.1736
6,1.4029,2.9305,1.7119,0.1682,0.5599,0.6129,0.1681
7,1.3932,2.9143,1.7071,0.1817,0.5612,0.6072,0.1816
8,1.3812,2.8576,1.6904,0.193,0.5632,0.5922,0.1929
9,1.3889,2.8812,1.6974,0.1834,0.5581,0.5922,0.1834


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2 Adjusted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.4019,2.9202,1.7089,0.1914,0.5618,0.598,0.1914
1,1.3953,2.911,1.7062,0.1871,0.5619,0.6056,0.187
2,1.3932,2.9047,1.7043,0.1708,0.5613,0.6128,0.1708
3,1.3938,2.884,1.6982,0.1899,0.5662,0.6082,0.1898
4,1.4068,2.9392,1.7144,0.1913,0.5655,0.615,0.1913
5,1.4114,2.9729,1.7242,0.1697,0.5608,0.6187,0.1696
6,1.4074,2.9454,1.7162,0.164,0.5617,0.6153,0.1639
7,1.3987,2.9255,1.7104,0.1785,0.5627,0.6111,0.1785
8,1.3879,2.8761,1.6959,0.1878,0.5651,0.5952,0.1877
9,1.3986,2.9154,1.7075,0.1737,0.5611,0.5977,0.1737


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2 Adjusted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.3992,2.9138,1.707,0.1932,0.561,0.5973,0.1931
1,1.3977,2.9155,1.7075,0.1859,0.5626,0.607,0.1858
2,1.3914,2.8947,1.7014,0.1737,0.5605,0.6117,0.1736
3,1.3956,2.8932,1.7009,0.1874,0.5668,0.6087,0.1873
4,1.4049,2.9254,1.7104,0.1951,0.5647,0.6124,0.1951
5,1.4074,2.9587,1.7201,0.1736,0.5595,0.6153,0.1735
6,1.4037,2.9343,1.713,0.1671,0.5604,0.615,0.167
7,1.3986,2.926,1.7106,0.1784,0.5627,0.6094,0.1783
8,1.3864,2.8685,1.6937,0.19,0.5643,0.5952,0.1899
9,1.3937,2.8938,1.7011,0.1799,0.5592,0.5951,0.1798


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [26]:
best_blended = pr.blend_models(best_tuned, optimize='R2 Adjusted')
best_stacked = pr.stack_models(best_tuned, optimize='R2 Adjusted')

best_overall_model = pr.automl(optimize='R2 Adjusted')
best_overall_model

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2 Adjusted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.3912,2.8903,1.7001,0.1997,0.5589,0.5935,0.1996
1,1.3909,2.8971,1.7021,0.191,0.5607,0.6028,0.1909
2,1.3865,2.8782,1.6965,0.1784,0.5592,0.6088,0.1783
3,1.3897,2.8763,1.696,0.1921,0.565,0.6054,0.192
4,1.3995,2.9098,1.7058,0.1994,0.5632,0.6099,0.1993
5,1.4028,2.9476,1.7169,0.1767,0.5584,0.6129,0.1766
6,1.4002,2.9239,1.71,0.1701,0.5594,0.6121,0.17
7,1.3934,2.9122,1.7065,0.1823,0.5613,0.6073,0.1822
8,1.3808,2.8525,1.6889,0.1945,0.5627,0.5927,0.1944
9,1.3878,2.8752,1.6957,0.1851,0.5575,0.5924,0.185


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2 Adjusted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.3892,2.8871,1.6991,0.2006,0.5584,0.5926,0.2005
1,1.3895,2.8955,1.7016,0.1915,0.5604,0.6021,0.1914
2,1.3856,2.8774,1.6963,0.1786,0.5589,0.6082,0.1785
3,1.3877,2.8742,1.6954,0.1927,0.5645,0.6045,0.1926
4,1.3981,2.9094,1.7057,0.1995,0.563,0.6093,0.1994
5,1.4018,2.95,1.7176,0.1761,0.5584,0.6122,0.176
6,1.3998,2.9264,1.7107,0.1694,0.5594,0.6116,0.1693
7,1.3918,2.9112,1.7062,0.1826,0.561,0.6069,0.1825
8,1.3795,2.8517,1.6887,0.1947,0.5625,0.5923,0.1946
9,1.3868,2.8746,1.6955,0.1853,0.5574,0.5919,0.1852


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

# <a id='toc4_'></a>[Create predictions](#toc0_)

In [27]:
predict_holdout = pr.predict_model(best_overall_model)
predict_holdout

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,R2 Adjusted
0,Stacking Regressor,1.3903,2.9069,1.705,0.1839,0.5567,0.6087,0.1839


Unnamed: 0,GOAL,DISABLE_COMMUNICATION,COUNTRY,CURRENCY,CREATE_LAUNCH_HOURS,CREATE_LAUNCH_HOURS_LOG,CREATE_DEADLINE_HOURS,CREATE_DEADLINE_HOURS_LOG,LAUNCHED_DEADLINE_HOURS,BACKERS_COUNT_LOG,prediction_label
31731,200.0,False,US,USD,213.207230,5.366944,665.262207,6.501683,452.054993,2.197225,2.138360
34183,100000.0,False,US,USD,19.928888,3.041131,1458.928833,7.286143,1439.000000,0.693147,1.248211
71977,225.0,False,US,USD,7.847778,2.180166,511.847778,6.239979,504.000000,0.000000,2.288395
28759,50000.0,False,US,USD,618.262512,6.428529,2059.262451,7.630589,1441.000000,3.526361,2.484335
50434,5000.0,False,US,USD,614.057495,6.421716,1334.057495,7.196730,720.000000,1.098612,3.050317
...,...,...,...,...,...,...,...,...,...,...,...
94874,8000.0,False,US,USD,431.720276,6.070092,1103.720337,7.007348,672.000000,0.693147,3.941149
50473,75000.0,False,US,USD,49.093056,3.913882,816.027771,6.705673,766.934692,0.000000,2.601259
50943,75000.0,False,GB,GBP,1822.031616,7.508256,2543.031738,7.841506,721.000000,2.564949,3.717432
46601,3975.0,False,US,USD,2406.215332,7.786226,3152.440186,8.056250,746.224976,3.496508,3.889069


In [30]:
predict_holdout['prediction_label_exp'] = np.expm1(predict_holdout['prediction_label'])
predict_holdout

Unnamed: 0,GOAL,DISABLE_COMMUNICATION,COUNTRY,CURRENCY,CREATE_LAUNCH_HOURS,CREATE_LAUNCH_HOURS_LOG,CREATE_DEADLINE_HOURS,CREATE_DEADLINE_HOURS_LOG,LAUNCHED_DEADLINE_HOURS,BACKERS_COUNT_LOG,prediction_label,prediction_label_exp
31731,200.0,False,US,USD,213.207230,5.366944,665.262207,6.501683,452.054993,2.197225,2.138360,7.485512
34183,100000.0,False,US,USD,19.928888,3.041131,1458.928833,7.286143,1439.000000,0.693147,1.248211,2.484106
71977,225.0,False,US,USD,7.847778,2.180166,511.847778,6.239979,504.000000,0.000000,2.288395,8.859100
28759,50000.0,False,US,USD,618.262512,6.428529,2059.262451,7.630589,1441.000000,3.526361,2.484335,10.993139
50434,5000.0,False,US,USD,614.057495,6.421716,1334.057495,7.196730,720.000000,1.098612,3.050317,20.122032
...,...,...,...,...,...,...,...,...,...,...,...,...
94874,8000.0,False,US,USD,431.720276,6.070092,1103.720337,7.007348,672.000000,0.693147,3.941149,50.477716
50473,75000.0,False,US,USD,49.093056,3.913882,816.027771,6.705673,766.934692,0.000000,2.601259,12.480696
50943,75000.0,False,GB,GBP,1822.031616,7.508256,2543.031738,7.841506,721.000000,2.564949,3.717432,40.158582
46601,3975.0,False,US,USD,2406.215332,7.786226,3152.440186,8.056250,746.224976,3.496508,3.889069,47.865354


In [31]:
predict_holdout.index

Int64Index([31731, 34183, 71977, 28759, 50434,  1738, 99811, 85613, 91671,
            46990,
            ...
            94809, 17494, 56667, 10670, 12073, 94874, 50473, 50943, 46601,
            33942],
           dtype='int64', length=21626)

In [32]:
true_backers = pd.read_pickle(filepath)['BACKERS_COUNT']
true_backers

0          4
1         41
2          1
3          0
4          0
          ..
108124    24
108125     0
108126    36
108127     0
108128    14
Name: BACKERS_COUNT, Length: 108129, dtype: int64

In [35]:
pd.concat([
    true_backers.iloc[predict_holdout.index],
    predict_holdout['prediction_label_exp']
], axis=1)


Unnamed: 0,BACKERS_COUNT,prediction_label_exp
31731,8,7.485512
34183,1,2.484106
71977,0,8.859100
28759,33,10.993139
50434,2,20.122032
...,...,...
94874,1,50.477716
50473,0,12.480696
50943,12,40.158582
46601,32,47.865354


# <a id='toc5_'></a>[Save to files](#toc0_)

In [37]:
predict_holdout.to_pickle('/kaggle/working/predict_holdout.pkl')
pr.save_model(best_overall_model, '/kaggle/working/saved_best_overall_model_stack')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['GOAL', 'CREATE_LAUNCH_HOURS',
                                              'CREATE_LAUNCH_HOURS_LOG',
                                              'CREATE_DEADLINE_HOURS',
                                              'CREATE_DEADLINE_HOURS_LOG',
                                              'LAUNCHED_DEADLINE_HOURS'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['COUNTRY', 'CURRENCY'],
                                     transfor...
                                                 <catboost.core.CatBoostRegressor object at 0x787bc4e5f850>),
                                                ('Gradient Boosting Regressor',
                                                 GradientBoostingRegressor(learning_rate=0.05,
                           