## CAI demo notebook
For demo purposes.

In [None]:
# basic import statements
from predictnow.pdapi import PredictNowClient
import os
import pandas as pd
import requests
import json

In [None]:
# Basic configuration
# User ID
username = "variable29"    # "variable29", only letters, numbers, or underscores
email = "variable29@gmail.com"    # "variable29@gmail.com"

# connect to API
makeshiftapi_host = "http://127.0.0.1:5000/"      # local host for debugging purposes only

api_key = "--------"
client = PredictNowClient(makeshiftapi_host,api_key)

In [5]:
import pandas as pd
import os
# load and process data
input_filename = "ETF_return.csv"   # pre-processed to shift the date by one role
df_return = pd.read_csv(os.path.join(".", "Data", input_filename), parse_dates=["date"])

# CAI predicts the sign of a single return stream, and any additional columns will be taken as features
# so we will only keep SPY in this demo
target_label = "SPY"
df_return = df_return[["date", target_label]].iloc[1:].copy()

# train test split. Here we will use all data between 2018 and 2023 as training data set to predict returns of 2023
# you can change the training time window and testing period accordingly
# they are actually sent together as on file to the API, but we do need to determine their sizes.
df_train = df_return.loc[(df_return["date"] >= pd.to_datetime("2018-01-01")) & (df_return["date"] <= pd.to_datetime("2022-12-31"))]
df_test = df_return.loc[df_return["date"] >= pd.to_datetime("2023-01-01")]
df_input = pd.concat([df_train, df_test])

# finally a quick snap on the data
print(df_input)
print(str(len(df_test)))

           date       SPY
2013 2018-01-02  0.007157
2014 2018-01-03  0.006325
2015 2018-01-04  0.004215
2016 2018-01-05  0.006664
2017 2018-01-08  0.001829
...         ...       ...
3309 2023-02-27  0.003406
3310 2023-02-28 -0.003696
3311 2023-03-01 -0.003836
3312 2023-03-02  0.007777
3313 2023-03-03  0.016038

[1301 rows x 2 columns]
42


In [23]:
# model configuration
model_name = "DemoModel"
params= {
    "timeseries": "no",          # (yes, no)
    "type": "classification",    # (classification, regression)
    "feature_selection": "shap", # (shap, cmda, none)
    "analysis": "small",         # (small, none)
    "boost": "gbdt",             # (dart, gbdt)
    "mode": "train",             # (train, live)
    "testsize": str(len(df_test)),   # testsize < 1 --> ratio, > 1 --> exact # of rows, we are using the size of df_test here
    "weights": "no",             # (yes, no, custom)
    "prob_calib": "no",          # (yes, no) -> refine your probability
    "eda": "no",                # (yes, no) -> exploratory data analysis
    "random_seed":"1",    # random seed for initialization, default=1
    "custom_weights":"",
    "pre_engg_features_list": ["all"]  #['TR', 'CANARY', 'NOPE', 'OF'], Comment this param out in case it is not required to be used.
    #cmda added features
    # "cmda_corr_method":"PEARSON", # pearson,kendall,spearman
    # "cmda_n_clusters":"3",
    # "cmda_select_top_n_clusters":"4",
    #"mandatory_features":['sma_5'],
}
df_input.name = model_name

In [24]:
# create model and send training request
response = client.create_model(
    username=username, 
    model_name=model_name,
    params=params,
)
print("Response - create_model")
print(response)

Response - create_model
{'message': 'Successfully stored the model', 'success': True, 'model_name': 'DemoModel'}


In [25]:
# start training, there will be an error message!
response = client.train(
    model_name=model_name,
    input_df=df_input,
    label=target_label,
    username=username,
    email=email,
    #external_feature=True
)
print('Response - train')
print(response)

False
in false
return_op True
Response - train
{'message': 'Training the model is successfully requested.', 'model_name': 'saved_model_DemoModel.pkl', 'success': True, 'train_id': '982d529c-d654-4cfc-a8de-e52e0fb16df0'}


In [31]:
# check status
status = client.getstatus(
    username=username,
    train_id=response["train_id"]
)
print("Current status:")
print(status)
# cannot move on unter experiment is finished

Current status:
{'status': 'Prediction completed! Experiment complete.', 'current': 6, 'datetime': '2023-12-19 16:45:07.795215', 'state': 'COMPLETED', 'result': 'Experiment complete', 'total': 6}


In [32]:
# check training is finished
assert status['state'] == "COMPLETED", "Please wait for the training to finish."

# load predictions
response = client.getresult(
        model_name=model_name,
        username=username,
    )
# the response contains several groups of results, as follow:

# predicted probability (float between 0 and 1) for validation/training data set, i.e. 2018 - 2022 in the demo experiment
# the last column notes the probability that it's a "1", i.e. positive return
predicted_prob_cv = pd.read_json(response.predicted_prob_cv)
print("predicted_prob_cv")
print(predicted_prob_cv)

# predicted probability (float between 0 and 1) for the testing data set, i.e. every row after 2023 in the demo experiment
predicted_prob_test = pd.read_json(response.predicted_prob_test)
print("predicted_prob_test")
print(predicted_prob_test)

# predicted label, 0 or 1, for validation/training data set. Classified as class 1 if probability > 0.5
predicted_targets_cv = pd.read_json(response.predicted_targets_cv)
print("predicted_targets_cv")
print(predicted_targets_cv)

# predicted label, 0 or 1, for testing data set. Classified as class 1 if probability > 0.5
predicted_targets_test = pd.read_json(response.predicted_targets_test)
print("predicted_targets_test")
print(predicted_targets_test)

# feature importance score, shows what features are being used in the prediction
# more helpful when you include your features
# and only works when you set param['feature_selection'] to shap or cmda
if response.feature_importance:
    feature_importance = pd.read_json(response.feature_importance)
    print("feature_importance")
    print(feature_importance)

# performance metrics in terms of accuracies and so on
performance_metrics = pd.read_json(response.performance_metrics)
print("performance_metrics")
print(performance_metrics)


predicted_prob_cv
      Unnamed: 0       date       0.0       1.0
0              0 2018-01-02  0.006849  0.993151
1              1 2018-01-03  0.220018  0.779982
2              2 2018-01-04  0.156282  0.843718
3              3 2018-01-05  0.090680  0.909320
4              4 2018-01-08  0.430789  0.569211
...          ...        ...       ...       ...
1254        1254 2022-12-23  0.440742  0.559258
1255        1255 2022-12-27  0.029810  0.970190
1256        1256 2022-12-28  0.572378  0.427622
1257        1257 2022-12-29  0.077087  0.922913
1258        1258 2022-12-30  0.958594  0.041406

[1259 rows x 4 columns]
predicted_prob_test
         date       0.0       1.0
0  2023-01-03  0.907594  0.092406
1  2023-01-04  0.913679  0.086321
2  2023-01-05  0.357275  0.642725
3  2023-01-06  0.949131  0.050869
4  2023-01-09  0.708843  0.291157
5  2023-01-10  0.927519  0.072481
6  2023-01-11  0.583928  0.416072
7  2023-01-12  0.946364  0.053636
8  2023-01-13  0.335753  0.664247
9  2023-01-17  0.7089

In [33]:
# you can also save the results shown above to your disk, e.g.
predicted_prob_test.to_csv(os.path.join(".", "predicted_prob_test.csv"))

In [37]:
dir(response)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'eda_describe',
 'feature_importance',
 'lab_test',
 'performance_metrics',
 'predicted_prob_cv',
 'predicted_prob_test',
 'predicted_targets_cv',
 'predicted_targets_test',
 'success']