# Extracting info from blueprints through custom tasks and composable ML

## Set up

In [13]:
import os

import datarobot as dr
import pandas as pd

# Seed forwarded to dr.AdvancedOptions when the project target is set.
RANDOM_SEED = 321

In [14]:
# Location of the DataRobot client configuration (YAML) file.
# The path can be overridden through the DATAROBOT_CONFIG_FILE environment
# variable so the notebook is not tied to one machine's directory layout;
# the original path is kept as the fallback so existing setups keep working.
DR_CONFIG_PATH = os.environ.get(
    "DATAROBOT_CONFIG_FILE",
    "/Volumes/GoogleDrive/My Drive/Configurations/drconfig_staging.yaml",
)
client = dr.Client(config_path=DR_CONFIG_PATH)

## Create or get model to work from

In [15]:
# The demo dataset is read from a public S3 bucket.
# To use a local copy instead, replace the URL below with
# './data/DR_Demo_AML_Alert.csv'.
data_url = "https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_AML_Alert.csv"
df = pd.read_csv(data_url)

# Preview the first rows of the training data.
df.head()

Unnamed: 0,ALERT,SAR,kycRiskScore,income,tenureMonths,creditScore,state,nbrPurchases90d,avgTxnSize90d,totalSpend90d,...,indCustReqRefund90d,totalRefundsToCust90d,nbrPaymentsCashLike90d,maxRevolveLine,indOwnsHome,nbrInquiries1y,nbrCollections3y,nbrWebLogins90d,nbrPointRed90d,PEP
0,1,0,3,110300.0,5,757,PA,10,153.8,1538.0,...,1,45.82,5,6000,0,3,0,6,1,0
1,1,0,2,107800.0,6,715,NY,22,1.59,34.98,...,1,67.4,0,10000,1,3,0,87,0,0
2,1,0,1,74000.0,13,751,MA,7,57.64,403.48,...,1,450.69,0,10000,0,3,0,6,0,0
3,1,0,0,57700.0,1,659,NJ,14,29.52,413.28,...,1,71.43,0,8000,1,5,0,7,2,0
4,1,0,1,59800.0,3,709,PA,54,115.77,6251.58,...,1,2731.39,3,7000,1,1,0,8,1,0


In [29]:
# Set do_training to True to create a new project from `df` and start Autopilot;
# leave it False to reuse the existing project identified by project_id.
do_training = False
project_id = "62bb09db63886fb9c1668bff"

In [30]:
if do_training:
    # Create a project by uploading data. It will take a few moments.
    # pd.Timestamp.now() is used for the timestamp in the project name because
    # the pd.datetime alias was deprecated in pandas 1.0 and removed in 2.0.
    project = dr.Project.create(
        sourcedata=df,
        project_name="DR_Demo_API_alert_AML_simple_{}".format(
            pd.Timestamp.now().strftime("%Y-%m-%d %H:%M")
        ),
    )
    # Set the project's target and initiate AutoPilot in Quick mode
    project.set_target(
        target="SAR",
        mode="quick",
        worker_count=4,
        advanced_options=dr.AdvancedOptions(seed=RANDOM_SEED),
    )
    # Keep project_id in sync with the newly created project: later cells pass
    # project_id to Workshop and would otherwise still point at the old project.
    project_id = project.id

    # Open project's Leaderboard to monitor the progress in UI
    #project.open_leaderboard_browser()

    # Wait for AutoPilot to finish. You can set verbosity to 0 if you do not wish to see progress updates
    #project.wait_for_autopilot(verbosity=1)
else:
    # To access an existing project set your project ID below
    project = dr.Project.get(project_id)

# Report where the project can be inspected in the UI, and its ID.
print(
    "Custom Project URL: " + "https://staging.datarobot.com/projects/" + project.id + "/eda"
)
print("Custom Project ID: " + project.id)

Custom Project URL: https://staging.datarobot.com/projects/62bb09db63886fb9c1668bff/eda
Custom Project ID: 62bb09db63886fb9c1668bff


## Extract the blueprint of an existing model as code through the GUI
https://staging.datarobot.com/projects/62bb09db63886fb9c1668bff/models/62bb0b5313653002cba74a56/blueprint
![view code in GUI](images/view_code_in_gui.png)

In [31]:
from datarobot_bp_workshop import Workshop

# ID of the blueprint to clone from the project.
blueprint_id = '2130d16744eed608647655972974b6a6'

# Use the live project's ID (project.id) rather than the project_id constant,
# so this cell also targets the right project when a new one was just created.
w = Workshop(project_id=project.id)
blueprint_graph = w.clone(blueprint_id=blueprint_id, project_id=project.id)

# to_stdout=True prints the generated blueprint code below this cell.
source_code = blueprint_graph.to_source_code(to_stdout=True)

w = Workshop(user_blueprint_id='62bc05dee13965db36397b1d', project_id='62bb09db63886fb9c1668bff')

csrnotes = w.Features.csrNotes

pnia4 = w.Tasks.PNIA4(w.TaskInputs.NUM)

ordcat2 = w.Tasks.ORDCAT2(w.TaskInputs.CAT)
ordcat2.set_task_parameters(m='random')

wngec2 = w.Tasks.WNGEC2(csrnotes, output_method=w.TaskOutputMethod.STACK)
wngec2.set_task_parameters(bi=True, a=0, lc=True, madf=0.8, midf=2, nrm='l2', num=[1, 2], tol=0, uidf=False)

rfc = w.Tasks.RFC(wngec2, pnia4, ordcat2)
rfc.set_task_parameters(e='RandomForestClassifier', mf=[0.2, 0.3, 0.4], ml=2000, ls=[5, 10, 20])

rfc_blueprint = w.BlueprintGraph(rfc, name='RandomForest Classifier (Gini)')


## Create a custom task to dump output if it does not yet exist

See notebook `create_new_custom_task.ipynb`

Come back to this notebook when you're done.

In [27]:
# Search the available tasks for 'dump' to find the ID of the custom task.
w.search_tasks('dump')

dump_output: [CUSTOMT_62bb237ddfb81eb3d30e8ba7] 
  - (No description)

In [23]:
# Handle to the custom dump_output task, referenced by the ID found by the search above.
dump_ct = w.CustomTasks.CUSTOMT_62bb237ddfb81eb3d30e8ba7

In [32]:
# Rebuild the extracted blueprint, inserting the custom dump task directly
# after the numeric imputation step.
# project.id is used rather than the project_id constant so the Workshop
# always targets the project that was actually loaded or created above.
w = Workshop(project_id=project.id)

# Text feature that feeds the n-gram text modeler.
csrnotes = w.Features.csrNotes

# Numeric branch: impute missing values, then route the result through the
# custom dump task. The final name `pnia4` is kept for the model input below.
pnia4_imputed = w.Tasks.PNIA4(w.TaskInputs.NUM)
pnia4 = dump_ct(pnia4_imputed)

# Categorical branch.
ordcat2 = w.Tasks.ORDCAT2(w.TaskInputs.CAT)
ordcat2.set_task_parameters(m='random')

# Text branch.
wngec2 = w.Tasks.WNGEC2(csrnotes, output_method=w.TaskOutputMethod.STACK)
wngec2.set_task_parameters(bi=True, a=0, lc=True, madf=0.8, midf=2, nrm='l2', num=[1, 2], tol=0, uidf=False)

# Final estimator consuming the three branches.
rfc = w.Tasks.RFC(wngec2, pnia4, ordcat2)
rfc.set_task_parameters(e='RandomForestClassifier', mf=[0.2, 0.3, 0.4], ml=2000, ls=[5, 10, 20])

rfc_blueprint = w.BlueprintGraph(rfc, name='RandomForest Classifier (Gini) with output dump')

Optionally, get some information about the available tasks:

In [33]:
# List tasks registered in DataRobot
w.list_categories(show_tasks=True)

[34mCustom[0m

  - dump_output (CUSTOMT_62bb237ddfb81eb3d30e8ba7)
[34mPreprocessing[0m

  [34mNumeric Preprocessing[0m

    [34mData Quality[0m

      - Missing Values Imputed (arbitrary or quick median) (PNIA4)
      - Numeric Data Cleansing (NDC)
    [34mDimensionality Reducer[0m

      - Truncated Singular Value Decomposition (SVD2)
      - Partial Principal Components Analysis (PPCA)
      - Truncated Singular Value Decomposition (SVD)
    [34mScaling[0m

      - Log Transformer (LOGT)
      - Transparent Search for best transformation (BTRANSF6T)
      - Search for best transformation including Smooth Ridit (BTRANSF6)
      - Transform on the link function scale (LINK)
      - Impose Uniform Transform (UNIF3)
      - Ridit Transform (SRDT3)
      - Smooth Ridit Transform (RDT5)
      - Standardize (ST)
      - Standardize (RST)
    - Normalizer (NORM)
    - One-Hot Encoding (PDM3)
    - Constant Splines (GS)
    - Search for ratios (RATIO3)
    - Binning of numerical v



In [34]:
# Understand what each task is doing
print(w.Tasks.WNGEC2.documentation())
help(w.Tasks.WNGEC2)

https://staging.datarobot.com/model-docs/tasks/WNGEC2-Auto-Tuned-N-Gram-Text-Modeler-using-token-counts.html
Help on WNGEC2 in module datarobot_bp_workshop.factories object:

class WNGEC2(datarobot_bp_workshop.friendly_repr.FriendlyRepr)
 |  Auto-Tuned N-Gram Text Classifier using token counts
 |  
 |  Tunes word n-grams and generates out-of-sample predictions
 |  
 |  Parameters
 |  ----------
 |  output_method: string, one of (TaskOutputMethod.PREDICT, TaskOutputMethod.STACK, TaskOutputMethod.PREDICT_MARGIN, TaskOutputMethod.STACK_MARGIN).
 |  task_parameters: dict, which may contain:
 |  
 |    analyzer (analyzer): select, (Default='word')
 |      Possible Values: ['word', 'char']
 |  
 |    binary (bi): select, (Default=0)
 |      Possible Values: [False, True]
 |  
 |    decode_error (de): select, (Default=2)
 |      Possible Values: ['strict', 'ignore', 'replace']
 |  
 |    encoding (enc): select, (Default=0)
 |      Possible Values: ['utf-8', 'latin-1']
 |  
 |    enet_alpha (a

Save as a new blueprint

In [35]:
# Set to True to save the blueprint built above; left False so that re-running
# the notebook does not save it again.
save_blueprint = False

In [36]:
# Only save the blueprint when explicitly requested via save_blueprint.
if save_blueprint:
    rfc_blueprint.save()

Now you can see this custom blueprint appear:

https://staging.datarobot.com/ai-catalog/user-blueprints/62bb257b6ebefa1737fe070a/graph
![view code in GUI](images/blueprint_with_dump_ct.png)

## Run the new blueprint

You can do this through the GUI or through CMD

## Investigate results

To load the dumped file, download the training artifacts as described in [this link](https://docs.datarobot.com/en/docs/modeling/special-workflows/cml/cml-custom-tasks.html#download-training-artifacts) and save the extracted files in the same directory as this notebook.

![view code in GUI](images/download_artifacts.png)

In [37]:
# NOTE(review): `pickle` is not referenced in this cell; pd.read_pickle does the loading.
import pickle
# Load the DataFrame written by the dump task from the extracted artifact folder.
# The path is relative to the notebook; adjust the artifact ID to your own download.
# NOTE(review): unpickling can execute arbitrary code, so only load artifacts
# that come from a source you trust.
with open("artifact-62bb29ede908f3739ffe0765/dumped.pkl", 'rb') as fp:
    dumped_file = pd.read_pickle(fp)


In [38]:
# Display the dumped data (pandas truncates the rendered frame for large tables).
dumped_file

Unnamed: 0,kycRiskScore,income,tenureMonths,creditScore,nbrPurchases90d,avgTxnSize90d,totalSpend90d,nbrDistinctMerch90d,nbrMerchCredits90d,nbrMerchCreditsRndDollarAmt90d,...,overpaymentInd90d,nbrCustReqRefunds90d,totalRefundsToCust90d,nbrPaymentsCashLike90d,maxRevolveLine,indOwnsHome,nbrInquiries1y,nbrCollections3y,nbrWebLogins90d,nbrPointRed90d
0,0.0,52100.0,1.0,671.0,23.0,314.26,7227.98,10.0,2.0,2.0,...,0.0,1.0,45.74,3.0,20000.0,1.0,3.0,1.0,7.0,1.0
1,1.0,40200.0,1.0,772.0,2.0,487.43,974.86,0.0,0.0,0.0,...,0.0,1.0,40.09,0.0,20000.0,1.0,2.0,0.0,5.0,1.0
2,2.0,71700.0,3.0,630.0,29.0,7.98,231.42,18.0,1.0,0.0,...,0.0,1.0,40.56,0.0,8000.0,0.0,5.0,0.0,8.0,2.0
3,1.0,100200.0,31.0,717.0,172.0,333.17,57305.24,93.0,17.0,3.0,...,0.0,1.0,50.75,4.0,12000.0,1.0,3.0,0.0,5.0,1.0
4,1.0,21100.0,0.0,716.0,2.0,119.12,238.24,1.0,0.0,0.0,...,0.0,1.0,44.55,0.0,19000.0,1.0,2.0,0.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,0.0,11800.0,40.0,705.0,12.0,351.33,4215.96,5.0,2.0,1.0,...,0.0,1.0,75.91,6.0,11000.0,0.0,4.0,0.0,4.0,0.0
6396,1.0,13000.0,14.0,671.0,20.0,29.95,599.00,9.0,3.0,1.0,...,0.0,1.0,45.36,0.0,18000.0,1.0,2.0,0.0,4.0,0.0
6397,2.0,17800.0,82.0,651.0,2.0,125.20,250.40,1.0,0.0,0.0,...,0.0,1.0,23.89,0.0,13000.0,1.0,3.0,0.0,5.0,1.0
6398,1.0,23900.0,6.0,714.0,26.0,274.24,7130.24,9.0,1.0,0.0,...,0.0,1.0,59.13,3.0,17000.0,1.0,2.0,0.0,6.0,1.0


Contrast this to the training data:

In [39]:
# Show the first rows of the original training data for comparison with the dump.
df.head()

Unnamed: 0,ALERT,SAR,kycRiskScore,income,tenureMonths,creditScore,state,nbrPurchases90d,avgTxnSize90d,totalSpend90d,...,indCustReqRefund90d,totalRefundsToCust90d,nbrPaymentsCashLike90d,maxRevolveLine,indOwnsHome,nbrInquiries1y,nbrCollections3y,nbrWebLogins90d,nbrPointRed90d,PEP
0,1,0,3,110300.0,5,757,PA,10,153.8,1538.0,...,1,45.82,5,6000,0,3,0,6,1,0
1,1,0,2,107800.0,6,715,NY,22,1.59,34.98,...,1,67.4,0,10000,1,3,0,87,0,0
2,1,0,1,74000.0,13,751,MA,7,57.64,403.48,...,1,450.69,0,10000,0,3,0,6,0,0
3,1,0,0,57700.0,1,659,NJ,14,29.52,413.28,...,1,71.43,0,8000,1,5,0,7,2,0
4,1,0,1,59800.0,3,709,PA,54,115.77,6251.58,...,1,2731.39,3,7000,1,1,0,8,1,0


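Compared with the training data, the dumped file contains only numeric features: columns of other data types (such as the categorical `state` column) were filtered out before this step, and missing values were imputed.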

## Possible extensions

- Add parameters to specify file name
- Save in formats other than pickle
- Enable non-tabular data