# Employee attrition sample using ML Pipelines

### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import oci
import ads
import os
from os import environ
from ads.catalog.project import ProjectCatalog

# Have ADS authenticate with the resource principal.
# Note: the OCI SDK client created further down loads a config file instead.
ads.set_auth(auth='resource_principal')

In [6]:
# make sure there is access to the project and compartment
# NOTE(review): config_1 is not referenced again in the cells shown here; the same
# config file and profile are loaded again as `config` before the client is built.
config_1 = oci.config.from_file("config", "DEFAULT")
# OCID of the compartment used by the catalog, the pipeline and the pipeline run below
compartment_id = 'ocid1.compartment.oc1..aaaaaaaaretksgipt3jgwfpzgh4ijyw54uynyfviaxs5li4wtl744fj4fi3q'
pc = ProjectCatalog(compartment_id=compartment_id)
# uncomment to call list_projects() on the catalog as an explicit access check
# pc.list_projects()

In [7]:
pc

<ads.catalog.project.ProjectCatalog at 0x7f78f2193b50>

Fill in the details of your resources:

In [8]:
# OCID of the project; sent as "projectId" in the pipeline and pipeline-run payloads
project_id = 'ocid1.datascienceproject.oc1.iad.amaaaaaazv3jruqavly2t7bg75kg5vwfhbbrwueoaghqukysxloimy53zyxa'

In [9]:
# OCID of the log group; sent as "logGroupId" in the pipeline's logConfigurationDetails
log_group_id = "ocid1.loggroup.oc1.iad.amaaaaaazv3jruqadtnj7vcpme764ebg5w7bza4c2etpvtwqi2bbzd2hlnea"

In [10]:
# passed to every pipeline step (and to the run) as the DATA_LOCATION environment variable
data_location = "orcl_attrition.csv"  # or an Object Storage URI: 'oci://<bucket>@<namespace>/<path>'

In [12]:
# create a data science client to communicate with the service
# (credentials come from the "DEFAULT" profile of the file named "config")
config = oci.config.from_file("config", "DEFAULT")
data_science_client = oci.data_science.DataScienceClient(config)

In [13]:
# list the pipelines in the compartment
# (the call filters by compartment only, so pipelines that belong to other
# projects in the same compartment are included in the result)
res = data_science_client.list_pipelines(compartment_id=compartment_id)
print(res.data)

[{
  "compartment_id": "ocid1.compartment.oc1..aaaaaaaaretksgipt3jgwfpzgh4ijyw54uynyfviaxs5li4wtl744fj4fi3q",
  "created_by": "ocid1.saml2idp.oc1..aaaaaaaagfdxcwndgbraz2pbram2jyfowctsnjsyjfcqvjdfv27pcpl4q3za/arvind.ranganath.r@oracle.com",
  "defined_tags": {
    "Oracle-Tags": {
      "CreatedBy": "oracleidentitycloudservice/arvind.ranganath.r@oracle.com",
      "CreatedOn": "2024-04-22T10:08:34.485Z"
    }
  },
  "display_name": "AccountingCalendar2",
  "freeform_tags": {},
  "id": "ocid1.datasciencepipeline.oc1.iad.amaaaaaazv3jruqapywdkqpgqjshtvteixtjzclvp6wjt22nt2dou55iykxa",
  "lifecycle_state": "ACTIVE",
  "project_id": "ocid1.datascienceproject.oc1.iad.amaaaaaazv3jruqax6p5pprd54qnyffrelimbt7zpjnt3iqbovh4xb5mqbea",
  "system_tags": {},
  "time_created": "2024-04-22T10:08:34.951000+00:00",
  "time_updated": "2024-04-22T10:10:20.591000+00:00"
}, {
  "compartment_id": "ocid1.compartment.oc1..aaaaaaaaretksgipt3jgwfpzgh4ijyw54uynyfviaxs5li4wtl744fj4fi3q",
  "created_by": "ocid1.saml2i

In [14]:
from random import randrange

# a random four-digit suffix (1000-9998) keeps display names apart across repeated runs
pipeline_name = "pipeline_sample_employee-attrition-" + str(randrange(1000, 9999))

In [15]:
# create a new pipeline (service conda pack)


def _custom_script_step(name, description, entrypoint, max_runtime_minutes, depends_on=None):
    """Build the "stepDetails" entry for one CUSTOM_SCRIPT pipeline step.

    Every step in this sample uses the same shape, block storage size and
    service conda environment; only the arguments below differ between steps.

    Args:
        name: value for "stepName"; other steps refer to it in "dependsOn".
        description: free-text description of the step.
        entrypoint: script file name, sent as PIPELINE_STEP_RUN_ENTRYPOINT.
        max_runtime_minutes: value for the step's "maximumRuntimeInMinutes".
        depends_on: names of the steps this one depends on. The "dependsOn"
            key is left out entirely when this is None or empty.

    Returns:
        A newly built dict; no nested object is shared between calls.
    """
    step = {
        "stepName": name,
        "description": description,
        "stepType": "CUSTOM_SCRIPT",
        "stepInfrastructureConfigurationDetails": {
            "shapeName": "VM.Standard2.4",
            "blockStorageSizeInGBs": "50"
        },
        "stepConfigurationDetails": {
            "type": "DEFAULT",
            "maximumRuntimeInMinutes": max_runtime_minutes,
            "environmentVariables": {
                "PIPELINE_STEP_RUN_ENTRYPOINT": entrypoint,
                "CONDA_ENV_TYPE": "service",
                "CONDA_ENV_SLUG": "pypgx2340_p38_cpu_v1",
                "DATA_LOCATION": data_location
            }
        }
    }
    if depends_on:
        step["dependsOn"] = list(depends_on)
    return step


pipeline_payload = {
    "projectId": project_id,
    "compartmentId": compartment_id,
    "displayName": pipeline_name,
    "infrastructureConfigurationDetails": {
        "shapeName": "VM.Standard2.4",
        "blockStorageSizeInGBs": "50"
    },
    "logConfigurationDetails": {
        "enableLogging": True,
        "logGroupId": log_group_id,
        "enableAutoLogCreation": True   # log will be automatically created
    },
    "configurationDetails": {
        "type": "DEFAULT",
        "maximumRuntimeInMinutes": 60,
        "environmentVariables": {
            "CONDA_ENV_TYPE": "service",
            "CONDA_ENV_SLUG": "pypgx2340_p38_cpu_v1"
        }
    },
    # data_processing runs first, the three training steps each depend on it,
    # and evaluate_and_deploy depends on all three training steps
    "stepDetails": [
        _custom_script_step(
            "data_processing",
            "Import data, feature engineering, train-test split",
            "employee-attr-dataproc.py",
            30,
        ),
        _custom_script_step(
            "train_logistic_regression",
            "Train a Logistic Regression model and save to the model catalog with its AUC score",
            "employee-attr-train-lr.py",
            120,
            depends_on=["data_processing"],
        ),
        _custom_script_step(
            "train_random_forest",
            "Train a Random Forest model and save to the model catalog with its AUC score",
            "employee-attr-train-rf.py",
            120,
            depends_on=["data_processing"],
        ),
        _custom_script_step(
            "train_xgboost",
            "Train a model with XGBoost and save to the model catalog with its AUC score",
            "employee-attr-train-xgb.py",
            120,
            depends_on=["data_processing"],
        ),
        _custom_script_step(
            "evaluate_and_deploy",
            "find the best model by their AUC score and deploy",
            "employee-attr-eval-deploy.py",
            30,
            depends_on=["train_logistic_regression", "train_random_forest", "train_xgboost"],
        ),
    ],
    "freeformTags": {
        "freeTags": "employee-attrition-sample"
    }
}
pipeline_res = data_science_client.create_pipeline(pipeline_payload)
# keep the new pipeline's OCID for the artifact uploads and the run request
pipeline_id = pipeline_res.data.id

In [16]:
print(pipeline_id)

ocid1.datasciencepipeline.oc1.iad.amaaaaaazv3jruqas4rcsfty4z2cugxjggvsniftonio4mlvo2tyubguf3oq


In [17]:
# the pipeline will be in CREATING state until all steps have their artifacts uploaded
print(pipeline_res.data.lifecycle_state)

CREATING


In [18]:
# maps each pipeline step name to the zip archive uploaded as that step's artifact
steps_names_and_artifacts = {
    "data_processing": "employee-attr-dataproc.zip",
    "train_logistic_regression": "employee-attr-train-lr.zip",
    "train_random_forest": "employee-attr-train-rf.zip",
    "train_xgboost": "employee-attr-train-xgb.zip",
    "evaluate_and_deploy": "employee-attr-eval-deploy.zip",
}

In [19]:
# upload steps artifacts
for name, artifact_path in steps_names_and_artifacts.items():
    # the context manager closes each archive after its upload call, even if
    # the call raises, so no file handle is left open
    with open(artifact_path, "rb") as fl:
        ret = data_science_client.create_step_artifact(
            pipeline_id,
            name,
            fl,
            content_disposition=f"attachment; filename={artifact_path}",
        )
    print("OK" if ret.status==204 else ret.status)  # 204 is ok

OK
OK
OK
OK
OK


In [20]:
# pipeline should be in ACTIVE state now
# (`res` is rebound here; it previously held the list_pipelines response)
res = data_science_client.get_pipeline(pipeline_id)
print(res.data.lifecycle_state)

ACTIVE


## Run the pipeline

In [21]:
# display name for the run, with a random four-digit suffix; the configuration
# and environment variable overrides are set in the run payload that follows
pipeline_run_name = f"pipeline-run-{randrange(1000,9999)}"

In [22]:
# environment variables sent under "configurationOverrideDetails" for this run
run_environment_overrides = {
    "DATA_LOCATION": data_location,
    "SKIP_MODEL_DEPLOY": "True",  # change to "False" to deploy the best model
}

# request body for the pipeline run
pipeline_run_payload = {
    "projectId": project_id,
    "displayName": pipeline_run_name,
    "pipelineId": pipeline_id,
    "compartmentId": compartment_id,
    "configurationOverrideDetails": {
        "type": "DEFAULT",
        "environmentVariables": run_environment_overrides,
    },
}

In [23]:
# submit the run request; the response's data.id is used afterwards to look up the run
pipelinerun_res = data_science_client.create_pipeline_run(pipeline_run_payload)

In [24]:
# check pipeline run status
run_status = data_science_client.get_pipeline_run(pipelinerun_res.data.id)
print(run_status.data.lifecycle_state)

ACCEPTED


## Run the pipeline from the console UI

In [25]:
# print a link to this pipeline's runs page in the OCI console
print("Ctrl-Click the hyperlink to open the pipeline run page in the OCI console UI")
print(f"https://cloud.oracle.com/data-science/pipelines/{pipeline_id}/pipeline-runs")

Ctrl-Click the hyperlink to open the pipeline run page in the OCI console UI
https://cloud.oracle.com/data-science/pipelines/ocid1.datasciencepipeline.oc1.iad.amaaaaaazv3jruqas4rcsfty4z2cugxjggvsniftonio4mlvo2tyubguf3oq/pipeline-runs


#### Don't forget to set the environment variables when running the pipeline: DATA_LOCATION