In [97]:
import copy
import inspect
import os
import shutil
from datetime import date

import numpy as np
import pandas as pd
import plotly.express as px
from datasets import Dataset
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

from cyclops.data.slicer import SliceSpec
from cyclops.evaluate.metrics import create_metric
from cyclops.evaluate.metrics.experimental.metric_dict import MetricDict
from cyclops.report import ModelCardReport
from cyclops.report.plot.classification import ClassificationPlotter
from cyclops.report.utils import flatten_results_dict


## Report Genration for Heart Failure Prediction
Here's an example to demonstrate how we can generate a report as we proceed through all the steps to train and evaluate a model. For this purpose, we are going to use Kaggle's heart prediction failure dataset and gradually populate the report with information about dataset, model and results.

## Create Model Card Report
First, we should create a `ModelCardReport` object to fill in the fields and sections after training.

In [2]:
report = ModelCardReport()

In [3]:
# Constants
DATA_DIR = "./data"
RANDOM_SEED = 21

## Data Loading
Before starting, make sure to install the Kaggle API by running pip install kaggle. To use the Kaggle API, you need to sign up for a Kaggle account at https://www.kaggle.com. Then go to the ‘Account’ tab of your user profile (https://www.kaggle.com/<username>/account) and select ‘Create API Token’. This will trigger the download of kaggle.json, a file containing your API credentials. Place this file in the location ~/.kaggle/kaggle.json on your machine.

In [4]:
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    "fedesoriano/heart-failure-prediction",
    path=DATA_DIR,
    unzip=True,
)



In [5]:
df = pd.read_csv(os.path.join(DATA_DIR, "heart.csv"))
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
df.describe().T

In [6]:
fig = px.pie(df, names="Sex")

fig.update_layout(
    title="Sex Distribution",
)

fig.show()

## Adding figures to report
We can add figures and diagrams to report. We can define caption and the section of the report that this figure belongs to. Since we are exploring the distribution of different features in the dataset, we add it to `datasets` section:

In [7]:
report.log_plotly_figure(
    fig=fig,
    caption="Sex Distribution",
    section_name="datasets",
)

### Age distribution figure
We plot a histogram of ages similarly and add the figure to our report:

In [8]:
fig = px.histogram(df, x="Age")
fig.update_layout(
    title="Age Distribution",
    xaxis_title="Age",
    yaxis_title="Count",
    bargap=0.2,
)

fig.show()

### Outcome distribution
Plot outcome distribution and add it to report:

In [9]:
df["outcome"] = df["HeartDisease"].astype("int")
df = df.drop(columns=["HeartDisease"])

In [10]:
fig = px.pie(df, names="outcome")
fig.update_traces(textinfo="percent+label")
fig.update_layout(title_text="Outcome Distribution")
fig.update_traces(
    hovertemplate="Outcome: %{label}<br>Count: \
    %{value}<br>Percent: %{percent}",
)
fig.show()

In [11]:
report.log_plotly_figure(
    fig=fig,
    caption="Outcome Distribution",
    section_name="datasets",
)

In [12]:
class_counts = df["outcome"].value_counts()
class_ratio = class_counts[0] / class_counts[1]
print(class_ratio, class_counts)

0.8070866141732284 outcome
1    508
0    410
Name: count, dtype: int64


## Data Preprocessing

This dataset does not have any null values, so we can jump to feature scaling. The string data in the dataframe is in the form of object, we need to convert it back to string to work on it:

In [13]:
string_col = df.select_dtypes(include="object").columns
df[string_col]=df[string_col].astype("string")
string_col=df.select_dtypes("string").columns.to_list()

We are going to apply tree-based models to our data, so we use `LabelEncoder`:

In [22]:
target="outcome"

df_processed=pd.get_dummies(df,columns=string_col,drop_first=False)
df_processed.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,outcome,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,False,True,False,...,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,True,False,False,...,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,False,True,False,...,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,1,True,False,True,...,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,0,False,True,False,...,True,False,False,True,False,True,False,False,False,True


In [20]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,outcome
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [23]:

feature_cols=df_processed.columns.to_list()
feature_cols.remove(target)

Before training, let's document dataset in the model card.
This can be done using the log_dataset method, which takes the following arguments: 

- `description`: A description of the dataset. 
- `citation`: The citation for the dataset. 
- `link`: A link to a resource for the dataset. 
- `license_id`: The SPDX license identifier for the dataset. 
- `version`: The version of the dataset. 
- `features`: A list of features in the dataset. 
- `split`: The split of the dataset (train, test, validation, etc.). 
- `sensitive_features`: A list of sensitive features used to train/evaluate the model. 
- `sensitive_feature_justification`: A justification for the sensitive features used to train/evaluate the model.


In [25]:

report.log_dataset(
    description="""This dataset was created by combining different datasets
    already available independently but not combined before. In this dataset,
    5 heart datasets are combined over 11 common features. Every dataset used
    can be found under the Index of heart disease datasets from UCI
    Machine Learning Repository on the following link:
    https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/.""",
    citation=inspect.cleandoc(
        """
        @misc{fedesoriano,
          title={Heart Failure Prediction Dataset.},
          author={Fedesoriano, F},
          year={2021},
          publisher={Kaggle}
        }
    """,
    ),
    link="""
    https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
    """,
    license_id="CC0-1.0",
    version="Version 1",
    features=df.columns.to_list().remove(target),
    sensitive_features=["Sex", "Age"],
    sensitive_feature_justification="Demographic information like age and gender \
        often have a strong correlation with health outcomes. For example, older \
        patients are more likely to have a higher risk of heart disease.",
)

## Create HuggingFace Dataset
We convert our processed Pandas dataframe into a Hugging Face dataset, for later evaluation by Cyclop metrics.

In [26]:
dataset = Dataset.from_pandas(df_processed)
dataset.cleanup_cache_files()
print(dataset)

Dataset({
    features: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'outcome', 'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up'],
    num_rows: 918
})


In [27]:
# Split dataframe into inputs and outputs
X, y = df_processed[feature_cols], df_processed[target]

## Training

In [28]:
# Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
classifier = LogisticRegression()
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred_prob = classifier.predict_proba(X_test)

## Evaluation
As demonstrated in evaluation tutorial, we define a metric dict:

In [73]:
metric_names = [
    "binary_accuracy",
    "binary_precision",
    "binary_recall",
    "binary_f1_score",
    "binary_roc_curve",
    "binary_auroc"
]
metrics = [
    create_metric(metric_name, experimental=True) for metric_name in metric_names
]
metric_collection = MetricDict(metrics)
metric_collection(y_test.values, np.float64(y_pred))

{'BinaryAccuracy': array(0.84782606, dtype=float32),
 'BinaryPrecision': array(0.88461536, dtype=float32),
 'BinaryRecall': array(0.8518519, dtype=float32),
 'BinaryF1Score': array(0.8679245, dtype=float32),
 'BinaryROC': ROCCurve(fpr=array([0.        , 0.15789473, 1.        ], dtype=float32), tpr=array([0.       , 0.8518519, 1.       ], dtype=float32), thresholds=array([1., 1., 0.])),
 'BinaryAUROC': 0.8469786}

## Data Slicing
In addition to overall metrics, it might be interesting to see how the model performs on certain subpopulation or subsets. We can define these subsets using SliceSpec objects.

In [30]:
spec_list = [
    {
        "Age": {
            "min_value": 30,
            "max_value": 50,
            "min_inclusive": True,
            "max_inclusive": False,
        },
    },
    {
        "Age": {
            "min_value": 50,
            "max_value": 70,
            "min_inclusive": True,
            "max_inclusive": False,
        },
    },
]
slice_spec = SliceSpec(spec_list)

Below, we are combining the raw features of the test data and the predictions so that we use them later for slice-specific evaluations.

In [58]:
# Get positions of matching indices in df
matching_positions = y_test.index.get_indexer(df.index)

# Select rows from df using matching positions (valid positions are non-negative)
df_test = df.iloc[matching_positions[matching_positions >= 0]]
df_test["preds"] = y_pred
df_test["preds_prob"] = y_pred_prob[:, 1]
df_test.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,outcome,preds,preds_prob
101,51,M,ASY,130,179,0,Normal,100,N,0.0,Up,0,0,0.29538
23,44,M,ATA,150,288,0,Normal,150,Y,3.0,Flat,1,1,0.902883
162,47,M,ATA,160,263,0,Normal,174,N,0.0,Up,0,1,0.920747
112,47,M,ASY,140,276,1,Normal,125,Y,0.0,Up,0,1,0.956746
165,46,M,TA,140,272,1,Normal,175,N,2.0,Flat,1,1,0.914457


### Age distribution in test data

In [44]:
fig = px.histogram(df_test, x="Age")
fig.update_layout(
    title="Age Distribution in Test Data",
    xaxis_title="Age",
    yaxis_title="Count",
    bargap=0.2,
)

fig.show()

## Logging metrics and results to report
Here, we gather evalutaions and add them to the report.

We can add a performance metric to the model card using the `log_performance_metric` method, which expects a dictionary where the keys are in the following format: `slice_name/metric_name`. For instance, `overall/accuracy` or `Age:[30 - 50)/BinaryPrecision`.

We first need to process the evaluation results to get the metrics in the right format.

In [74]:
from cyclops.evaluate import evaluator


# Create Dataset object
heart_failure_data = Dataset.from_pandas(df_test)

result = evaluator.evaluate(
    dataset=heart_failure_data,
    metrics=metric_collection,  # type: ignore[list-item]
    target_columns=target,
    prediction_columns="preds_prob",
    slice_spec=slice_spec,
)

Filter -> Age:[30 - 50):   0%|          | 0/184 [00:00<?, ? examples/s]

Filter -> Age:[50 - 70):   0%|          | 0/184 [00:00<?, ? examples/s]

Filter -> overall:   0%|          | 0/184 [00:00<?, ? examples/s]

In [83]:

results_flat = flatten_results_dict(
    results=result,
    remove_metrics=["BinaryROC"],
    model_name="model_for_preds_prob"
)

In [78]:
result

{'model_for_preds_prob': {'Age:[30 - 50)': {'BinaryAccuracy': array(0.6909722, dtype=float32),
   'BinaryPrecision': array(0.6566265, dtype=float32),
   'BinaryRecall': array(0.77304965, dtype=float32),
   'BinaryF1Score': array(0.71009773, dtype=float32),
   'BinaryROC': ROCCurve(fpr=array([0.        , 0.08163265, 0.08843537, 0.0952381 , 0.10204082,
          0.10884354, 0.11564626, 0.12244898, 0.1292517 , 0.1292517 ,
          0.13605443, 0.14285715, 0.14285715, 0.14965986, 0.15646258,
          0.1632653 , 0.17006803, 0.17687075, 0.18367347, 0.1904762 ,
          0.1904762 , 0.19727892, 0.19727892, 0.19727892, 0.20408164,
          0.21088435, 0.21768707, 0.2244898 , 0.2244898 , 0.23129252,
          0.23809524, 0.24489796, 0.25170067, 0.25170067, 0.2585034 ,
          0.26530612, 0.26530612, 0.26530612, 0.27210885, 0.27210885,
          0.27891156, 0.2857143 , 0.292517  , 0.2993197 , 0.30612245,
          0.31292516, 0.31292516, 0.3197279 , 0.3197279 , 0.3265306 ,
          0.33333

In [84]:
results_flat

{'Age:[30 - 50)/BinaryAccuracy': array(0.6909722, dtype=float32),
 'Age:[30 - 50)/BinaryPrecision': array(0.6566265, dtype=float32),
 'Age:[30 - 50)/BinaryRecall': array(0.77304965, dtype=float32),
 'Age:[30 - 50)/BinaryF1Score': array(0.71009773, dtype=float32),
 'Age:[30 - 50)/BinaryAUROC': 0.79712456,
 'Age:[50 - 70)/BinaryAccuracy': array(0.4556962, dtype=float32),
 'Age:[50 - 70)/BinaryPrecision': array(0.41463414, dtype=float32),
 'Age:[50 - 70)/BinaryRecall': array(0.4722222, dtype=float32),
 'Age:[50 - 70)/BinaryF1Score': array(0.44155845, dtype=float32),
 'Age:[50 - 70)/BinaryAUROC': 0.40633073,
 'overall/BinaryAccuracy': array(0.4293478, dtype=float32),
 'overall/BinaryPrecision': array(0.32692307, dtype=float32),
 'overall/BinaryRecall': array(0.49275362, dtype=float32),
 'overall/BinaryF1Score': array(0.39306358, dtype=float32),
 'overall/BinaryAUROC': 0.39382482}

We first need to process the evaluation results to get the metrics in the right format. The descriptions dictionary will appear as you hover on metrics in the report, so feel free to change them as it's appropriate for your usage.

In [85]:
for name, metric in results_flat.items():
    split, name = name.split("/")  # noqa: PLW2901
    descriptions = {
        "BinaryPrecision": "The proportion of predicted positive instances that are correctly predicted.",
        "BinaryRecall": "The proportion of actual positive instances that are correctly predicted. Also known as recall or true positive rate.",
        "BinaryAccuracy": "The proportion of all instances that are correctly predicted.",
        "BinaryAUROC": "The area under the receiver operating characteristic curve (AUROC) is a measure of the performance of a binary classification model.",
        "BinaryAveragePrecision": "The area under the precision-recall curve (AUPRC) is a measure of the performance of a binary classification model.",
        "BinaryF1Score": "The harmonic mean of precision and recall.",
    }
    report.log_quantitative_analysis(
        "performance",
        name=name,
        value=metric.tolist(),
        description=descriptions[name],
        metric_slice=split,
        pass_fail_thresholds=0.7,
        pass_fail_threshold_fns=lambda x, threshold: bool(x >= threshold),
    )

We can also use the `ClassificationPlotter`(as demonstrated in Evaluation example) to plot the performance metrics and the add the figure to the model card using the log_plotly_figure method.

In [67]:
plotter = ClassificationPlotter(task_type="binary", class_names=["0", "1"])
plotter.set_template("plotly_white")

In [86]:
# extracting the ROC curves and AUROC results for all the slices
model_name = "model_for_preds_prob"
roc_curves = {
    slice_name: slice_results["BinaryROC"]
    for slice_name, slice_results in result[model_name].items()
}
aurocs = {
    slice_name: slice_results["BinaryAUROC"]
    for slice_name, slice_results in result[model_name].items()
}

# plotting the ROC curves for all the slices
roc_plot = plotter.roc_curve_comparison(roc_curves, aurocs=aurocs)
report.log_plotly_figure(
    fig=roc_plot,
    caption="ROC Curve for All Patients",
    section_name="quantitative analysis",
)
roc_plot.show()

In [87]:
# Extracting the overall classification metric values.
overall_performance = {
    metric_name: metric_value
    for metric_name, metric_value in result[model_name]["overall"].items()
    if metric_name not in ["BinaryROC", "BinaryPrecisionRecallCurve"]
}
# Plotting the overall classification metric values.
overall_performance_plot = plotter.metrics_value(
    overall_performance,
    title="Overall Performance",
)
report.log_plotly_figure(
    fig=overall_performance_plot,
    caption="Overall Performance",
    section_name="quantitative analysis",
)
overall_performance_plot.show()

In [88]:
# Extracting the metric values for all the slices.
slice_metrics = {
    slice_name: {
        metric_name: metric_value
        for metric_name, metric_value in slice_results.items()
        if metric_name not in ["BinaryROC", "BinaryPrecisionRecallCurve"]
    }
    for slice_name, slice_results in result[model_name].items()
}
# Plotting the metric values for all the slices.
slice_metrics_plot = plotter.metrics_comparison_bar(slice_metrics)
report.log_plotly_figure(
    fig=slice_metrics_plot,
    caption="Slice Metric Comparison",
    section_name="quantitative analysis",
)
slice_metrics_plot.show()

## Report Generation

### ModelCard
First, let's document the model details section. A **ModelCard** has several **Sections** and each **Section** includes multiple **Fields**. Model details could be one of the sections in our model card, and it has the following fields by default:

- **`description`**: A high-level description of the model and its usage for a general audience. 
- **`version`**: The version of the model. 
- **`owners`**: The individuals or organizations that own the model. 
- **`license`**: The license under which the model is made available. 
- **`citation`**: The citation for the model. 
- **`references`**: Links to resources that are relevant to the model. 
- **`path`**: The path to where the model is stored. 
- **`regulatory_requirements`**: The regulatory requirements that are relevant to the model.

We can add additional fields to the model details section by passing a dictionary to the `log_from_dict` method and specifying the section name as `model_details`. You can also use the `log_descriptor` method to add a new field object with a description attribute to any section of the model card.

In [91]:

report.log_from_dict(
    data={
        "name": "Heart Failure Prediction Model",
        "description": "The model was trained on the Kaggle Heart Failure \
        Prediction Dataset to predict risk of heart failure.",
    },
    section_name="model_details",
)

report.log_version(
    version_str="0.0.1",
    date=str(date.today()),
    description="Initial Release",
)
report.log_owner(
    name="CyclOps Team",
    contact="vectorinstitute.github.io/cyclops/",
    email="cyclops@vectorinstitute.ai",
)
report.log_license(identifier="Apache-2.0")
report.log_reference(
    link="https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html",  # noqa: E501
)

### Considerations

Next, let’s populate the considerations section, which includes the following fields by default: 
- **`users`**: The intended users of the model. 
- **`use_cases`**: The use cases for the model. These could be primary, downstream or out-of-scope use cases. 
- **`fairness_assessment`**: A description of the benefits and harms of the model for different groups as well as the steps taken to mitigate the harms. 
- **`ethical_considerations`**: The risks associated with using the model and the steps taken to mitigate them. This can be populated using the log_risk method.

In [92]:
report.log_from_dict(
    data={
        "users": [
            {"description": "Hospitals"},
            {"description": "Clinicians"},
        ],
    },
    section_name="considerations",
)
report.log_user(description="ML Engineers")
report.log_use_case(
    description="Predicting risk of heart failure.",
    kind="primary",
)
report.log_use_case(
    description="Predicting risk of pathologies and conditions other\
    than heart failure.",
    kind="out-of-scope",
)
report.log_fairness_assessment(
    affected_group="sex, age",
    benefit="Improved health outcomes for patients.",
    harm="Biased predictions for patients in certain groups (e.g. older patients) \
        may lead to worse health outcomes.",
    mitigation_strategy="We will monitor the performance of the model on these groups \
        and retrain the model if the performance drops below a certain threshold.",
)
report.log_risk(
    risk="The model may be used to make decisions that affect the health of patients.",
    mitigation_strategy="The model should be continuously monitored for performance \
        and retrained if the performance drops below a certain threshold.",
)

### Exporting report
Once the model card is populated, you can generate the report using the `export` method. The report is generated in the form of an HTML file. A JSON file containing the model card data will also be generated along with the HTML file. By default, the files will be saved in a folder named `cyclops_reports` in the current working directory. You can change the path by passing a `output_dir` argument when instantiating the `ModelCardReport` class.

In [98]:
np.random.seed(42)

synthetic_timestamps = pd.date_range(
    start="1/1/2020", periods=10, freq="D"
).values.astype(str)


report._model_card.overview = None
report_path = report.export(
    output_filename="heart_failure_report_periodic.html",
    synthetic_timestamp=synthetic_timestamps[0],
    last_n_evals=3,
)

shutil.copy(f"{report_path}", ".")
metric_save = None
for i in tqdm(range(len(synthetic_timestamps[1:]))):
    if i == 3:
        report._model_card.quantitative_analysis.performance_metrics.append(
            metric_save,
        )
    report._model_card.overview = None
    for metric in report._model_card.quantitative_analysis.performance_metrics:
        metric.value = np.clip(
            metric.value + np.random.normal(0, 0.1),
            0,
            1,
        )
        metric.tests[0].passed = bool(metric.value >= 0.7)
    if i == 2:
        metrics = []
        for metric in report._model_card.quantitative_analysis.performance_metrics:
            if metric.type == "BinaryAccuracy" and metric.slice == "Age:[30 - 50)":
                metric_save = copy.deepcopy(metric)
            else:
                metrics.append(metric)
        report._model_card.quantitative_analysis.performance_metrics = metrics
    report_path = report.export(
        output_filename="heart_failure_report_periodic.html",
        synthetic_timestamp=synthetic_timestamps[i + 1],
        last_n_evals=3,
    )
    shutil.copy(f"{report_path}", ".")
shutil.rmtree("./cyclops_report")

100%|██████████| 9/9 [00:00<00:00, 11.69it/s]
