This notebook is inspired by the work of CEM (Zarlenga et al. 2022) and CBM (Koh et al. 2020) papers. Please visit their GitHub repositories:
[CEM GitHub](https://github.com/mateoespinosa/cem) and [CBM GitHub](https://github.com/yewsiang/ConceptBottleneck).

# CBM: buggy model metrics - confounded and non-confounded data

There are four main steps:
1. Loading the dataset.
2. Initializing a CBM with InceptionV3 vision backbone for the dataset.
3. Load CBMs
4. Evaluate the models

## Step 1: Load Data

The first step is to load the data. The CBM class, used together with the PyTorch Lightning Trainer, takes in a PyTorch DataLoader object.
Furthermore, each batch it yields needs to contain three elements (in the following order):
1. the sample image, $\mathbf{x}$
2. the image label, $\mathbf{y}$
3. the concept labels, in binary format, $\mathbf{c}$

In [None]:
#from cub_data_module import *
import logging
import torch
import numpy as np
import yaml
import torchvision.models as models
import pytorch_lightning as pl
from scipy.special import softmax
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm

In [None]:
#import sys
#sys.path.append("/kaggle/usr/lib/cub_data_module_NO_SHUFFLE") #CHANGED THIS FOR NCF TRAINING METRICS
#sys.path.append("/kaggle/usr/lib/cub_data_module_NO_SHUFFLE_CF") #CHANGED THIS FOR NCF TRAINING METRICS


#import cub_data_module_no_shuffle as cub_data_module #for evaluating the unconfounded sets
import cub_data_module_no_shuffle_cf as cub_data_module #for evaluating the confounded set
#Make sure to comment out the test and validation metrics at the end of this notebook
#if you want to evaluate the confounded train set.

In [None]:
def _update_config_with_dataset(
    config,
    train_dl,
    n_concepts,
    n_tasks,
    concept_map,
):
    config["n_concepts"] = config.get(
        "n_concepts",
        n_concepts,
    )
    config["n_tasks"] = config.get(
        "n_tasks",
        n_tasks,
    )
    config["concept_map"] = config.get(
        "concept_map",
        concept_map,
    )

    task_class_weights = None

    if config.get('use_task_class_weights', False):
        logging.info(
            f"Computing task class weights in the training dataset with "
            f"size {len(train_dl)}..."
        )
        attribute_count = np.zeros((max(n_tasks, 2),))
        samples_seen = 0
        for i, data in enumerate(train_dl):
            if len(data) == 2:
                (_, (y, _)) = data
            else:
                (_, y, _) = data
            if n_tasks > 1:
                y = torch.nn.functional.one_hot(
                    y,
                    num_classes=n_tasks,
                ).cpu().detach().numpy()
            else:
                y = torch.cat(
                    [torch.unsqueeze(1 - y, dim=-1), torch.unsqueeze(y, dim=-1)],
                    dim=-1,
                ).cpu().detach().numpy()
            attribute_count += np.sum(y, axis=0)
            samples_seen += y.shape[0]
        print("Class distribution is:", attribute_count / samples_seen)
        if n_tasks > 1:
            task_class_weights = samples_seen / attribute_count - 1
        else:
            task_class_weights = np.array(
                [attribute_count[0]/attribute_count[1]]
            )
    return task_class_weights

In [None]:
def _generate_dataset_and_update_config(
    experiment_config
):
    if experiment_config.get("dataset_config", None) is None:
        raise ValueError(
            "A dataset_config must be provided for each experiment run!"
        )

    dataset_config = experiment_config['dataset_config']
    logging.debug(
        f"The dataset's root directory is {dataset_config.get('root_dir')}"
    )
    intervention_config = experiment_config.get('intervention_config', {})
    if dataset_config["dataset"] == "cub":
        data_module = cub_data_module
    else:
        raise ValueError(f"Unsupported dataset {dataset_config['dataset']}!")

    train_dl, val_dl, test_dl, imbalance, (n_concepts, n_tasks, concept_map) = \
        data_module.generate_data(
            config=dataset_config,
            seed=42,
            output_dataset_vars=True,
            root_dir=dataset_config.get('root_dir', None),
            model_inspection=False,
        )
    # For now, we assume that all concepts have the same
    # aquisition cost
    acquisition_costs = None
    if concept_map is not None:
        intervened_groups = list(
            range(
                0,
                len(concept_map) + 1,
                intervention_config.get('intervention_freq', 1),
            )
        )
    else:
        intervened_groups = list(
            range(
                0,
                n_concepts + 1,
                intervention_config.get('intervention_freq', 1),
            )
        )

    task_class_weights = _update_config_with_dataset(
        config=experiment_config,
        train_dl=train_dl,
        n_concepts=n_concepts,
        n_tasks=n_tasks,
        concept_map=concept_map,
    )
    return (
        train_dl,
        val_dl,
        test_dl,
        imbalance,
        concept_map,
        intervened_groups,
        task_class_weights,
        acquisition_costs,
    )

In [None]:
# Location of the experiment configuration; for local development the whole
# path might be needed.
yaml_path = "data/cub.yaml"

with open(yaml_path, "r") as file:
    yaml_config = yaml.safe_load(file)

# Environment-specific overrides of the dataset settings.
yaml_config["shared_params"]["dataset_config"].update(
    {
        # For Kaggle; replace this with the locally downloaded folder.
        "root_dir": "/kaggle/input/cem-cub2000-filtered/",
        # Change both of these depending on the resources available.
        "num_workers": 4,
        "batch_size": 64,
    }
)

In [None]:
# Build the dataloaders and dataset-derived settings from the shared
# experiment parameters (this also updates those parameters in place).
(
    train_dl,
    val_dl,
    test_dl,
    imbalance,
    concept_map,
    intervened_groups,
    task_class_weights,
    acquisition_costs,
) = _generate_dataset_and_update_config(yaml_config["shared_params"])

## Step 2: Create the CBM
### Step 2.1 Define model for input to concepts
We first need to define an architecture that will extract the concepts from the input image.

For this, we used a pre-trained InceptionV3 model. We remove the last linear layer and make one that we can use for our task, so it is ready for fine-tuning.

In [None]:
def latent_code_generator_model(output_dim=112):
    """Build a pre-trained InceptionV3 whose final layer emits ``output_dim`` values.

    The network is created with torchvision's default InceptionV3 weights, its
    auxiliary classifier is disabled and removed, and its final fully
    connected layer is replaced by a freshly initialised linear layer.

    Args:
        output_dim: Number of outputs of the replacement final layer.

    Returns:
        The modified InceptionV3 module.
    """
    backbone = models.inception_v3(weights=models.Inception_V3_Weights.DEFAULT)

    # Switch the auxiliary head off and drop its branch from the network.
    backbone.aux_logits = False
    backbone.AuxLogits = None

    # Input width of the classification layer being replaced.
    feature_dim = 2048
    backbone.fc = torch.nn.Linear(feature_dim, output_dim)

    return backbone

### Step 2.2: define CBM model.
We need to define the following:
1. `n_concepts`: the number of concepts in the dataset (112).
2. `n_tasks`: the number of output labels in the dataset (200).
3. `concept_loss_weight`: the weight to use for the concept prediction loss during training of the CBM. Picked to be the same as the CEM paper.
4. `learning_rate` and `optimizer`: to use during training. Optimizer is Adam by default, otherwise SGD.
5. `c_extractor_arch`: the model architecture to use for going from the input space to the concepts.
6. `c2y_model` and `c2y_layers`: the model architecture to use for going from the concepts to the labels. It can be directly the model, like c_extractor_arch or the layers as a list. We choose to do a simple linear layer.

In [None]:
from src.utils_cbm import *
from src.cbm import ConceptBottleneckModel

## Step 3: Load the CBMs

Now that we have both the dataset and the model defined, we can load our trained CBMs
from their checkpoints using PyTorch Lightning's `load_from_checkpoint`. Inference is
then run through PyTorch Lightning's `Trainer` in the evaluation step:

For more details on all the things you may add/configure to the Trainer for more
control, please refer to the [official documentation](https://lightning.ai/docs/pytorch/stable/common/trainer.html).

In [None]:
def load_model(typ):
    """Restore a ``ConceptBottleneckModel`` from the checkpoint named ``typ``.

    The concept loss weight and learning rate are read from the module-level
    ``yaml_config["shared_params"]``; the remaining constructor arguments are
    fixed in this function.

    Args:
        typ: Checkpoint file stem; the file loaded is
            ``/kaggle/input/oracles-v2/{typ}.ckpt``.

    Returns:
        The object returned by ``ConceptBottleneckModel.load_from_checkpoint``.
    """
    shared_params = yaml_config["shared_params"]
    return ConceptBottleneckModel.load_from_checkpoint(
        checkpoint_path=f"/kaggle/input/oracles-v2/{typ}.ckpt",
        n_concepts=112,
        n_tasks=200,
        concept_loss_weight=shared_params["concept_loss_weight"],
        # The learning rate to use during training.
        learning_rate=shared_params["learning_rate"],
        optimizer="sgd",
        # Generating function for the latent code generator model.
        c_extractor_arch=latent_code_generator_model,
        c2y_model=None,
    )

## Step 4: evaluation
Now, we evaluate the model

In [None]:
# Before anything, however, let's get the underlying numpy arrays of our
# datasets as they will be easier to work with
def _collect_arrays(dataloader):
    """Stack every ``(x, y, c)`` batch of ``dataloader`` into numpy arrays.

    Args:
        dataloader: Iterable yielding batches that unpack into exactly three
            items ``(x, y, c)``.

    Returns:
        A tuple ``(x_all, y_all, c_all)`` where each entry is the
        concatenation, along axis 0, of the corresponding item of all batches
        in iteration order.
    """
    xs, ys, cs = [], [], []
    for (x, y, c) in tqdm(dataloader):
        xs.append(x)
        ys.append(y)
        cs.append(c)
    return (
        np.concatenate(xs, axis=0),
        np.concatenate(ys, axis=0),
        np.concatenate(cs, axis=0),
    )


# Same order as before: test, then train, then validation.
x_test, y_test, c_test = _collect_arrays(test_dl)
x_train, y_train, c_train = _collect_arrays(train_dl)
x_val, y_val, c_val = _collect_arrays(val_dl)

In [None]:
def evaluate_model(dataloader, cbm, x, y, c):
    """Run batched inference with ``cbm`` and report task accuracy and concept AUC.

    Args:
        dataloader: Batches to run prediction on.
        cbm: Model handed to ``Trainer.predict``. Every per-batch result must
            be indexable, with the concept predictions at index 0 and the
            task logits at index 2.
        x: Unused; kept so that existing calls keep working.
        y: Ground-truth class labels. They are compared element-wise with the
            predictions, so they must be in the same order as the samples
            yielded by ``dataloader``.
        c: Ground-truth concept labels, in the same order as ``y``.

    Returns:
        ``(y_pred_classes, c_pred)``: the predicted class index per sample and
        the concatenated concept predictions.
    """
    # Prefer the GPU, but fall back to the CPU so the evaluation also runs on
    # machines without CUDA instead of failing when the Trainer is created.
    accelerator = "gpu" if torch.cuda.is_available() else "cpu"

    # We use a Trainer object to run inference in batches over the dataset.
    trainer_inference = pl.Trainer(
        accelerator=accelerator,
        devices="auto",
        logger=False,  # No logs to be dumped for this trainer
    )
    batch_results = trainer_inference.predict(cbm, dataloader)

    # Combine the per-batch results into numpy arrays by joining over the
    # batch dimension.
    c_pred = np.concatenate(
        [batch[0].detach().cpu().numpy() for batch in batch_results],
        axis=0,
    )
    y_pred = np.concatenate(
        [batch[2].detach().cpu().numpy() for batch in batch_results],
        axis=0,
    )

    # y_pred holds one score per class for every sample while y holds one
    # label per sample: turn the scores into class probabilities and keep the
    # most likely class.
    y_prob = softmax(y_pred, axis=1)
    y_pred_classes = np.argmax(y_prob, axis=1)

    # NOTE: both messages say "test" regardless of which split was passed in.
    ##########
    ## Compute task accuracy
    ##########
    task_accuracy = accuracy_score(y, y_pred_classes)
    print(f"Our CBM's test task accuracy is {task_accuracy*100:.2f}%")

    ##########
    ## Compute concept AUC
    ##########
    concept_auc = roc_auc_score(c, c_pred)
    print(f"Our CBM's test concept AUC is {concept_auc*100:.2f}%")

    return y_pred_classes, c_pred

In [None]:
# Evaluate one checkpoint per value of i; range(10, 59, 10) yields
# 10, 20, 30, 40 and 50 (presumably the number of training epochs, per the
# printed label).
for i in range(10, 59, 10):
    # NOTE(review): the prefix is spelled "cmb", not "cbm" — confirm it
    # matches the checkpoint file names read by load_model.
    typ = f"cmb_{i}"
    cbm = load_model(typ)
    print(f"Model with epochs {i}")
    print("TRAIN")
    y_pred_classes, c_pred = evaluate_model(train_dl, cbm, x_train, y_train, c_train)
    # The TEST/VAL evaluations below are a manual toggle and are disabled
    # by default.
    #uncomment to see the results for the non-confounded test, val set
    #print("TEST")
    #y_pred_classes, c_pred = evaluate_model(test_dl, cbm, x_test, y_test, c_test)
    #print("VAL")
    #y_pred_classes, c_pred = evaluate_model(val_dl, cbm, x_val, y_val, c_val)
    print("="*80)