In [5]:
import polars as pl
import numpy as np
import pandas as pd
import altair as alt
import os
import wget
import zipfile

from os.path import exists
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

from mapie.classification import MapieClassifier
from mapie.metrics import classification_coverage_score
from mapie.metrics import classification_coverage_score_v2
from mapie.metrics import classification_mean_width_score

In [2]:
# Local path of the Dry Bean spreadsheet and the pieces of the UCI archive URL.
beans = "./DryBeanDataset/Dry_Bean_Dataset.xlsx"
base = "https://archive.ics.uci.edu/ml/machine-learning-databases/"
dataset_number = "00602"

# Download and unpack the archive only when the spreadsheet is not on disk yet,
# so re-running the notebook does not hit the network again.
if not exists(beans):
    filename = "DryBeanDataset.zip"
    url = f"{base}{dataset_number}/{filename}"
    # wget.download returns the path it actually wrote. When a file with the
    # target name already exists it saves under a suffixed name instead, so the
    # returned path (not `filename`) is the one to open and clean up.
    archive_path = wget.download(url)
    try:
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall("./")
    finally:
        # Remove the archive even if extraction fails, so a broken download
        # is not left behind for the next run.
        os.remove(archive_path)

# Read the spreadsheet from the path stored in `beans` (the same path the
# download step checks) and convert it to pandas for the label encoding.
bean_df = pl.read_excel(beans).to_pandas()

# Replace the class labels by integer codes; `le` is kept around so the
# codes can be mapped back with le.inverse_transform.
le = LabelEncoder()
bean_df["Class"] = le.fit_transform(bean_df["Class"])

# Target and feature table, converted back to polars objects.
y = pl.Series(bean_df["Class"])
X = pl.DataFrame(bean_df.drop(columns=["Class"]))

# Carve the data into four disjoint parts by splitting the remainder repeatedly:
#   train (10000 rows) -> fits the model
#   test  (1000 rows)  -> plain model evaluation
#   calib (1000 rows)  -> calibrates the conformal predictor
#   new   (the rest)   -> evaluates the prediction sets
X_train, X_rest1, y_train, y_rest1 = train_test_split(
    X, y, train_size=10000, random_state=2
)
X_test, X_rest2, y_test, y_rest2 = train_test_split(
    X_rest1, y_rest1, train_size=1000, random_state=42
)
X_calib, X_new, y_calib, y_new = train_test_split(
    X_rest2, y_rest2, train_size=1000, random_state=42
)

In [3]:
# Gaussian naive Bayes classifier, fitted on the training split only.
model = GaussianNB()
model.fit(X_train, y_train)

# Point predictions of the fitted model for the test split.
y_pred = model.predict(X_test)

The general recipe for conformal classification (in MAPIE this corresponds to the `score` method, called `lac` in newer releases) is the following:
- split your data into train, calibration and test (sometimes called new)
- train the model on the training data (which is sometimes split internally into validation sets, depending on the approach)
- Compute the non-conformity scores on the calibration set: use the model's `predict_proba` method to predict the class probabilities,
calculate the score $s$ for the true label of each sample, which is $s = 1 - \hat{p}(\text{true label})$,
sort the scores from low to high, and find the threshold below which 95 % of the scores $s$ fall.
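- Calculate the quantile level and correct it for the sample size $n$ of the calibration set, $q_{level} = \lceil (n + 1)(1 - \alpha) \rceil / n$; the correction only matters for small sample sizes
- Calculate the `qlevel` quantile of the scores $s$; this quantile is the threshold $\hat{q}$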
- Take your test/new data and compute $s$ for all classes of $y$ by calling `predict_proba` again. This is used to decide which classes out of a set of classes like
lion, mouse, dog, cat make it into the prediction set (the set may contain several classes, reflecting that the answer is not a binary choice but sometimes a multiple choice). The decision rule is as follows.
- The decision is made with a threshold: a class enters the prediction set when its score $s$ does not exceed the threshold. This threshold comes from the calibration set; it can be calculated neither on the training set nor on the test set itself.

In MAPIE we can use the built-in `score` method to cover all of these steps.

In [16]:
# Wrap the classifier for conformal prediction. cv="prefit" tells MAPIE that
# `model` is already fitted, so MAPIE only has to calibrate it.
# "lac" is the current name of the method formerly called "score": the same
# algorithm, but the old name is deprecated.
mapie_score = MapieClassifier(model, cv="prefit", method="lac")
# TODO: replace cv="prefit" with a scikit-learn cross-validation splitter (fold).
# MAPIE's cross-validation based strategies (Jackknife+, CV+) are generally
# considered more reliable than cv="prefit", although they can be
# computationally more expensive.

In [17]:
# Calibrate the conformal wrapper on the calibration split, which was used
# neither for training the model nor for the evaluation below.
mapie_score.fit(X_calib, y_calib)

In [18]:
# Point predictions and prediction sets for the unseen split at alpha = 0.05,
# i.e. a target coverage of 95 %.
# Note: this rebinds `y_pred`, which previously held the test-split predictions.
y_pred, y_set = mapie_score.predict(X_new, alpha=0.05)

# MAPIE returns the sets with a trailing axis holding one entry per alpha value.
# Drop exactly that axis: a bare np.squeeze would also remove the sample or
# class axis if either happened to have length 1, and axis=-1 fails loudly
# if more than one alpha was requested.
y_set = np.squeeze(y_set, axis=-1)

In [19]:
# Empirical coverage on the unseen split: compares the true labels with the
# prediction sets. It should be close to the 1 - alpha = 0.95 target.
classification_coverage_score(y_new, y_set)

0.962756052141527

In [20]:
# Mean width of the prediction sets; smaller sets at the same coverage are
# more informative.
setsize = classification_mean_width_score(y_set)
setsize

1.8317815021725636