Assess Spaceship Titanic with CredoAI - Lens
============================================

In [1]:
import pandas as pd
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from credoai.lens import Lens
from credoai.artifacts import ClassificationModel, TabularData
from credoai.evaluators import ModelFairness, Performance


Load trained model
------------------


In [4]:
ROOT_PATH = './input_data/'
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

train_data = pd.read_csv(ROOT_PATH+TRAIN_FILE)
test_data = pd.read_csv(ROOT_PATH+TEST_FILE)

In [5]:
def feature_engineering(data):
    return (
            data
            .assign(
                PassengerGGG = [x.split('_')[-0] for x in data['PassengerId']],
                PassengerPP = [x.split('_')[-1] for x in data['PassengerId']],
            )
            .drop(columns=['Name', 'PassengerId'])
        )

fe_eng = FunctionTransformer(feature_engineering)

fe_eng.fit_transform(train_data)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGGG,PassengerPP
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,0001,01
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,0002,01
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0003,01
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0003,02
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,0004,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,9276,01
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,9278,01
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,9279,01
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,9280,01


In [6]:
with open('model.jlb', 'rb') as file:
    model = joblib.load(file)

In [7]:
imputer = ColumnTransformer(
    [
        (
            'label_imputer',
            SimpleImputer(strategy='most_frequent'),
            ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'PassengerGGG', 'PassengerPP']
        ),
        (
            'num_imputer',
            SimpleImputer(strategy='mean'),
            ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        )
    ],
    verbose_feature_names_out=False,
    remainder='drop'
).set_output(transform='pandas')

imputer.fit_transform(fe_eng.fit_transform(train_data))

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,PassengerGGG,PassengerPP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,TRAPPIST-1e,False,0001,01,39.0,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,False,0002,01,24.0,109.0,9.0,25.0,549.0,44.0
2,Europa,False,TRAPPIST-1e,True,0003,01,58.0,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,TRAPPIST-1e,False,0003,02,33.0,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,TRAPPIST-1e,False,0004,01,16.0,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,True,9276,01,41.0,0.0,6819.0,0.0,1643.0,74.0
8689,Earth,True,PSO J318.5-22,False,9278,01,18.0,0.0,0.0,0.0,0.0,0.0
8690,Earth,False,TRAPPIST-1e,False,9279,01,26.0,0.0,0.0,1872.0,1.0,0.0
8691,Europa,False,55 Cancri e,False,9280,01,32.0,0.0,1049.0,0.0,353.0,3235.0


In [8]:
scale_encode = ColumnTransformer(
    [
        (
            'std_scaler',
            StandardScaler(),
            ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        ),
        (
            'minmax_scaler',
            MinMaxScaler(),
            ['PassengerGGG', 'PassengerPP']
        ),
        (
            'one_hot',
            OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'),
            ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
        )
    ],
    verbose_feature_names_out=False
).set_output(transform='pandas')

scale_encode.fit_transform(imputer.fit_transform(fe_eng.fit_transform(train_data)))

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGGG,PassengerPP,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
0,0.709437,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.000000,0.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.336717,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,0.000108,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,2.034566,-0.275409,1.955616,-0.290817,5.694289,-0.225782,0.000216,0.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.290975,-0.340590,0.517406,0.330225,2.683471,-0.098708,0.000216,0.142857,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,-0.894666,0.118709,-0.243409,-0.038048,0.225732,-0.267258,0.000323,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.848924,-0.340590,3.989682,-0.290817,1.184286,-0.203720,0.999569,0.000000,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
8689,-0.755179,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.999784,0.000000,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
8690,-0.197230,-0.340590,-0.287314,2.842851,-0.275774,-0.269023,0.999892,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
8691,0.221232,-0.340590,0.370637,-0.290817,0.037223,2.585740,1.000000,0.000000,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [9]:
preproc = Pipeline(
    [
        ('fe_eng', fe_eng),
        ('imputer', imputer),
        ('scale_encode', scale_encode)
    ]
).set_output(transform='pandas')

preproc.fit_transform(train_data)

With transform="pandas", `func` should return a DataFrame to follow the set_output API.


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGGG,PassengerPP,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
0,0.709437,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.000000,0.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.336717,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,0.000108,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,2.034566,-0.275409,1.955616,-0.290817,5.694289,-0.225782,0.000216,0.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.290975,-0.340590,0.517406,0.330225,2.683471,-0.098708,0.000216,0.142857,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,-0.894666,0.118709,-0.243409,-0.038048,0.225732,-0.267258,0.000323,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.848924,-0.340590,3.989682,-0.290817,1.184286,-0.203720,0.999569,0.000000,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
8689,-0.755179,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.999784,0.000000,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
8690,-0.197230,-0.340590,-0.287314,2.842851,-0.275774,-0.269023,0.999892,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
8691,0.221232,-0.340590,0.370637,-0.290817,0.037223,2.585740,1.000000,0.000000,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
imputer = ColumnTransformer(
    [
        (
            'label_imputer',
            SimpleImputer(strategy='most_frequent'),
            ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'PassengerGGG', 'PassengerPP']
        ),
        (
            'num_imputer',
            SimpleImputer(strategy='mean'),
            ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        )
    ],
    verbose_feature_names_out=False,
    remainder='drop'
).set_output(transform='pandas')

imputer.fit_transform(fe_eng.fit_transform(train_data))

In [None]:
credo_model = ClassificationModel(name="titanic_default_classifier", model_like=model)
credo_data = TabularData(
    name="titanic-default",
    X=train_data,
    y=train_data.Transported,
    sensitive_features=train_data.Age,
)
lens = Lens(model=credo_model, assessment_data=credo_data)
metrics = ['precision_score', 'recall_score', 'equal_opportunity']
lens.add(ModelFairness(metrics=metrics))
lens.add(Performance(metrics=metrics))
lens