In [53]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
# Put the directory two levels above the current working directory on the
# import path (presumably so that the `simpml` package below resolves).
cwd = Path.cwd()
ROOT_PATH = str(cwd.parent.parent)
sys.path.append(ROOT_PATH)
# NOTE(review): star import -- the library names used throughout this notebook
# (and presumably `np`, which is not imported explicitly) come from here.
from simpml.tabular.all import *
# Seed NumPy's global random number generator.
np.random.seed(0)
# Render matplotlib figures inline in the notebook.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Tabular Unsupervised Use Cases

## Anomaly Detection

### Data Manager

In [54]:
# Load the Titanic dataset through the library's DataSet helper.
data = DataSet.load_titanic_dataset()

In [55]:
# Data manager for the anomaly-detection use case: 'Survived' is the label
# column, and the random splitter is asked for a 60/20/20
# train/validation/test split with stratification enabled.
data_manager = UnsupervisedTabularDataManager(
    data=data,
    prediction_type=PredictionType.AnomalyDetection,
    target='Survived',
    splitter=RandomSplitter(
        target='Survived',
        split_sets={Dataset.Train: 0.6, Dataset.Valid: 0.2, Dataset.Test: 0.2},
        stratify=True,
    ),
)

In [56]:
# Build the preprocessing pipeline; the recorded output below lists its steps.
data_manager.build_pipeline()

Sklearn Pipeline:
MatchVariablesBefore (MatchVariables(missing_values='ignore')) ->
NanColumnDropper (NanColumnDropper()) ->
Infinity2Nan (Infinity2Nan()) ->
MinMaxScaler (MinMaxScalerWithColumnNames()) ->
HighCardinalityDropper (HighCardinalityDropper()) ->
AddMissingIndicator (AddMissingIndicator()) ->
NumericalImputer (MeanMedianImputer()) ->
SafeCategoricalImputer (SafeCategoricalTransformer(transformer_cls=<class 'feature_engine.imputation.categorical.CategoricalImputer'>)) ->
SafeOneHotEncoder (SafeCategoricalTransformer(transformer_cls=<class 'feature_engine.encoding.one_hot.OneHotEncoder'>)) ->
MatchVariablesAfter (MatchVariables(missing_values='ignore'))

In [57]:
# Fetch the training features and labels, then display the features.
# NOTE(review): the recorded output shows PassengerId (a row identifier) and
# many one-hot Cabin_* columns among the features -- confirm they are meant to
# be fed to the models.
X, y = data_manager.get_training_data()
X

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Age_na,Cabin_na,Sex_male,Sex_female,...,Cabin_F G63,Cabin_A14,Cabin_D49,Cabin_C87,Cabin_D56,Cabin_D35,Cabin_C62 C64,Embarked_S,Embarked_Q,Embarked_C
570,0.640449,0.5,0.773813,0.000,0.0,0.020495,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
787,0.884270,1.0,0.095250,0.500,0.2,0.056848,0,1,1,0,...,0,0,0,0,0,0,0,0,1,0
74,0.083146,1.0,0.396833,0.000,0.0,0.110272,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
113,0.126966,1.0,0.246042,0.125,0.0,0.019177,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
635,0.713483,0.5,0.346569,0.000,0.0,0.025374,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.119101,1.0,0.258608,0.000,0.0,0.014932,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
270,0.303371,0.0,0.346569,0.000,0.0,0.060508,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0
860,0.966292,1.0,0.509927,0.250,0.0,0.027538,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
435,0.488764,0.0,0.170646,0.125,0.4,0.234224,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [58]:
# Fetch the validation features and labels, then display the features.
# X and y are rebound here, replacing the training data from the previous cell.
X, y = data_manager.get_validation_data()
X

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Age_na,Cabin_na,Sex_male,Sex_female,...,Cabin_F G63,Cabin_A14,Cabin_D49,Cabin_C87,Cabin_D56,Cabin_D35,Cabin_C62 C64,Embarked_S,Embarked_Q,Embarked_C
849,0.953933,0.0,0.346569,0.125,0.0,0.173920,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
331,0.371910,0.0,0.566474,0.000,0.0,0.055628,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
260,0.292135,1.0,0.346569,0.000,0.0,0.015127,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
316,0.355056,0.5,0.296306,0.125,0.0,0.050749,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
292,0.328090,0.5,0.447097,0.000,0.0,0.025130,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,0.349438,0.0,0.220910,0.250,0.4,0.512122,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
670,0.752809,0.5,0.497361,0.125,0.2,0.076123,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
172,0.193258,1.0,0.007288,0.125,0.2,0.021731,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
587,0.659551,0.0,0.748681,0.125,0.2,0.154588,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


### Experiment Manager

In [59]:
# Experiment manager over the anomaly-detection data manager, with AUC as the
# metric to optimise.
exp_mang = ExperimentManager(data_manager, optimize_metric = MetricName.AUC)

In [60]:
# Show the table of models available to this experiment manager.
exp_mang.display_models_pool()

Unnamed: 0,Name,Description,Source,Is Available
0,Isolation Forest,Default settings,Pool,True
1,Feature Bagging,Default settings,Pool,True
2,LODA,Default settings,Pool,True


In [61]:
# Show the table of available metrics; in the recorded output the
# 'Is Optimal' column is True only for AUC, the metric passed as optimize_metric.
exp_mang.display_metrics_pool()

Unnamed: 0,Name,Description,Source,Is Available,Is Optimal
0,AUC,,Pool,True,True
1,Recall,,Pool,True,False
2,Balanced Accuracy,,Pool,True,False
3,F1,,Pool,True,False
4,Precision,,Pool,True,False
5,Accuracy,,Pool,True,False


In [62]:
def find_contamination(data_manager, anomaly_label=1):
    """Return the share of validation labels equal to ``anomaly_label``.

    Parameters
    ----------
    data_manager
        Object exposing ``get_training_data()`` and ``get_validation_data()``.
        The second item returned by ``get_validation_data()`` must be a pandas
        Series holding the labels.
    anomaly_label
        Label value that marks an anomalous sample. Defaults to ``1``.

    Returns
    -------
    float
        Proportion of the non-missing validation labels that equal
        ``anomaly_label``.

    Raises
    ------
    KeyError
        If ``anomaly_label`` does not occur among the validation labels.
    """
    # The result is discarded on purpose; the call is kept from the original.
    # NOTE(review): presumably this ensures the data manager has processed the
    # training split before validation data is requested -- confirm.
    data_manager.get_training_data()
    labels = data_manager.get_validation_data()[1]
    # normalize=True yields per-label proportions directly; value_counts
    # ignores missing labels by default.
    proportions = labels.value_counts(normalize=True)
    # .loc is an explicit label lookup, so it can never be interpreted as a
    # positional index when the labels are not integers.
    return proportions.loc[anomaly_label]

In [63]:
# Use the label share computed by find_contamination as the `contamination`
# keyword handed to the models for this experiment run.
contamination = find_contamination(data_manager)
exp_mang.run_experiment(models_kwargs=dict(contamination=contamination))

Unnamed: 0,Experiment ID,Model,Description,Data Version,Data Description,Model Params,Metric Params,Accuracy,AUC,Recall,Precision,Balanced Accuracy,F1,Run Time
0,aef4ac05,Feature Bagging,Default settings,d497703a,,{'contamination': 0.3651685393258427},{},0.606742,0.494214,0.076923,0.333333,0.494214,0.125,0:00:00
1,aef4ac05,LODA,Default settings,d497703a,,{'contamination': 0.3651685393258427},{},0.651685,0.539415,0.123077,0.615385,0.539415,0.205128,0:00:00
2,aef4ac05,Isolation Forest,Default settings,d497703a,,{'contamination': 0.3651685393258427},{},0.651685,0.536147,0.107692,0.636364,0.536147,0.184211,0:00:00


In [64]:
# Retrieve the best model from the experiment manager.
# NOTE(review): the recorded output shows LODA(contamination=0.1, ...) although
# run_experiment was given contamination of about 0.365 -- confirm whether the
# returned model reflects the models_kwargs used in the experiment.
exp_mang.get_best_model()

Model: LODA(contamination=0.1, n_bins=10, n_random_cuts=100), Description: Default settings

## Clustering

### Data Manager

In [65]:
# Data manager for the clustering use case: the data is given as a CSV path and
# the splitter is selected by name; no target column is passed.
# NOTE(review): the path is relative -- confirm which directory the library
# resolves it against.
data_manager = UnsupervisedTabularDataManager(
    data='datasets/binary/Titanic.csv',
    prediction_type=PredictionType.Clustering,
    splitter='Random',
)

In [66]:
# Build the preprocessing pipeline for the clustering data manager; the
# recorded output below lists its steps.
data_manager.build_pipeline()

Sklearn Pipeline:
MatchVariablesBefore (MatchVariables(missing_values='ignore')) ->
NanColumnDropper (NanColumnDropper()) ->
Infinity2Nan (Infinity2Nan()) ->
MinMaxScaler (MinMaxScalerWithColumnNames()) ->
HighCardinalityDropper (HighCardinalityDropper()) ->
AddMissingIndicator (AddMissingIndicator()) ->
NumericalImputer (MeanMedianImputer()) ->
SafeCategoricalImputer (SafeCategoricalTransformer(transformer_cls=<class 'feature_engine.imputation.categorical.CategoricalImputer'>)) ->
SafeOneHotEncoder (SafeCategoricalTransformer(transformer_cls=<class 'feature_engine.encoding.one_hot.OneHotEncoder'>)) ->
MatchVariablesAfter (MatchVariables(missing_values='ignore'))

### Experiment Manager

In [67]:
# Experiment manager over the clustering data manager, with the
# Calinski-Harabasz metric as the one to optimise. This rebinds `exp_mang`.
exp_mang = ExperimentManager(data_manager, optimize_metric = MetricName.CalinskiHarabasz)

In [68]:
# Show the table of clustering models available to this experiment manager.
exp_mang.display_models_pool()

Unnamed: 0,Name,Description,Source,Is Available
0,K-Means,Default settings,Pool,True
1,Gaussian Mixture,Default settings,Pool,True


In [69]:
# Show the table of available clustering metrics; in the recorded output the
# 'Is Optimal' column is True only for the Calinski-Harabasz score.
exp_mang.display_metrics_pool()

Unnamed: 0,Name,Description,Source,Is Available,Is Optimal
0,Calinski-Harabasz Score,,Pool,True,True
1,Silhouette Score,,Pool,True,False
2,Davies-Bouldin Score,,Pool,True,False


In [70]:
# Run the clustering experiment without passing any models_kwargs; the
# recorded results table shows empty Model Params for every model.
exp_mang.run_experiment()

Unnamed: 0,Experiment ID,Model,Description,Data Version,Data Description,Model Params,Metric Params,Silhouette Score,Davies-Bouldin Score,Calinski-Harabasz Score,Run Time
0,42909183,K-Means,Default settings,bf2c1a54,,{},{},0.317361,1.44192,41.77338,0:00:00
1,42909183,Gaussian Mixture,Default settings,bf2c1a54,,{},{},0.309324,1.536772,51.01327,0:00:00


In [71]:
# Retrieve the best model from the clustering experiment manager; in the
# recorded run this is the Gaussian Mixture, which has the highest
# Calinski-Harabasz score in the results table.
exp_mang.get_best_model()

Model: GaussianMixture(n_components=2), Description: Default settings