In [51]:
import pandas as pd
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Read the training and test tables from the data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_data.csv', '../data/test_data.csv')
)

In [52]:
# Define the training data.
# Features: every column except the target ('mineralType') and the 'id' column.
X = train.drop(columns=['mineralType', 'id'])
# Target: the 'mineralType' column.
y = train['mineralType']

In [53]:
# Cast the discrete-valued feature columns to pandas' categorical dtype in one call.
X = X.astype({
    column_name: 'category'
    for column_name in (
        'planetSection',
        'geoZone',
        'rockSize',
        'cover',
        'magmaConcentrationDistance',
    )
})

In [54]:
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy

# Create sklearn pipeline for data preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, recall_score, precision_score
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import set_config
import numpy as np
import mlflow
import math
from joblib import Memory
from shutil import rmtree

mlflow.sklearn.autolog()
mlflow.set_experiment("autosklearn_preprocessor")


def fahrenheit_to_celsius(f):
    """Convert temperatures from degrees Fahrenheit to degrees Celsius."""
    return (f - 32) / 1.8


def feet_to_meters(f):
    """Convert distances from feet to meters."""
    return f * 0.3048


# Column-wise preprocessing.
# Named functions are used instead of lambdas inside FunctionTransformer so the
# fitted transformer can be pickled and shows a readable name in logged params.
# NOTE: CreateVariables (defined below in this cell) reads the first four output
# columns by position, so the first four transformers must keep this order.
# NOTE(review): IterativeImputer is applied to a single column here, so it has no
# other features to regress on -- confirm this is intended (it is likely
# equivalent to a plain mean imputation).
preprocessor = ColumnTransformer([
        # (name, transformer, columns)
        # impute the -999.0 sentinel, convert from Fahrenheit to Celsius, then robust-scale
        ("temperatureFirstHalfPlanetRotation",
         make_pipeline(
             IterativeImputer(missing_values=-999.0),
             FunctionTransformer(fahrenheit_to_celsius, feature_names_out="one-to-one"),
             RobustScaler(),
         ),
         ['temperatureFirstHalfPlanetRotation']),
        # standardize the column
        ("temperatureSecondHalfPlanetRotation", StandardScaler(), ['temperatureSecondHalfPlanetRotation']),
        # convert from feet to meters, then standardize
        ("waterStreamDistanceX",
         make_pipeline(
             FunctionTransformer(feet_to_meters, feature_names_out="one-to-one"),
             StandardScaler(),
         ),
         ['waterStreamDistanceX']),
        # standardize the column
        ("waterStreamDistanceY", StandardScaler(), ['waterStreamDistanceY']),
        # one-hot encode the planetSection column (unknown categories are ignored)
        ("planetSection", OneHotEncoder(handle_unknown = "ignore"), ['planetSection']),
        # one-hot encode the cover column and drop the first column (the one with the missing values == 0);
        # unknown categories raise an error
        ("cover", OneHotEncoder(handle_unknown='error', drop='first'), ['cover']),
        # ordinal encode the climaticZone column; unknown categories are encoded as -1
        # TODO: drop category 3? what to do? only one row has a 3
        ("climaticZone", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ['climaticZone']),
        # one-hot encode the geoZone column TODO: drop category 5?
        ("geoZone", OneHotEncoder(handle_unknown = "ignore"), ['geoZone']),
        # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0)
        ("rockSize", OneHotEncoder(handle_unknown='ignore', drop='first'), ['rockSize']),
        # one-hot encode the magmaConcentrationDistance column (no category is dropped)
        # TODO: use Ordinal Encoder?
        ("magmaConcentrationDistance", OneHotEncoder(handle_unknown = "ignore"), ['magmaConcentrationDistance']),
        # impute the -999.0 sentinel, then robust-scale
        ("mineralDensity", make_pipeline(IterativeImputer(missing_values=-999.0), RobustScaler()), ['mineralDensity']),
        # standardize the column TODO: convert km to m?
        ("detectionDepth", StandardScaler(), ['detectionDepth']),
        # standardize the column TODO: values > 360? do x - 360
        ("longitude", StandardScaler(), ['longitude']),
    ],
    # keep the original feature names and pass any unlisted column through untouched
    verbose_feature_names_out=False, remainder='passthrough'
)

def euclidean_distance(x, y):
    """Return the element-wise Euclidean norm ``sqrt(x**2 + y**2)``.

    Parameters
    ----------
    x, y : array-like of numbers
        The two coordinate components; they must have the same shape
        (or be broadcastable against each other).

    Returns
    -------
    numpy.ndarray
        Float array holding one distance per input element.
    """
    # np.hypot is vectorized (no Python-level loop), does not overflow in the
    # intermediate squares, and does not rely on positional ``x[i]`` indexing,
    # so label-indexed inputs (e.g. a pandas Series) work as well.
    return np.hypot(np.asarray(x, dtype=float), np.asarray(y, dtype=float))

class CreateVariables(BaseEstimator, TransformerMixin):
    """Append two engineered features to a 2-D feature matrix.

    The following columns are appended, in this order, to the right of the input:

    1. the Euclidean norm of the two water-stream distance columns;
    2. the arithmetic mean of the two temperature columns.

    Columns are addressed by position. The defaults (0, 1, 2, 3) are the
    positions this transformer has always used; they correspond to the first
    four transformers of the ``preprocessor`` ColumnTransformer.

    Parameters
    ----------
    water_x_idx : int, default=2
        Position of the waterStreamDistanceX column.
    water_y_idx : int, default=3
        Position of the waterStreamDistanceY column.
    temp_first_idx : int, default=0
        Position of the temperatureFirstHalfPlanetRotation column.
    temp_second_idx : int, default=1
        Position of the temperatureSecondHalfPlanetRotation column.
    """

    def __init__(self, water_x_idx=2, water_y_idx=3, temp_first_idx=0, temp_second_idx=1):
        # Only store the parameters unchanged, as BaseEstimator.get_params/set_params expect.
        self.water_x_idx = water_x_idx
        self.water_y_idx = water_y_idx
        self.temp_first_idx = temp_first_idx
        self.temp_second_idx = temp_second_idx

    def fit(self, X, y = None):
        """Stateless transformer: nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` with the distance and mean-temperature columns appended.

        Parameters
        ----------
        X : 2-D array-like or sparse matrix
            Feature matrix; sparse input is converted to a dense array.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Array with two more columns than ``X``.
        """
        # A ColumnTransformer containing one-hot encoders may emit a sparse
        # matrix; positional slicing and stacking below need a dense array.
        if hasattr(X, "toarray"):
            X = X.toarray()
        X = np.asarray(X)

        # waterStreamDistance: norm of the X/Y water-stream distances
        water_stream_distance = euclidean_distance(
            X[:, self.water_x_idx], X[:, self.water_y_idx]
        )

        # temperature: mean over both halves of the planet rotation
        mean_temperature = (X[:, self.temp_first_idx] + X[:, self.temp_second_idx]) / 2

        # Build the result in one step instead of two full-array np.append copies.
        return np.column_stack([X, water_stream_distance, mean_temperature])

# Feature-preparation pipeline: column-wise preprocessing followed by the
# engineered variables. The classifier is deliberately kept outside of it and
# fitted on the pipeline's output (a variant with a PCA step and the
# AutoSklearn model as pipeline steps is not used).
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('create_variables', CreateVariables()),
])

# AutoML search optimising accuracy, with a total budget of 300 and n_jobs=-1.
cls = AutoSklearn2Classifier(time_left_for_this_task=300, metric=accuracy, n_jobs=-1)
cls.fit(X=pipe.fit_transform(X), y=y)

2022/04/28 19:41:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5d93c32fde154c1eb2d253466cf2ddbb', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
                  transformers=[('temperatureFirstHalfPlanetRotation',
                                 Pipeline(steps=[('iterativeimputer',
                                             ...`
                  transformers=[('temperatureFirstHalfPlanetRotation',
                                 Pipeline(steps=[('iterativeimputer',
                                                  IterativeImpu...`
                ('functiontransformer',
                 FunctionTransformer(feature_names_out='one-to-one',
                   ...`
                ('functiontransformer',
                 FunctionTransformer(feature_names_out='one-to-one',
                                     func=<function <lambda...`
Perhaps you already have a cluste



AutoSklearn2Classifier(metric=accuracy, n_jobs=-1, per_run_time_limit=240,
                       time_left_for_this_task=300)

In [55]:
# sprint_statistics() returns a multi-line string; print it so the line breaks
# are rendered instead of showing the raw repr with escaped '\n'.
print(cls.sprint_statistics())

'auto-sklearn results:\n  Dataset name: 6d4e8dd2-c71a-11ec-a1f9-00155d6e34f2\n  Metric: accuracy\n  Best validation score: 0.758815\n  Number of target algorithm runs: 21\n  Number of successful target algorithm runs: 19\n  Number of crashed target algorithm runs: 0\n  Number of target algorithms that exceeded the time limit: 2\n  Number of target algorithms that exceeded the memory limit: 0\n'

In [56]:
# Show the ensemble's models as a dict keyed by model id
# (rank, cost, ensemble weight and the pipeline components of each model).
cls.show_models()

{14: {'model_id': 14,
  'rank': 1,
  'cost': 0.2411853260438951,
  'ensemble_weight': 0.02,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7fadf52087d0>,
  'balancing': Balancing(random_state=1, strategy='weighting'),
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7fadf539cfd0>,
  'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice at 0x7fadf5175650>,
  'sklearn_classifier': None},
 2: {'model_id': 2,
  'rank': 2,
  'cost': 0.25346644481419855,
  'ensemble_weight': 0.02,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7fadf4b00ad0>,
  'balancing': Balancing(random_state=1, strategy='weighting'),
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7fadf5156b50>,
  'classifier': <autosklearn.pipeline.components.classification.Cla

In [57]:
# Leaderboard table: rank, ensemble weight, model type, cost and duration per model id.
cls.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14,1,0.02,extra_trees,0.241185,149.167808
2,2,0.02,extra_trees,0.253466,149.14467
17,3,0.02,gradient_boosting,0.280802,137.062096
9,4,0.94,sgd,0.482371,32.833851


In [58]:
# Ten best-ranked configurations evaluated during the search (rank 1 first).
(
    pd.DataFrame(cls.cv_results_)
    .sort_values(by='rank_test_scores')
    .head(10)
)

Unnamed: 0,mean_test_score,mean_fit_time,params,rank_test_scores,status,budgets,param_balancing:strategy,param_classifier:__choice__,param_data_preprocessor:__choice__,param_feature_preprocessor:__choice__,param_classifier:extra_trees:bootstrap,param_classifier:extra_trees:criterion,param_classifier:extra_trees:max_depth,param_classifier:extra_trees:max_features,param_classifier:extra_trees:max_leaf_nodes,param_classifier:extra_trees:min_impurity_decrease,param_classifier:extra_trees:min_samples_leaf,param_classifier:extra_trees:min_samples_split,param_classifier:extra_trees:min_weight_fraction_leaf,param_classifier:gradient_boosting:early_stop,param_classifier:gradient_boosting:l2_regularization,param_classifier:gradient_boosting:learning_rate,param_classifier:gradient_boosting:loss,param_classifier:gradient_boosting:max_bins,param_classifier:gradient_boosting:max_depth,param_classifier:gradient_boosting:max_leaf_nodes,param_classifier:gradient_boosting:min_samples_leaf,param_classifier:gradient_boosting:scoring,param_classifier:gradient_boosting:tol,param_classifier:mlp:activation,param_classifier:mlp:alpha,param_classifier:mlp:batch_size,param_classifier:mlp:beta_1,param_classifier:mlp:beta_2,param_classifier:mlp:early_stopping,param_classifier:mlp:epsilon,param_classifier:mlp:hidden_layer_depth,param_classifier:mlp:learning_rate_init,param_classifier:mlp:n_iter_no_change,param_classifier:mlp:num_nodes_per_layer,param_classifier:mlp:shuffle,param_classifier:mlp:solver,param_classifier:mlp:tol,param_classifier:passive_aggressive:C,param_classifier:passive_aggressive:average,param_classifier:passive_aggressive:fit_intercept,param_classifier:passive_aggressive:loss,param_classifier:passive_aggressive:tol,param_classifier:random_forest:bootstrap,param_classifier:random_forest:criterion,param_classifier:random_forest:max_depth,param_classifier:random_forest:max_features,param_classifier:random_forest:max_leaf_nodes,param_classifier:random_forest:min_impurity_decrease,
param_classifier:random_forest:min_samples_leaf,param_classifier:random_forest:min_samples_split,param_classifier:random_forest:min_weight_fraction_leaf,param_classifier:sgd:alpha,param_classifier:sgd:average,param_classifier:sgd:fit_intercept,param_classifier:sgd:learning_rate,param_classifier:sgd:loss,param_classifier:sgd:penalty,param_classifier:sgd:tol,param_data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__,param_data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__,param_data_preprocessor:feature_type:numerical_transformer:imputation:strategy,param_data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__,param_classifier:gradient_boosting:n_iter_no_change,param_classifier:gradient_boosting:validation_fraction,param_classifier:mlp:validation_fraction,param_classifier:sgd:epsilon,param_classifier:sgd:eta0,param_classifier:sgd:l1_ratio,param_classifier:sgd:power_t,param_data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:n_quantiles,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:output_distribution,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min
12,0.758815,149.167808,"{'balancing:strategy': 'weighting', 'classifie...",1,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,False,entropy,,0.958941,,0.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,mean,minmax,,,,,,,,0.003691,,,,
0,0.746534,149.14467,"{'balancing:strategy': 'weighting', 'classifie...",2,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,False,entropy,,0.95659,,0.0,4.0,15.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,most_frequent,robust_scaler,,,,,,,,0.118408,,,0.766873,0.286333
16,0.72934,137.190558,"{'balancing:strategy': 'none', 'classifier:__c...",3,Success,0.0,none,random_forest,feature_type,no_preprocessing,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,gini,,0.872766,,0.0,11.0,15.0,0.0,,,,,,,,one_hot_encoding,minority_coalescer,median,robust_scaler,,,,,,,,0.027147,,,0.730022,0.23992
15,0.719198,137.062096,"{'balancing:strategy': 'weighting', 'classifie...",4,Success,0.0,weighting,gradient_boosting,feature_type,no_preprocessing,,,,,,,,,,off,0.06413435,0.196618,auto,255.0,,103.0,1.0,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,median,power_transformer,,,,,,,,0.009713,,,,
1,0.718485,149.066342,"{'balancing:strategy': 'weighting', 'classifie...",5,Success,0.0,weighting,gradient_boosting,feature_type,no_preprocessing,,,,,,,,,,train,8.789671e-10,0.195957,auto,255.0,,10.0,2.0,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,median,none,13.0,,,,,,,0.001314,,,,
3,0.713018,149.092871,"{'balancing:strategy': 'weighting', 'classifie...",6,Success,0.0,weighting,gradient_boosting,feature_type,no_preprocessing,,,,,,,,,,valid,1.021041e-09,0.192989,auto,255.0,,10.0,20.0,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,median,quantile_transformer,6.0,0.115496,,,,,,,452.0,uniform,,
4,0.711196,149.069518,"{'balancing:strategy': 'weighting', 'classifie...",7,Success,0.0,weighting,gradient_boosting,feature_type,no_preprocessing,,,,,,,,,,valid,7.351154e-07,0.070436,auto,255.0,,698.0,7.0,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,median,normalize,1.0,0.162028,,,,,,,,,,
17,0.70351,134.124143,"{'balancing:strategy': 'weighting', 'classifie...",8,Success,0.0,weighting,gradient_boosting,feature_type,no_preprocessing,,,,,,,,,,valid,1.0197e-10,0.083654,auto,255.0,,10.0,41.0,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,no_coalescense,most_frequent,minmax,3.0,0.140534,,,,,,,,,,
13,0.643531,137.057894,"{'balancing:strategy': 'weighting', 'classifie...",9,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,True,entropy,,0.278607,,0.0,16.0,16.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,mean,robust_scaler,,,,,,,,,,,0.85636,0.041173
10,0.612788,121.032433,"{'balancing:strategy': 'none', 'classifier:__c...",10,Success,0.0,none,gradient_boosting,feature_type,no_preprocessing,,,,,,,,,,train,0.0001621317,0.79628,auto,255.0,,183.0,21.0,loss,1e-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,no_coalescense,median,quantile_transformer,3.0,,,,,,,,82.0,normal,,
